diff --git a/Code/TestTRTInterDll/TestTRTInterDll.cpp b/Code/TestTRTInterDll/TestTRTInterDll.cpp new file mode 100644 index 0000000..e86b2af --- /dev/null +++ b/Code/TestTRTInterDll/TestTRTInterDll.cpp @@ -0,0 +1,265 @@ +// TestTRTInterDll.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。 +// + +#include +#include +#include +#include +#include "utils.h" +#include "MI_Interface.h" +#include + + +namespace fs = std::filesystem; + +bool buffer2Mat(const MN_VisionImage::MS_ImageParam& _inImg, cv::Mat& _mat); + +void test01() +{ + std::vector imagePath; + imagePath.emplace_back("./imageDatas/daisy.jpg"); + imagePath.emplace_back("./imageDatas/dandelion.jpg"); + imagePath.emplace_back("./imageDatas/sunflower.jpg"); + imagePath.emplace_back("./imageDatas/tulip.jpg"); + + std::vector matImgs; + for (auto _var : imagePath) + { + matImgs.emplace_back(cv::imread(_var, 1)); + } + + utils::InitParameter params; + params.m_modelType = utils::ME_ModelType::E_RESNET34; + params.class_names = utils::dataSets::flower_labels; + //initParameters.class_names = utils::dataSets::voc20; + params.num_class = 5; // for flower_labels + //initParameters.num_class = 20; // for voc2012 + params.batch_size = 1; + params.dst_h = 224; + params.dst_w = 224; + memcpy(¶ms.mImage.m_data, matImgs[0].data, sizeof(matImgs[0].rows * matImgs[0].cols * matImgs[0].channels())); + params.mImage.m_height = matImgs[0].rows; + params.mImage.m_width = matImgs[0].cols; + params.mImage.m_channels = 3; + params.input_output_names = { "input", "output" }; + params.conf_thresh = 0.25f; + params.iou_thresh = 0.45f; + params.save_path = "./imageDatas"; + params.meanVec = { 0.406, 0.456, 0.485 }; + params.stdVec = { 0.225, 0.224, 0.229 }; + MI_VisionInterface* resnet34Ptr = getInterfacePtr(params); + if (!resnet34Ptr) + { + return; + } + + utils::InitParameter params1; + params1.m_modelType = utils::ME_ModelType::E_RESNET50; + params1.class_names = utils::dataSets::flower_labels; + //initParameters.class_names = 
utils::dataSets::voc20; + params1.num_class = 5; // for flower_labels + //initParameters.num_class = 20; // for voc2012 + params1.batch_size = 1; + params1.dst_h = 224; + params1.dst_w = 224; + memcpy(¶ms1.mImage.m_data, matImgs[0].data, sizeof(matImgs[0].rows * matImgs[0].cols * matImgs[0].channels())); + params1.mImage.m_height = matImgs[0].rows; + params1.mImage.m_width = matImgs[0].cols; + params1.mImage.m_channels = 3; + params1.input_output_names = { "input", "output" }; + params1.conf_thresh = 0.25f; + params1.iou_thresh = 0.45f; + params1.save_path = "./imageDatas"; + params1.meanVec = { 0.406, 0.456, 0.485 }; + params1.stdVec = { 0.225, 0.224, 0.229 }; + MI_VisionInterface* resnet50Ptr = getInterfacePtr(params1); + if (!resnet50Ptr) + { + return; + } + + // 线程测试,初始化在一个线程,推理在另外一个线程内 + std::string onnxFile = "./imageDatas/resnet34_0407.onnx"; + std::string onnxFile1 = "./imageDatas/resnet50.onnx"; + +#if 0 + bool bRet = false; + bRet = resnetPtr->initEngine(onnxFile); + if (!bRet) + { + return; + } + + bRet = resnetPtr->check(); + if (!bRet) + { + return; + } + + std::vector detectResVec; + bRet = resnetPtr->doTRTInfer(matImgs, &detectResVec, nullptr); + if (!bRet) + { + return; + } +#endif // 0 + + // 初始化线程 + auto initRes = std::async(std::launch::async, [&] { + + bool bRet = false; + // 加载第一个模型初始化并推理 + bRet = resnet34Ptr->initEngine(onnxFile); + if (!bRet) + { + return false; + } + + // 加载第二个模型初始化并推理 + bRet = resnet50Ptr->initEngine(onnxFile1); + if (!bRet) + { + return false; + } + + return true; + }); + initRes.wait(); + + auto initRes1 = std::async(std::launch::async, [&] { + + bool bRet = false; + bRet = resnet34Ptr->check(); + if (!bRet) + { + return false; + } + + std::vector detectResVec1; + bRet = resnet34Ptr->doTRTInfer(matImgs, &detectResVec1, nullptr); + if (!bRet) + { + return false; + } + + bRet = resnet50Ptr->check(); + if (!bRet) + { + return false; + } + + std::vector detectResVec2; + bRet = resnet50Ptr->doTRTInfer(matImgs, &detectResVec2, 
nullptr); + if (!bRet) + { + return false; + } + + return true; + + }); + + //initRes1.wait(); + + auto initRes2 = std::async(std::launch::async, [&] { + + bool bRet = false; + bRet = resnet34Ptr->check(); + if (!bRet) + { + return false; + } + + std::vector detectResVec1; + bRet = resnet34Ptr->doTRTInfer(matImgs, &detectResVec1, nullptr); + if (!bRet) + { + return false; + } + + bRet = resnet50Ptr->check(); + if (!bRet) + { + return false; + } + + std::vector detectResVec2; + bRet = resnet50Ptr->doTRTInfer(matImgs, &detectResVec2, nullptr); + if (!bRet) + { + return false; + } + + return true; + }); + + //initRes2.wait(); + +} + +void test02() +{ + cv::Mat img = cv::imread("./imageDatas/daisy.jpg", 1); + MN_VisionImage::MS_ImageParam bufImg((uchar*)img.data, img.cols, img.rows, MN_VisionImage::ME_ImageType::E_RGB); + + cv::Mat convertImg; + bool bRet = buffer2Mat(bufImg, convertImg); + cv::imshow("src", img); + cv::imshow("image", convertImg); + cv::waitKey(0); +} + +int main() +{ + + test01(); + + system("pause"); + return 0; +} + +// 运行程序: Ctrl + F5 或调试 >“开始执行(不调试)”菜单 +// 调试程序: F5 或调试 >“开始调试”菜单 + +bool buffer2Mat(const MN_VisionImage::MS_ImageParam& _inImg, cv::Mat& _mat) +{ + uchar* pBuf = _inImg.m_data.get(); //获取图像数据首地址 + int nW = _inImg.m_width; + int nH = _inImg.m_height; + int nChannel = _inImg.m_channels; + + if (pBuf == nullptr || nW <= 1 || nH <= 1) + { + // LOG_ERROR("convert buffer to mat, input image error. \n"); + return false; + } + + if (_inImg.mImgType == MN_VisionImage::ME_ImageType::E_GRAY) + { + _mat = cv::Mat(nH, nW, CV_8UC1, pBuf); + } + else if (_inImg.mImgType == MN_VisionImage::ME_ImageType::E_RGBA) + { + _mat = cv::Mat(nH, nW, CV_8UC4, pBuf); + } + else + { + _mat = cv::Mat(nH, nW, CV_8UC3, pBuf); + } + + if (_mat.data == nullptr || _mat.cols <= 1 || _mat.rows <= 1) + { + // LOG_ERROR("convert buffer to mat, convert image failed. \n"); + return false; + } + + return true; +} + + +// 0.3 * 0.3 - 12寸晶圆, 一次性识别9颗 -- 测试推理时间 +/* +1. 
芯片分类:正常,边缘,蓝膜 +2. 对于正常的芯片进行检测--针对划痕和脏污分两个模型/或者一个模型; +3. 针对针测点检测 -- 传统算法; +*/ \ No newline at end of file diff --git a/Code/TestTRTInterDll/TestTRTInterDll.vcxproj b/Code/TestTRTInterDll/TestTRTInterDll.vcxproj new file mode 100644 index 0000000..768d55e --- /dev/null +++ b/Code/TestTRTInterDll/TestTRTInterDll.vcxproj @@ -0,0 +1,147 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + Win32Proj + {3e72c625-2f8b-4fb6-aa05-70a5c3a44bb9} + TestTRTInterDll + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + MultiByte + + + Application + false + v143 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + Level3 + false + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + Default + /Zc:__cplusplus %(AdditionalOptions) + + + Console + true + + + + + Level3 + true + true + false + NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + true + .\trtinfer_lib\include;.\trtinfer_lib\common;%(AdditionalIncludeDirectories) + Disabled + /Zc:__cplusplus %(AdditionalOptions) + stdcpp17 + + + Console + true + true + true + .\trtinfer_lib;%(AdditionalLibraryDirectories) + .\trtinfer_lib\*.lib;%(AdditionalDependencies) + + + + + + + + + \ No newline at end of file diff --git a/Code/TestTRTInterDll/TestTRTInterDll.vcxproj.filters b/Code/TestTRTInterDll/TestTRTInterDll.vcxproj.filters new file mode 100644 index 0000000..7f9ea1e --- /dev/null +++ b/Code/TestTRTInterDll/TestTRTInterDll.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + 
h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + 源文件 + + + \ No newline at end of file diff --git a/Code/TestTRTInterDll/cuda11_6_Release_x64.props b/Code/TestTRTInterDll/cuda11_6_Release_x64.props new file mode 100644 index 0000000..b712131 --- /dev/null +++ b/Code/TestTRTInterDll/cuda11_6_Release_x64.props @@ -0,0 +1,16 @@ + + + + + + + + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\include;%(AdditionalIncludeDirectories) + + + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\lib\x64;%(AdditionalLibraryDirectories) + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6\lib\x64\*.lib;%(AdditionalDependencies) + + + + \ No newline at end of file diff --git a/Code/TestTRTInterDll/imageDatas/classifier.onnx b/Code/TestTRTInterDll/imageDatas/classifier.onnx new file mode 100644 index 0000000..d9c5983 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/classifier.onnx differ diff --git a/Code/TestTRTInterDll/imageDatas/daisy.jpg b/Code/TestTRTInterDll/imageDatas/daisy.jpg new file mode 100644 index 0000000..cad7c13 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/daisy.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/dandelion.jpg b/Code/TestTRTInterDll/imageDatas/dandelion.jpg new file mode 100644 index 0000000..503eae2 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/dandelion.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/dog.jpg b/Code/TestTRTInterDll/imageDatas/dog.jpg new file mode 100644 index 0000000..a1319df Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/dog.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/flower_labels.txt b/Code/TestTRTInterDll/imageDatas/flower_labels.txt new file mode 100644 index 0000000..0cf9a5e --- /dev/null +++ b/Code/TestTRTInterDll/imageDatas/flower_labels.txt @@ -0,0 +1,5 @@ +daisy +dandelion +rose +sunflower 
+tulip \ No newline at end of file diff --git a/Code/TestTRTInterDll/imageDatas/labels.imagenet.txt b/Code/TestTRTInterDll/imageDatas/labels.imagenet.txt new file mode 100644 index 0000000..61e3ed0 --- /dev/null +++ b/Code/TestTRTInterDll/imageDatas/labels.imagenet.txt @@ -0,0 +1,1000 @@ +丁鲷 +金鱼 +大白鲨 +虎鲨 +锤头鲨 +电鳐 +黄貂鱼 +公鸡 +母鸡 +鸵鸟 +燕雀 +金翅雀 +家朱雀 +灯芯草雀 +靛蓝雀 +蓝鹀 +夜莺 +松鸦 +喜鹊 +山雀 +河鸟 +鸢(猛禽) +秃头鹰 +秃鹫 +大灰猫头鹰 +欧洲火蝾螈 +普通蝾螈 +水蜥 +斑点蝾螈 +蝾螈 +牛蛙 +树蛙 +尾蛙 +红海龟 +皮革龟 +泥龟 +淡水龟 +箱龟 +带状壁虎 +普通鬣蜥 +美国变色龙 +鞭尾蜥蜴 +飞龙科蜥蜴 +褶边蜥蜴 +鳄鱼蜥蜴 +毒蜥 +绿蜥蜴 +非洲变色龙 +科莫多蜥蜴 +非洲鳄 +美国鳄鱼 +三角龙 +雷蛇 +环蛇 +希腊蛇 +绿蛇 +国王蛇 +袜带蛇 +水蛇 +藤蛇 +夜蛇 +大蟒蛇 +岩石蟒蛇 +印度眼镜蛇 +绿曼巴 +海蛇 +角腹蛇 +菱纹响尾蛇 +角响尾蛇 +三叶虫 +盲蜘蛛 +蝎子 +黑金花园蜘蛛 +谷仓蜘蛛 +花园蜘蛛 +黑寡妇蜘蛛 +狼蛛 +狼蜘蛛 +壁虱 +蜈蚣 +黑松鸡 +松鸡 +披肩鸡 +草原鸡 +孔雀 +鹌鹑 +鹧鸪 +非洲灰鹦鹉 +金刚鹦鹉 +硫冠鹦鹉 +短尾鹦鹉 +褐翅鸦鹃 +蜜蜂 +犀鸟 +蜂鸟 +鹟䴕 +犀鸟 +野鸭 +红胸秋沙鸭 +鹅 +黑天鹅 +大象 +针鼹鼠 +鸭嘴兽 +沙袋鼠 +考拉 +袋熊 +水母 +海葵 +脑珊瑚 +扁形虫扁虫 +线虫 +海螺 +蜗牛 +鼻涕虫 +海参 +石鳖 +鹦鹉螺 +珍宝蟹 +石蟹 +招潮蟹 +帝王蟹 +美国龙虾 +大螯虾 +小龙虾 +寄居蟹 +等足目动物(明虾和螃蟹近亲) +白鹳 +黑鹳 +鹭 +火烈鸟 +小蓝鹭 +美国鹭 +麻鸦 +鹤 +秧鹤 +欧洲水鸡 +沼泽泥母鸡 +鸨 +红翻石鹬 +红背鹬 +红脚鹬 +半蹼鹬 +蛎鹬 +鹈鹕 +国王企鹅 +信天翁 +灰鲸 +杀人鲸 +海牛 +海狮 +奇瓦瓦 +日本猎犬 +马尔济斯犬 +狮子狗 +西施犬 +布莱尼姆猎犬 +巴比狗 +玩具犬 +罗得西亚长背猎狗 +阿富汗猎犬 +猎犬 +比格犬 +侦探犬 +蓝色快狗 +黑褐猎浣熊犬 +沃克猎犬 +英国猎狐犬 +美洲赤狗 +俄罗斯猎狼犬 +爱尔兰猎狼犬 +意大利灰狗 +惠比特犬 +依比沙猎犬 +挪威猎犬 +奥达猎犬 +沙克犬 +苏格兰猎鹿犬 +威玛猎犬 +斯塔福德郡牛头梗 +美国斯塔福德郡梗 +贝德灵顿梗 +边境梗 +凯丽蓝梗 +爱尔兰梗 +诺福克梗 +诺维奇梗 +约克郡梗 +刚毛猎狐梗 +莱克兰梗 +锡利哈姆梗 +艾尔谷犬 +凯恩梗 +澳大利亚梗 +丹迪丁蒙梗 +波士顿梗 +迷你雪纳瑞犬 +巨型雪纳瑞犬 +标准雪纳瑞犬 +苏格兰梗 +西藏梗 +丝毛梗 +软毛麦色梗 +西高地白梗 +拉萨阿普索犬 +平毛寻回犬 +卷毛寻回犬 +金毛猎犬 +拉布拉多猎犬 +乞沙比克猎犬 +德国短毛猎犬 +维兹拉犬 +英国谍犬 +爱尔兰雪达犬 +戈登雪达犬 +布列塔尼犬猎犬 +黄毛 +英国史宾格犬 +威尔士史宾格犬 +可卡犬 +萨塞克斯猎犬 +爱尔兰水猎犬 +哥威斯犬 +舒柏奇犬 +比利时牧羊犬 +马里努阿犬 +伯瑞犬 +凯尔皮犬 +匈牙利牧羊犬 +老英国牧羊犬 +喜乐蒂牧羊犬 +牧羊犬 +边境牧羊犬 +法兰德斯牧牛狗 +罗特韦尔犬 +德国牧羊犬 +多伯曼犬 +迷你杜宾犬 +大瑞士山地犬 +伯恩山犬 +Appenzeller狗 +EntleBucher狗 +拳师狗 +斗牛獒 +藏獒 +法国斗牛犬 +大丹犬 +圣伯纳德狗 +爱斯基摩犬 +雪橇犬 +哈士奇 +达尔马提亚 +狮毛狗 +巴辛吉狗 +哈巴狗 +莱昂贝格狗 +纽芬兰岛狗 +大白熊犬 +萨摩耶犬 +博美犬 +松狮 +荷兰卷尾狮毛狗 +布鲁塞尔格林芬犬 +彭布洛克威尔士科基犬 +威尔士柯基犬 +玩具贵宾犬 +迷你贵宾犬 +标准贵宾犬 +墨西哥无毛犬 +灰狼 +白狼 +红太狼 +狼 +澳洲野狗 +豺 +非洲猎犬 +鬣狗 +红狐狸 +沙狐 +北极狐狸 +灰狐狸 +虎斑猫 +山猫 +波斯猫 +暹罗暹罗猫 +埃及猫 +美洲狮 +猞猁 +豹子 +雪豹 +美洲虎 +狮子 +老虎 +猎豹 +棕熊 +美洲黑熊 +冰熊 +懒熊 +猫鼬 +猫鼬 +虎甲虫 +瓢虫 +土鳖虫 +天牛 
+龟甲虫 +粪甲虫 +犀牛甲虫 +象甲 +苍蝇 +蜜蜂 +蚂蚁 +蚱蜢 +蟋蟀 +竹节虫 +蟑螂 +螳螂 +蝉 +叶蝉 +草蜻蛉 +蜻蜓 +豆娘 +优红蛱蝶 +小环蝴蝶 +君主蝴蝶 +菜粉蝶 +白蝴蝶 +灰蝶 +海星 +海胆 +海参 +野兔 +兔 +安哥拉兔 +仓鼠 +刺猬 +黑松鼠 +土拨鼠 +海狸 +豚鼠 +栗色马 +斑马 +猪 +野猪 +疣猪 +河马 +牛 +水牛 +野牛 +公羊 +大角羊 +山羊 +狷羚 +黑斑羚 +瞪羚 +阿拉伯单峰骆驼 +骆驼 +黄鼠狼 +水貂 +臭猫 +黑足鼬 +水獭 +臭鼬 +獾 +犰狳 +树懒 +猩猩 +大猩猩 +黑猩猩 +长臂猿 +合趾猿长臂猿 +长尾猴 +赤猴 +狒狒 +恒河猴 +白头叶猴 +疣猴 +长鼻猴 +狨(美洲产小型长尾猴) +卷尾猴 +吼猴 +伶猴 +蜘蛛猴 +松鼠猴 +马达加斯加环尾狐猴 +大狐猴 +印度大象 +非洲象 +小熊猫 +大熊猫 +杖鱼 +鳗鱼 +银鲑 +三色刺蝶鱼 +海葵鱼 +鲟鱼 +雀鳝 +狮子鱼 +河豚 +算盘 +长袍 +学位袍 +手风琴 +原声吉他 +航空母舰 +客机 +飞艇 +祭坛 +救护车 +水陆两用车 +模拟时钟 +蜂房 +围裙 +垃圾桶 +攻击步枪 +背包 +面包店 +平衡木 +热气球 +圆珠笔 +创可贴 +班卓琴 +栏杆 +杠铃 +理发师的椅子 +理发店 +牲口棚 +晴雨表 +圆筒 +园地小车 +棒球 +篮球 +婴儿床 +巴松管 +游泳帽 +沐浴毛巾 +浴缸 +沙滩车 +灯塔 +高脚杯 +熊皮高帽 +啤酒瓶 +啤酒杯 +钟塔 +(小儿用的)围嘴 +串联自行车 +比基尼 +装订册 +双筒望远镜 +鸟舍 +船库 +雪橇 +饰扣式领带 +阔边女帽 +书橱 +书店 +瓶盖 +弓箭 +蝴蝶结领结 +铜制牌位 +奶罩 +防波堤 +铠甲 +扫帚 +桶 +扣环 +防弹背心 +动车 +肉铺 +出租车 +大锅 +蜡烛 +大炮 +独木舟 +开瓶器 +开衫 +车镜 +旋转木马 +木匠的工具包 +纸箱 +车轮 +取款机 +盒式录音带 +卡带播放器 +城堡 +双体船 +CD播放器 +大提琴 +移动电话 +铁链 +围栏 +链甲 +电锯 +箱子 +衣柜 +编钟 +中国橱柜 +圣诞袜 +教堂 +电影院 +切肉刀 +悬崖屋 +斗篷 +木屐 +鸡尾酒调酒器 +咖啡杯 +咖啡壶 +螺旋结构(楼梯) +组合锁 +电脑键盘 +糖果 +集装箱船 +敞篷车 +开瓶器 +短号 +牛仔靴 +牛仔帽 +摇篮 +起重机 +头盔 +板条箱 +小儿床 +砂锅 +槌球 +拐杖 +胸甲 +大坝 +书桌 +台式电脑 +有线电话 +尿布湿 +数字时钟 +数字手表 +餐桌板 +抹布 +洗碗机 +盘式制动器 +码头 +狗拉雪橇 +圆顶 +门垫 +钻井平台 +鼓 +鼓槌 +哑铃 +荷兰烤箱 +电风扇 +电吉他 +电力机车 +电视 +信封 +浓缩咖啡机 +扑面粉 +女用长围巾 +文件 +消防船 +消防车 +火炉栏 +旗杆 +长笛 +折叠椅 +橄榄球头盔 +叉车 +喷泉 +钢笔 +有四根帷柱的床 +运货车厢 +圆号 +煎锅 +裘皮大衣 +垃圾车 +防毒面具 +汽油泵 +高脚杯 +卡丁车 +高尔夫球 +高尔夫球车 +狭长小船 +锣 +礼服 +钢琴 +温室 +散热器格栅 +杂货店 +断头台 +小发夹 +头发喷雾 +半履带装甲车 +锤子 +大篮子 +手摇鼓风机 +手提电脑 +手帕 +硬盘 +口琴 +竖琴 +收割机 +斧头 +手枪皮套 +家庭影院 +蜂窝 +钩爪 +衬裙 +单杠 +马车 +沙漏 +iPod +熨斗 +南瓜灯笼 +牛仔裤 +吉普车 +运动衫 +拼图 +人力车 +操纵杆 +和服 +护膝 +蝴蝶结 +大褂 +长柄勺 +灯罩 +笔记本电脑 +割草机 +镜头盖 +开信刀 +图书馆 +救生艇 +点火器 +豪华轿车 +远洋班轮 +唇膏 +平底便鞋 +洗剂 +扬声器 +放大镜 +锯木厂 +磁罗盘 +邮袋 +信箱 +女游泳衣 +有肩带浴衣 +窨井盖 +沙球(一种打击乐器) +马林巴木琴 +面膜 +火柴 +花柱 +迷宫 +量杯 +药箱 +巨石 +麦克风 +微波炉 +军装 +奶桶 +迷你巴士 +迷你裙 +面包车 +导弹 +连指手套 +搅拌钵 +活动房屋(由汽车拖拉的) +T型发动机小汽车 +调制解调器 +修道院 +显示器 +电瓶车 +砂浆 +学士 +清真寺 +蚊帐 +摩托车 +山地自行车 +登山帐 +鼠标 +捕鼠器 +搬家车 +口套 +钉子 +颈托 +项链 +乳头(瓶) +笔记本 +方尖碑 +双簧管 +陶笛 +里程表 +滤油器 +风琴 +示波器 +罩裙 +牛车 +氧气面罩 +包装 +船桨 +明轮 +挂锁 +画笔 +睡衣 +宫殿 +排箫 +纸巾 +降落伞 +双杠 +公园长椅 +停车收费表 +客车 +露台 +付费电话 +基座 +铅笔盒 
+卷笔刀 +香水(瓶) +培养皿 +复印机 +拨弦片 +尖顶头盔 +栅栏 +皮卡 +桥墩 +存钱罐 +药瓶 +枕头 +乒乓球 +风车 +海盗船 +水罐 +木工刨 +天文馆 +塑料袋 +板架 +犁型铲雪机 +手压皮碗泵 +宝丽来相机 +电线杆 +警车 +雨披 +台球桌 +充气饮料瓶 +花盆 +陶工旋盘 +电钻 +祈祷垫 +打印机 +监狱 +炮弹 +投影仪 +冰球 +沙包 +钱包 +羽管笔 +被子 +赛车 +球拍 +散热器 +收音机 +射电望远镜 +雨桶 +休闲车 +卷轴 +反射式照相机 +冰箱 +遥控器 +餐厅 +左轮手枪 +步枪 +摇椅 +电转烤肉架 +橡皮 +橄榄球 +直尺 +跑步鞋 +保险柜 +安全别针 +盐瓶(调味用) +凉鞋 +纱笼 +萨克斯管 +剑鞘 +秤 +校车 +帆船 +记分牌 +屏幕 +螺丝 +螺丝刀 +安全带 +缝纫机 +盾牌 +皮鞋店 +障子 +购物篮 +购物车 +铁锹 +浴帽 +浴帘 +滑雪板 +滑雪面罩 +睡袋 +滑尺 +滑动门 +角子老虎机 +潜水通气管 +雪橇 +扫雪机 +皂液器 +足球 +袜子 +碟式太阳能 +宽边帽 +汤碗 +空格键 +空间加热器 +航天飞机 +铲(搅拌或涂敷用的) +快艇 +蜘蛛网 +纺锤 +跑车 +聚光灯 +舞台 +蒸汽机车 +钢拱桥 +钢滚筒 +听诊器 +女用披肩 +石头墙 +秒表 +火炉 +过滤器 +有轨电车 +担架 +沙发床 +佛塔 +潜艇 +套装 +日晷 +太阳镜 +太阳镜 +防晒霜 +悬索桥 +拖把 +运动衫 +游泳裤 +秋千 +开关 +注射器 +台灯 +坦克 +磁带播放器 +茶壶 +泰迪 +电视 +网球 +茅草 +幕布 +顶针 +脱粒机 +宝座 +瓦屋顶 +烤面包机 +烟草店 +马桶 +火炬 +图腾柱 +拖车 +玩具店 +拖拉机 +拖车 +托盘 +风衣 +三轮车 +三体船 +三脚架 +凯旋门 +无轨电车 +长号 +浴盆 +旋转式栅门 +打字机键盘 +伞 +独轮车 +直立式钢琴 +真空吸尘器 +花瓶 +拱顶 +天鹅绒 +自动售货机 +祭服 +高架桥 +小提琴 +排球 +松饼机 +挂钟 +钱包 +衣柜 +军用飞机 +洗脸盆 +洗衣机 +水瓶 +水壶 +水塔 +威士忌壶 +哨子 +假发 +纱窗 +百叶窗 +温莎领带 +葡萄酒瓶 +飞机翅膀 +炒菜锅 +木制的勺子 +毛织品 +栅栏 +沉船 +双桅船 +蒙古包 +网站 +漫画 +纵横字谜 +路标 +交通信号灯 +防尘罩 +菜单 +盘子 +鳄梨酱 +清汤 +罐焖土豆烧肉 +蛋糕 +冰淇淋 +雪糕 +法式面包 +百吉饼 +椒盐脆饼 +芝士汉堡 +热狗 +土豆泥 +结球甘蓝 +西兰花 +菜花 +绿皮密生西葫芦 +西葫芦 +小青南瓜 +南瓜 +黄瓜 +朝鲜蓟 +甜椒 +刺棘蓟 +蘑菇 +绿苹果 +草莓 +橘子 +柠檬 +无花果 +菠萝 +香蕉 +菠萝蜜 +蛋奶冻苹果 +石榴 +干草 +烤面条加干酪沙司 +巧克力酱 +面团 +瑞士肉包 +披萨 +馅饼 +卷饼 +红葡萄酒 +意大利浓咖啡 +杯子 +蛋酒 +高山 +泡泡 +悬崖 +珊瑚礁 +间歇泉 +湖边 +海角 +沙洲 +海滨 +峡谷 +火山 +棒球 +新郎 +潜水员 +油菜 +雏菊 +杓兰 +玉米 +橡子 +玫瑰果 +七叶树果实 +珊瑚菌 +木耳 +鹿花菌 +鬼笔菌 +地星 +多叶奇果菌 +牛肝菌 +玉米穗 +卫生纸 \ No newline at end of file diff --git a/Code/TestTRTInterDll/imageDatas/ng.jpg b/Code/TestTRTInterDll/imageDatas/ng.jpg new file mode 100644 index 0000000..d7a3e9e Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/ng.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/res.jpg b/Code/TestTRTInterDll/imageDatas/res.jpg new file mode 100644 index 0000000..c131bd9 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/res.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet34_0407.onnx b/Code/TestTRTInterDll/imageDatas/resnet34_0407.onnx new file mode 100644 
index 0000000..888c0be Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet34_0407.onnx differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet34_0407.trt b/Code/TestTRTInterDll/imageDatas/resnet34_0407.trt new file mode 100644 index 0000000..0ca4171 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet34_0407.trt differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet34_0407.trtmodel b/Code/TestTRTInterDll/imageDatas/resnet34_0407.trtmodel new file mode 100644 index 0000000..97b6ced Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet34_0407.trtmodel differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet34_0407_int32.onnx b/Code/TestTRTInterDll/imageDatas/resnet34_0407_int32.onnx new file mode 100644 index 0000000..5952f46 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet34_0407_int32.onnx differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet50.onnx b/Code/TestTRTInterDll/imageDatas/resnet50.onnx new file mode 100644 index 0000000..d998aa3 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet50.onnx differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet50.trt b/Code/TestTRTInterDll/imageDatas/resnet50.trt new file mode 100644 index 0000000..7371580 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet50.trt differ diff --git a/Code/TestTRTInterDll/imageDatas/resnet50.trtmodel b/Code/TestTRTInterDll/imageDatas/resnet50.trtmodel new file mode 100644 index 0000000..43d6fc1 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/resnet50.trtmodel differ diff --git a/Code/TestTRTInterDll/imageDatas/sunflower.jpg b/Code/TestTRTInterDll/imageDatas/sunflower.jpg new file mode 100644 index 0000000..674a1aa Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/sunflower.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/tulip.jpg b/Code/TestTRTInterDll/imageDatas/tulip.jpg new file mode 100644 index 0000000..4c83caa Binary files /dev/null and 
b/Code/TestTRTInterDll/imageDatas/tulip.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/yq.jpg b/Code/TestTRTInterDll/imageDatas/yq.jpg new file mode 100644 index 0000000..2aad9f8 Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/yq.jpg differ diff --git a/Code/TestTRTInterDll/imageDatas/zand.jpg b/Code/TestTRTInterDll/imageDatas/zand.jpg new file mode 100644 index 0000000..92d72ea Binary files /dev/null and b/Code/TestTRTInterDll/imageDatas/zand.jpg differ diff --git a/Code/TestTRTInterDll/logFiles/vision_log-20240418.txt b/Code/TestTRTInterDll/logFiles/vision_log-20240418.txt new file mode 100644 index 0000000..500b5ce --- /dev/null +++ b/Code/TestTRTInterDll/logFiles/vision_log-20240418.txt @@ -0,0 +1,590 @@ +[2024-04-24 14:10:05.326] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 14:10:18.478] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:10:20.401] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:10:21.501] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:10:23.149] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:10:24.470] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. 
+ + +[2024-04-24 14:10:29.415] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:10:29.426] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:10:29.503] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:10:29.516] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:10:29.517] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:10:29.518] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:10:54.633] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:10:54.634] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:10:54.634] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:10:54.634] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:10:54.635] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:10:54.636] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:11:03.643] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:11:03.643] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 
0, input + +[2024-04-24 14:11:03.643] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:11:03.643] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:11:03.644] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:11:03.645] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:21:13.820] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 14:21:16.855] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:21:20.801] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:21:30.715] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:21:47.950] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:21:56.426] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. 
+ + +[2024-04-24 14:22:20.751] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:22:20.751] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:22:20.752] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:22:20.752] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:22:20.752] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:23:23.249] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 14:23:27.717] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:23:29.141] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:23:30.553] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. 
+ + +[2024-04-24 14:23:37.512] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:23:37.512] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:23:37.513] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:23:37.513] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:23:37.516] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:23:37.517] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:24:11.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:24:11.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:24:11.947] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:24:11.947] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:24:11.949] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:24:11.950] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:30:58.451] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 14:31:00.901] [info] 
+[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:31:03.057] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:31:04.749] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:31:06.295] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:31:06.296] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:31:06.296] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:31:06.297] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:31:06.298] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:06.298] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:06.299] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:31:06.299] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:31:06.301] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:06.302] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:06.303] [info] 
+[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:31:06.305] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:31:29.839] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:31:29.839] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:31:29.839] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:31:29.840] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:29.840] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:31:29.840] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:31:29.840] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:29.841] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:29.841] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:31:29.842] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:31:29.842] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:31:34.113] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 
14:34:44.910] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 14:34:46.422] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:34:47.848] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 14:34:49.635] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 14:34:50.939] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:34:50.943] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:34:50.944] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:34:50.943] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:34:50.945] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:34:50.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:34:50.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:34:50.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:34:50.946] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:34:50.947] 
[info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:34:50.947] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:34:50.948] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:35:33.809] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:36:17.884] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:36:17.884] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:41,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 14:36:17.885] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 14:36:17.885] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:36:17.886] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:36:17.886] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:36:17.886] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:36:17.886] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:47,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 14:36:17.887] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 14:36:43.975] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:53,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 14:36:49.419] 
[info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:56,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 15:08:22.000] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet34_0407.onnx + +[2024-04-24 15:08:23.646] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 15:08:32.437] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:30,MF_Resnet34Infer::initEngine] +on the init engine, input onnx file : ./imageDatas/resnet50.onnx + +[2024-04-24 15:08:34.620] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MF_Resnet34Infer.cpp:48,MF_Resnet34Infer::initEngine] +trt model has existed. + + +[2024-04-24 15:08:37.449] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:43,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 15:08:37.451] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:43,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 15:08:37.452] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 15:08:37.451] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 15:08:37.454] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:08:37.455] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:08:37.456] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 15:08:37.455] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 15:08:37.456] 
[info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:08:37.457] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:08:37.458] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:58,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 15:08:37.459] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:58,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 15:08:57.068] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:43,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 15:08:58.514] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 15:09:05.783] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:09:05.784] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 15:09:05.784] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:09:05.785] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:58,MA_TRTInferAlgoBase::check] +The context's info: + +[2024-04-24 15:09:16.908] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:43,MA_TRTInferAlgoBase::check] +The engine's info: + +[2024-04-24 15:09:16.909] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 0, input + +[2024-04-24 15:09:16.910] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:09:16.910] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:49,MA_TRTInferAlgoBase::check] +idx = 1, output + +[2024-04-24 15:09:16.911] 
[info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:55,MA_TRTInferAlgoBase::check] + + + +[2024-04-24 15:09:16.912] [info] +[D:\00_SST-Work\SST-Code\MF_TRTInfer\MA_TRTInferAlgoBase.cpp:58,MA_TRTInferAlgoBase::check] +The context's info: + diff --git a/Code/TestTRTInterDll/tensorrt_860_release_x64.props b/Code/TestTRTInterDll/tensorrt_860_release_x64.props new file mode 100644 index 0000000..b6a68d6 --- /dev/null +++ b/Code/TestTRTInterDll/tensorrt_860_release_x64.props @@ -0,0 +1,16 @@ + + + + + + + + ..\MF_TRTInfer\lib\tensorrt_lib\lib;%(AdditionalLibraryDirectories) + ..\MF_TRTInfer\lib\tensorrt_lib\lib\*.lib;%(AdditionalDependencies) + + + ..\MF_TRTInfer\lib\tensorrt_lib\include;%(AdditionalIncludeDirectories) + + + + \ No newline at end of file diff --git a/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.dll b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.dll new file mode 100644 index 0000000..f7facbe Binary files /dev/null and b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.dll differ diff --git a/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.lib b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.lib new file mode 100644 index 0000000..ec376c5 Binary files /dev/null and b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.lib differ diff --git a/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.pdb b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.pdb new file mode 100644 index 0000000..131e6ef Binary files /dev/null and b/Code/TestTRTInterDll/trtinfer_lib/MF_TRTInfer.pdb differ diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/BatchStream.h b/Code/TestTRTInterDll/trtinfer_lib/common/BatchStream.h new file mode 100644 index 0000000..94acde5 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/BatchStream.h @@ -0,0 +1,381 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef BATCH_STREAM_H +#define BATCH_STREAM_H + +#include "NvInfer.h" +#include "common.h" +#include +#include +#include + +class IBatchStream +{ +public: + virtual void reset(int firstBatch) = 0; + virtual bool next() = 0; + virtual void skip(int skipCount) = 0; + virtual float* getBatch() = 0; + virtual float* getLabels() = 0; + virtual int getBatchesRead() const = 0; + virtual int getBatchSize() const = 0; + virtual nvinfer1::Dims getDims() const = 0; +}; + +class MNISTBatchStream : public IBatchStream +{ +public: + MNISTBatchStream(int batchSize, int maxBatches, const std::string& dataFile, const std::string& labelsFile, + const std::vector& directories) + : mBatchSize{batchSize} + , mMaxBatches{maxBatches} + , mDims{3, {1, 28, 28}} //!< We already know the dimensions of MNIST images. 
+ { + readDataFile(locateFile(dataFile, directories)); + readLabelsFile(locateFile(labelsFile, directories)); + } + + void reset(int firstBatch) override + { + mBatchCount = firstBatch; + } + + bool next() override + { + if (mBatchCount >= mMaxBatches) + { + return false; + } + ++mBatchCount; + return true; + } + + void skip(int skipCount) override + { + mBatchCount += skipCount; + } + + float* getBatch() override + { + return mData.data() + (mBatchCount * mBatchSize * samplesCommon::volume(mDims)); + } + + float* getLabels() override + { + return mLabels.data() + (mBatchCount * mBatchSize); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return nvinfer1::Dims{4, {mBatchSize, mDims.d[0], mDims.d[1], mDims.d[2]}}; + } + +private: + void readDataFile(const std::string& dataFilePath) + { + std::ifstream file{dataFilePath.c_str(), std::ios::binary}; + + int magicNumber, numImages, imageH, imageW; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2051 && "Magic Number does not match the expected value for an MNIST image set"); + + // Read number of images and dimensions + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + file.read(reinterpret_cast(&imageH), sizeof(imageH)); + file.read(reinterpret_cast(&imageW), sizeof(imageW)); + + numImages = samplesCommon::swapEndianness(numImages); + imageH = samplesCommon::swapEndianness(imageH); + imageW = samplesCommon::swapEndianness(imageW); + + // The MNIST data is made up of unsigned bytes, so we need to cast to float and normalize. 
+ int numElements = numImages * imageH * imageW; + std::vector rawData(numElements); + file.read(reinterpret_cast(rawData.data()), numElements * sizeof(uint8_t)); + mData.resize(numElements); + std::transform( + rawData.begin(), rawData.end(), mData.begin(), [](uint8_t val) { return static_cast(val) / 255.f; }); + } + + void readLabelsFile(const std::string& labelsFilePath) + { + std::ifstream file{labelsFilePath.c_str(), std::ios::binary}; + int magicNumber, numImages; + file.read(reinterpret_cast(&magicNumber), sizeof(magicNumber)); + // All values in the MNIST files are big endian. + magicNumber = samplesCommon::swapEndianness(magicNumber); + ASSERT(magicNumber == 2049 && "Magic Number does not match the expected value for an MNIST labels file"); + + file.read(reinterpret_cast(&numImages), sizeof(numImages)); + numImages = samplesCommon::swapEndianness(numImages); + + std::vector rawLabels(numImages); + file.read(reinterpret_cast(rawLabels.data()), numImages * sizeof(uint8_t)); + mLabels.resize(numImages); + std::transform( + rawLabels.begin(), rawLabels.end(), mLabels.begin(), [](uint8_t val) { return static_cast(val); }); + } + + int mBatchSize{0}; + int mBatchCount{0}; //!< The batch that will be read on the next invocation of next() + int mMaxBatches{0}; + nvinfer1::Dims mDims{}; + std::vector mData{}; + std::vector mLabels{}; +}; + +class BatchStream : public IBatchStream +{ +public: + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::string const& suffix, + std::vector const& directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mPrefix(prefix) + , mSuffix(suffix) + , mDataDir(directories) + { + std::ifstream file(locateFile(mPrefix + std::string("0") + mSuffix, mDataDir).c_str(), std::ios::binary); + ASSERT(file.good()); + int d[4]; + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); + mDims.nbDims = 4; // The number of dimensions. 
+ mDims.d[0] = d[0]; // Batch Size + mDims.d[1] = d[1]; // Channels + mDims.d[2] = d[2]; // Height + mDims.d[3] = d[3]; // Width + ASSERT(mDims.d[0] > 0 && mDims.d[1] > 0 && mDims.d[2] > 0 && mDims.d[3] > 0); + + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + BatchStream(int batchSize, int maxBatches, std::string const& prefix, std::vector const& directories) + : BatchStream(batchSize, maxBatches, prefix, ".batch", directories) + { + } + + BatchStream(int batchSize, int maxBatches, nvinfer1::Dims const& dims, std::string const& listFile, + std::vector const& directories) + : mBatchSize(batchSize) + , mMaxBatches(maxBatches) + , mDims(dims) + , mListFile(listFile) + , mDataDir(directories) + { + mImageSize = mDims.d[1] * mDims.d[2] * mDims.d[3]; + mBatch.resize(mBatchSize * mImageSize, 0); + mLabels.resize(mBatchSize, 0); + mFileBatch.resize(mDims.d[0] * mImageSize, 0); + mFileLabels.resize(mDims.d[0], 0); + reset(0); + } + + // Resets data members + void reset(int firstBatch) override + { + mBatchCount = 0; + mFileCount = 0; + mFileBatchPos = mDims.d[0]; + skip(firstBatch); + } + + // Advance to next batch and return true, or return false if there is no batch left. + bool next() override + { + if (mBatchCount == mMaxBatches) + { + return false; + } + + for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize) + { + ASSERT(mFileBatchPos > 0 && mFileBatchPos <= mDims.d[0]); + if (mFileBatchPos == mDims.d[0] && !update()) + { + return false; + } + + // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer. 
+ csize = std::min(mBatchSize - batchPos, mDims.d[0] - mFileBatchPos); + std::copy_n( + getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize); + std::copy_n(getFileLabels() + mFileBatchPos, csize, getLabels() + batchPos); + } + mBatchCount++; + return true; + } + + // Skips the batches + void skip(int skipCount) override + { + if (mBatchSize >= mDims.d[0] && mBatchSize % mDims.d[0] == 0 && mFileBatchPos == mDims.d[0]) + { + mFileCount += skipCount * mBatchSize / mDims.d[0]; + return; + } + + int x = mBatchCount; + for (int i = 0; i < skipCount; i++) + { + next(); + } + mBatchCount = x; + } + + float* getBatch() override + { + return mBatch.data(); + } + + float* getLabels() override + { + return mLabels.data(); + } + + int getBatchesRead() const override + { + return mBatchCount; + } + + int getBatchSize() const override + { + return mBatchSize; + } + + nvinfer1::Dims getDims() const override + { + return mDims; + } + +private: + float* getFileBatch() + { + return mFileBatch.data(); + } + + float* getFileLabels() + { + return mFileLabels.data(); + } + + bool update() + { + if (mListFile.empty()) + { + std::string inputFileName = locateFile(mPrefix + std::to_string(mFileCount++) + mSuffix, mDataDir); + std::ifstream file(inputFileName.c_str(), std::ios::binary); + if (!file) + { + return false; + } + int d[4]; + file.read(reinterpret_cast(d), 4 * sizeof(int32_t)); + ASSERT(mDims.d[0] == d[0] && mDims.d[1] == d[1] && mDims.d[2] == d[2] && mDims.d[3] == d[3]); + file.read(reinterpret_cast(getFileBatch()), sizeof(float) * mDims.d[0] * mImageSize); + file.read(reinterpret_cast(getFileLabels()), sizeof(float) * mDims.d[0]); + } + else + { + std::vector fNames; + std::ifstream file(locateFile(mListFile, mDataDir), std::ios::binary); + if (!file) + { + return false; + } + + sample::gLogInfo << "Batch #" << mFileCount << std::endl; + file.seekg(((mBatchCount * mBatchSize)) * 7); + + for (int i = 1; i <= mBatchSize; i++) + { + 
std::string sName; + std::getline(file, sName); + sName = sName + ".ppm"; + sample::gLogInfo << "Calibrating with file " << sName << std::endl; + fNames.emplace_back(sName); + } + + mFileCount++; + + const int imageC = 3; + const int imageH = 300; + const int imageW = 300; + std::vector> ppms(fNames.size()); + for (uint32_t i = 0; i < fNames.size(); ++i) + { + readPPMFile(locateFile(fNames[i], mDataDir), ppms[i]); + } + + std::vector data(samplesCommon::volume(mDims)); + const float scale = 2.0 / 255.0; + const float bias = 1.0; + long int volChl = mDims.d[2] * mDims.d[3]; + + // Normalize input data + for (int i = 0, volImg = mDims.d[1] * mDims.d[2] * mDims.d[3]; i < mBatchSize; ++i) + { + for (int c = 0; c < mDims.d[1]; ++c) + { + for (int j = 0; j < volChl; ++j) + { + data[i * volImg + c * volChl + j] = scale * float(ppms[i].buffer[j * mDims.d[1] + c]) - bias; + } + } + } + + std::copy_n(data.data(), mDims.d[0] * mImageSize, getFileBatch()); + } + + mFileBatchPos = 0; + return true; + } + + int mBatchSize{0}; + int mMaxBatches{0}; + int mBatchCount{0}; + int mFileCount{0}; + int mFileBatchPos{0}; + int mImageSize{0}; + std::vector mBatch; //!< Data for the batch + std::vector mLabels; //!< Labels for the batch + std::vector mFileBatch; //!< List of image files + std::vector mFileLabels; //!< List of label files + std::string mPrefix; //!< Batch file name prefix + std::string mSuffix; //!< Batch file name suffix + nvinfer1::Dims mDims; //!< Input dimensions + std::string mListFile; //!< File name of the list of image names + std::vector mDataDir; //!< Directories where the files can be found +}; + +#endif diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/EntropyCalibrator.h b/Code/TestTRTInterDll/trtinfer_lib/common/EntropyCalibrator.h new file mode 100644 index 0000000..936d10e --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/EntropyCalibrator.h @@ -0,0 +1,136 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ENTROPY_CALIBRATOR_H +#define ENTROPY_CALIBRATOR_H + +#include "BatchStream.h" +#include "NvInfer.h" + +//! \class EntropyCalibratorImpl +//! +//! \brief Implements common functionality for Entropy calibrators. +//! +template +class EntropyCalibratorImpl +{ +public: + EntropyCalibratorImpl(TBatchStream const& stream, int firstBatch, std::string const& networkName, + const char* inputBlobName, bool readCache = true) + : mStream{stream} + , mCalibrationTableName("CalibrationTable" + networkName) + , mInputBlobName(inputBlobName) + , mReadCache(readCache) + { + nvinfer1::Dims dims = mStream.getDims(); + mInputCount = samplesCommon::volume(dims); + CHECK(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float))); + mStream.reset(firstBatch); + } + + virtual ~EntropyCalibratorImpl() + { + CHECK(cudaFree(mDeviceInput)); + } + + int getBatchSize() const noexcept + { + return mStream.getBatchSize(); + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept + { + if (!mStream.next()) + { + return false; + } + CHECK(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice)); + ASSERT(!strcmp(names[0], mInputBlobName)); + bindings[0] = mDeviceInput; + return true; + } + + const void* readCalibrationCache(size_t& length) noexcept + { + mCalibrationCache.clear(); + 
std::ifstream input(mCalibrationTableName, std::ios::binary); + input >> std::noskipws; + if (mReadCache && input.good()) + { + std::copy(std::istream_iterator(input), std::istream_iterator(), + std::back_inserter(mCalibrationCache)); + } + length = mCalibrationCache.size(); + return length ? mCalibrationCache.data() : nullptr; + } + + void writeCalibrationCache(const void* cache, size_t length) noexcept + { + std::ofstream output(mCalibrationTableName, std::ios::binary); + output.write(reinterpret_cast(cache), length); + } + +private: + TBatchStream mStream; + size_t mInputCount; + std::string mCalibrationTableName; + const char* mInputBlobName; + bool mReadCache{true}; + void* mDeviceInput{nullptr}; + std::vector mCalibrationCache; +}; + +//! \class Int8EntropyCalibrator2 +//! +//! \brief Implements Entropy calibrator 2. +//! CalibrationAlgoType is kENTROPY_CALIBRATION_2. +//! +template +class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + Int8EntropyCalibrator2(TBatchStream const& stream, int32_t firstBatch, const char* networkName, + const char* inputBlobName, bool readCache = true) + : mImpl(stream, firstBatch, networkName, inputBlobName, readCache) + { + } + + int getBatchSize() const noexcept override + { + return mImpl.getBatchSize(); + } + + bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override + { + return mImpl.getBatch(bindings, names, nbBindings); + } + + const void* readCalibrationCache(size_t& length) noexcept override + { + return mImpl.readCalibrationCache(length); + } + + void writeCalibrationCache(const void* cache, size_t length) noexcept override + { + mImpl.writeCalibrationCache(cache, length); + } + +private: + EntropyCalibratorImpl mImpl; +}; + +#endif // ENTROPY_CALIBRATOR_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/ErrorRecorder.h b/Code/TestTRTInterDll/trtinfer_lib/common/ErrorRecorder.h new file mode 100644 index 0000000..3cc8ef9 --- /dev/null +++ 
b/Code/TestTRTInterDll/trtinfer_lib/common/ErrorRecorder.h @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ERROR_RECORDER_H +#define ERROR_RECORDER_H +#include "NvInferRuntimeCommon.h" +#include "logger.h" +#include +#include +#include +#include +#include + +using nvinfer1::IErrorRecorder; +using nvinfer1::ErrorCode; + +//! +//! A simple implementation of the IErrorRecorder interface for +//! use by samples. This interface also can be used as a reference +//! implementation. +//! The sample Error recorder is based on a vector that pairs the error +//! code and the error string into a single element. It also uses +//! standard mutex's and atomics in order to make sure that the code +//! works in a multi-threaded environment. +//! +class SampleErrorRecorder : public IErrorRecorder +{ + using errorPair = std::pair; + using errorStack = std::vector; + +public: + SampleErrorRecorder() = default; + + ~SampleErrorRecorder() noexcept override {} + int32_t getNbErrors() const noexcept final + { + return mErrorStack.size(); + } + ErrorCode getErrorCode(int32_t errorIdx) const noexcept final + { + return invalidIndexCheck(errorIdx) ? 
ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first; + }; + IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final + { + return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str(); + } + // This class can never overflow since we have dynamic resize via std::vector usage. + bool hasOverflowed() const noexcept final + { + return false; + } + + // Empty the errorStack. + void clear() noexcept final + { + try + { + // grab a lock so that there is no addition while clearing. + std::lock_guard guard(mStackLock); + mErrorStack.clear(); + } + catch (const std::exception& e) + { + sample::gLogFatal << "Internal Error: " << e.what() << std::endl; + } + }; + + //! Simple helper function that + bool empty() const noexcept + { + return mErrorStack.empty(); + } + + bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final + { + try + { + std::lock_guard guard(mStackLock); + sample::gLogError << "Error[" << static_cast(val) << "]: " << desc << std::endl; + mErrorStack.push_back(errorPair(val, desc)); + } + catch (const std::exception& e) + { + sample::gLogFatal << "Internal Error: " << e.what() << std::endl; + } + // All errors are considered fatal. + return true; + } + + // Atomically increment or decrement the ref counter. + IErrorRecorder::RefCount incRefCount() noexcept final + { + return ++mRefCount; + } + IErrorRecorder::RefCount decRefCount() noexcept final + { + return --mRefCount; + } + +private: + // Simple helper functions. + const errorPair& operator[](size_t index) const noexcept + { + return mErrorStack[index]; + } + + bool invalidIndexCheck(int32_t index) const noexcept + { + // By converting signed to unsigned, we only need a single check since + // negative numbers turn into large positive greater than the size. + size_t sIndex = index; + return sIndex >= mErrorStack.size(); + } + // Mutex to hold when locking mErrorStack. 
+ std::mutex mStackLock; + + // Reference count of the class. Destruction of the class when mRefCount + // is not zero causes undefined behavior. + std::atomic mRefCount{0}; + + // The error stack that holds the errors recorded by TensorRT. + errorStack mErrorStack; +}; // class SampleErrorRecorder +#endif // ERROR_RECORDER_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/argsParser.h b/Code/TestTRTInterDll/trtinfer_lib/common/argsParser.h new file mode 100644 index 0000000..3b80797 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/argsParser.h @@ -0,0 +1,164 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_ARGS_PARSER_H +#define TENSORRT_ARGS_PARSER_H + +#ifdef _MSC_VER +#include "getOptWin.h" +#else +#include +#endif +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The SampleParams structure groups the basic parameters required by +//! all sample networks. +//! +struct SampleParams +{ + int32_t batchSize{1}; //!< Number of inputs in a batch + int32_t dlaCore{-1}; //!< Specify the DLA core to run network on. + bool int8{false}; //!< Allow runnning the network in Int8 mode. + bool fp16{false}; //!< Allow running the network in FP16 mode. 
+ std::vector dataDirs; //!< Directory paths where sample data files are stored + std::vector inputTensorNames; + std::vector outputTensorNames; +}; + +//! +//! \brief The CaffeSampleParams structure groups the additional parameters required by +//! networks that use caffe +//! +struct CaffeSampleParams : public SampleParams +{ + std::string prototxtFileName; //!< Filename of prototxt design file of a network + std::string weightsFileName; //!< Filename of trained weights file of a network + std::string meanFileName; //!< Filename of mean file of a network +}; + +//! +//! \brief The OnnxSampleParams structure groups the additional parameters required by +//! networks that use ONNX +//! +struct OnnxSampleParams : public SampleParams +{ + std::string onnxFileName; //!< Filename of ONNX file of a network +}; + +//! +//! \brief The UffSampleParams structure groups the additional parameters required by +//! networks that use Uff +//! +struct UffSampleParams : public SampleParams +{ + std::string uffFileName; //!< Filename of uff file of a network +}; + +//! +//! /brief Struct to maintain command-line arguments. +//! +struct Args +{ + bool runInInt8{false}; + bool runInFp16{false}; + bool help{false}; + int32_t useDLACore{-1}; + int32_t batch{1}; + std::vector dataDirs; + std::string saveEngine; + std::string loadEngine; + bool useILoop{false}; +}; + +//! +//! \brief Populates the Args struct with the provided command-line parameters. +//! +//! \throw invalid_argument if any of the arguments are not valid +//! +//! \return boolean If return value is true, execution can continue, otherwise program should exit +//! 
+inline bool parseArgs(Args& args, int32_t argc, char* argv[]) +{ + while (1) + { + int32_t arg; + static struct option long_options[] = {{"help", no_argument, 0, 'h'}, {"datadir", required_argument, 0, 'd'}, + {"int8", no_argument, 0, 'i'}, {"fp16", no_argument, 0, 'f'}, {"useILoop", no_argument, 0, 'l'}, + {"saveEngine", required_argument, 0, 's'}, {"loadEngine", required_argument, 0, 'o'}, + {"useDLACore", required_argument, 0, 'u'}, {"batch", required_argument, 0, 'b'}, {nullptr, 0, nullptr, 0}}; + int32_t option_index = 0; + arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index); + if (arg == -1) + { + break; + } + + switch (arg) + { + case 'h': args.help = true; return true; + case 'd': + if (optarg) + { + args.dataDirs.push_back(optarg); + } + else + { + std::cerr << "ERROR: --datadir requires option argument" << std::endl; + return false; + } + break; + case 's': + if (optarg) + { + args.saveEngine = optarg; + } + break; + case 'o': + if (optarg) + { + args.loadEngine = optarg; + } + break; + case 'i': args.runInInt8 = true; break; + case 'f': args.runInFp16 = true; break; + case 'l': args.useILoop = true; break; + case 'u': + if (optarg) + { + args.useDLACore = std::stoi(optarg); + } + break; + case 'b': + if (optarg) + { + args.batch = std::stoi(optarg); + } + break; + default: return false; + } + } + return true; +} + +} // namespace samplesCommon + +#endif // TENSORRT_ARGS_PARSER_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/buffers.h b/Code/TestTRTInterDll/trtinfer_lib/common/buffers.h new file mode 100644 index 0000000..6d87a11 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/buffers.h @@ -0,0 +1,421 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef TENSORRT_BUFFERS_H +#define TENSORRT_BUFFERS_H + +#include "NvInfer.h" +#include "common.h" +#include "half.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace samplesCommon +{ + +//! +//! \brief The GenericBuffer class is a templated class for buffers. +//! +//! \details This templated RAII (Resource Acquisition Is Initialization) class handles the allocation, +//! deallocation, querying of buffers on both the device and the host. +//! It can handle data of arbitrary types because it stores byte buffers. +//! The template parameters AllocFunc and FreeFunc are used for the +//! allocation and deallocation of the buffer. +//! AllocFunc must be a functor that takes in (void** ptr, size_t size) +//! and returns bool. ptr is a pointer to where the allocated buffer address should be stored. +//! size is the amount of memory in bytes to allocate. +//! The boolean indicates whether or not the memory allocation was successful. +//! FreeFunc must be a functor that takes in (void* ptr) and returns void. +//! ptr is the allocated buffer address. It must work with nullptr input. +//! +template +class GenericBuffer +{ +public: + //! + //! \brief Construct an empty buffer. + //! + GenericBuffer(nvinfer1::DataType type = nvinfer1::DataType::kFLOAT) + : mSize(0) + , mCapacity(0) + , mType(type) + , mBuffer(nullptr) + { + } + + //! + //! \brief Construct a buffer with the specified allocation size in bytes. + //! 
+ GenericBuffer(size_t size, nvinfer1::DataType type) + : mSize(size) + , mCapacity(size) + , mType(type) + { + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc(); + } + } + + GenericBuffer(GenericBuffer&& buf) + : mSize(buf.mSize) + , mCapacity(buf.mCapacity) + , mType(buf.mType) + , mBuffer(buf.mBuffer) + { + buf.mSize = 0; + buf.mCapacity = 0; + buf.mType = nvinfer1::DataType::kFLOAT; + buf.mBuffer = nullptr; + } + + GenericBuffer& operator=(GenericBuffer&& buf) + { + if (this != &buf) + { + freeFn(mBuffer); + mSize = buf.mSize; + mCapacity = buf.mCapacity; + mType = buf.mType; + mBuffer = buf.mBuffer; + // Reset buf. + buf.mSize = 0; + buf.mCapacity = 0; + buf.mBuffer = nullptr; + } + return *this; + } + + //! + //! \brief Returns pointer to underlying array. + //! + void* data() + { + return mBuffer; + } + + //! + //! \brief Returns pointer to underlying array. + //! + const void* data() const + { + return mBuffer; + } + + //! + //! \brief Returns the size (in number of elements) of the buffer. + //! + size_t size() const + { + return mSize; + } + + //! + //! \brief Returns the size (in bytes) of the buffer. + //! + size_t nbBytes() const + { + return this->size() * samplesCommon::getElementSize(mType); + } + + //! + //! \brief Resizes the buffer. This is a no-op if the new size is smaller than or equal to the current capacity. + //! + void resize(size_t newSize) + { + mSize = newSize; + if (mCapacity < newSize) + { + freeFn(mBuffer); + if (!allocFn(&mBuffer, this->nbBytes())) + { + throw std::bad_alloc{}; + } + mCapacity = newSize; + } + } + + //! + //! \brief Overload of resize that accepts Dims + //! 
+ void resize(const nvinfer1::Dims& dims) + { + return this->resize(samplesCommon::volume(dims)); + } + + ~GenericBuffer() + { + freeFn(mBuffer); + } + +private: + size_t mSize{0}, mCapacity{0}; + nvinfer1::DataType mType; + void* mBuffer; + AllocFunc allocFn; + FreeFunc freeFn; +}; + +class DeviceAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + return cudaMalloc(ptr, size) == cudaSuccess; + } +}; + +class DeviceFree +{ +public: + void operator()(void* ptr) const + { + cudaFree(ptr); + } +}; + +class HostAllocator +{ +public: + bool operator()(void** ptr, size_t size) const + { + *ptr = malloc(size); + return *ptr != nullptr; + } +}; + +class HostFree +{ +public: + void operator()(void* ptr) const + { + free(ptr); + } +}; + +using DeviceBuffer = GenericBuffer; +using HostBuffer = GenericBuffer; + +//! +//! \brief The ManagedBuffer class groups together a pair of corresponding device and host buffers. +//! +class ManagedBuffer +{ +public: + DeviceBuffer deviceBuffer; + HostBuffer hostBuffer; +}; + +//! +//! \brief The BufferManager class handles host and device buffer allocation and deallocation. +//! +//! \details This RAII class handles host and device buffer allocation and deallocation, +//! memcpy between host and device buffers to aid with inference, +//! and debugging dumps to validate inference. The BufferManager class is meant to be +//! used to simplify buffer management and any interactions between buffers and the engine. +//! +class BufferManager +{ +public: + static const size_t kINVALID_SIZE_VALUE = ~size_t(0); + + //! + //! \brief Create a BufferManager for handling buffer interactions with engine. + //! + BufferManager(std::shared_ptr engine, const int batchSize = 0, + const nvinfer1::IExecutionContext* context = nullptr) + : mEngine(engine) + , mBatchSize(batchSize) + { + // Full Dims implies no batch size. 
+ assert(engine->hasImplicitBatchDimension() || mBatchSize == 0); + // Create host and device buffers + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + auto dims = context ? context->getBindingDimensions(i) : mEngine->getBindingDimensions(i); + size_t vol = context || !mBatchSize ? 1 : static_cast(mBatchSize); + nvinfer1::DataType type = mEngine->getBindingDataType(i); + int vecDim = mEngine->getBindingVectorizedDim(i); + if (-1 != vecDim) // i.e., 0 != lgScalarsPerVector + { + int scalarsPerVec = mEngine->getBindingComponentsPerElement(i); + dims.d[vecDim] = divUp(dims.d[vecDim], scalarsPerVec); + vol *= scalarsPerVec; + } + vol *= samplesCommon::volume(dims); + std::unique_ptr manBuf{new ManagedBuffer()}; + manBuf->deviceBuffer = DeviceBuffer(vol, type); + manBuf->hostBuffer = HostBuffer(vol, type); + mDeviceBindings.emplace_back(manBuf->deviceBuffer.data()); + mManagedBuffers.emplace_back(std::move(manBuf)); + } + } + + //! + //! \brief Returns a vector of device buffers that you can use directly as + //! bindings for the execute and enqueue methods of IExecutionContext. + //! + std::vector& getDeviceBindings() + { + return mDeviceBindings; + } + + //! + //! \brief Returns a vector of device buffers. + //! + const std::vector& getDeviceBindings() const + { + return mDeviceBindings; + } + + //! + //! \brief Returns the device buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getDeviceBuffer(const std::string& tensorName) const + { + return getBuffer(false, tensorName); + } + + //! + //! \brief Returns the host buffer corresponding to tensorName. + //! Returns nullptr if no such tensor can be found. + //! + void* getHostBuffer(const std::string& tensorName) const + { + return getBuffer(true, tensorName); + } + + //! + //! \brief Returns the size of the host and device buffers that correspond to tensorName. + //! Returns kINVALID_SIZE_VALUE if no such tensor can be found. + //! 
+ size_t size(const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return kINVALID_SIZE_VALUE; + return mManagedBuffers[index]->hostBuffer.nbBytes(); + } + + //! + //! \brief Templated print function that dumps buffers of arbitrary type to std::ostream. + //! rowCount parameter controls how many elements are on each line. + //! A rowCount of 1 means that there is only 1 element on each line. + //! + template + void print(std::ostream& os, void* buf, size_t bufSize, size_t rowCount) + { + assert(rowCount != 0); + assert(bufSize % sizeof(T) == 0); + T* typedBuf = static_cast(buf); + size_t numItems = bufSize / sizeof(T); + for (int i = 0; i < static_cast(numItems); i++) + { + // Handle rowCount == 1 case + if (rowCount == 1 && i != static_cast(numItems) - 1) + os << typedBuf[i] << std::endl; + else if (rowCount == 1) + os << typedBuf[i]; + // Handle rowCount > 1 case + else if (i % rowCount == 0) + os << typedBuf[i]; + else if (i % rowCount == rowCount - 1) + os << " " << typedBuf[i] << std::endl; + else + os << " " << typedBuf[i]; + } + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers synchronously. + //! + void copyInputToDevice() + { + memcpyBuffers(true, false, false); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers synchronously. + //! + void copyOutputToHost() + { + memcpyBuffers(false, true, false); + } + + //! + //! \brief Copy the contents of input host buffers to input device buffers asynchronously. + //! + void copyInputToDeviceAsync(const cudaStream_t& stream = 0) + { + memcpyBuffers(true, false, true, stream); + } + + //! + //! \brief Copy the contents of output device buffers to output host buffers asynchronously. + //! 
+ void copyOutputToHostAsync(const cudaStream_t& stream = 0) + { + memcpyBuffers(false, true, true, stream); + } + + ~BufferManager() = default; + +private: + void* getBuffer(const bool isHost, const std::string& tensorName) const + { + int index = mEngine->getBindingIndex(tensorName.c_str()); + if (index == -1) + return nullptr; + return (isHost ? mManagedBuffers[index]->hostBuffer.data() : mManagedBuffers[index]->deviceBuffer.data()); + } + + void memcpyBuffers(const bool copyInput, const bool deviceToHost, const bool async, const cudaStream_t& stream = 0) + { + for (int i = 0; i < mEngine->getNbBindings(); i++) + { + void* dstPtr + = deviceToHost ? mManagedBuffers[i]->hostBuffer.data() : mManagedBuffers[i]->deviceBuffer.data(); + const void* srcPtr + = deviceToHost ? mManagedBuffers[i]->deviceBuffer.data() : mManagedBuffers[i]->hostBuffer.data(); + const size_t byteSize = mManagedBuffers[i]->hostBuffer.nbBytes(); + const cudaMemcpyKind memcpyType = deviceToHost ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice; + if ((copyInput && mEngine->bindingIsInput(i)) || (!copyInput && !mEngine->bindingIsInput(i))) + { + if (async) + CHECK(cudaMemcpyAsync(dstPtr, srcPtr, byteSize, memcpyType, stream)); + else + CHECK(cudaMemcpy(dstPtr, srcPtr, byteSize, memcpyType)); + } + } + } + + std::shared_ptr mEngine; //!< The pointer to the engine + int mBatchSize; //!< The batch size for legacy networks, 0 otherwise. 
+ std::vector> mManagedBuffers; //!< The vector of pointers to managed buffers + std::vector mDeviceBindings; //!< The vector of device buffers needed for engine execution +}; + +} // namespace samplesCommon + +#endif // TENSORRT_BUFFERS_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/common.h b/Code/TestTRTInterDll/trtinfer_lib/common/common.h new file mode 100644 index 0000000..cc52916 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/common.h @@ -0,0 +1,1222 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_COMMON_H +#define TENSORRT_COMMON_H + +// For loadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. 
+#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "logger.h" +#include "sampleEntrypoints.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#else +#include // fileno +#include // lockf +#endif + +#include "safeCommon.h" + +#ifdef _MSC_VER +#define FN_NAME __FUNCTION__ +#else +#define FN_NAME __func__ +#endif + +#if defined(__aarch64__) || defined(__QNX__) +#define ENABLE_DLA_API 1 +#endif + +#define CHECK_RETURN_W_MSG(status, val, errMsg) \ + do \ + { \ + if (!(status)) \ + { \ + sample::gLogError << errMsg << " Error in " << __FILE__ << ", function " << FN_NAME << "(), line " << __LINE__ \ + << std::endl; \ + return val; \ + } \ + } while (0) + +#undef ASSERT +#define ASSERT(condition) \ + do \ + { \ + if (!(condition)) \ + { \ + sample::gLogError << "Assertion failure: " << #condition << std::endl; \ + abort(); \ + } \ + } while (0) + + +#define CHECK_RETURN(status, val) CHECK_RETURN_W_MSG(status, val, "") + +#define OBJ_GUARD(A) std::unique_ptr + +template +OBJ_GUARD(T) +makeObjGuard(T_* t) +{ + CHECK(!(std::is_base_of::value || std::is_same::value)); + auto deleter = [](T* t) { t->destroy(); }; + return std::unique_ptr{static_cast(t), deleter}; +} + +constexpr long double operator"" _GiB(long double val) +{ + return val * (1 << 30); +} +constexpr long double operator"" _MiB(long double val) +{ + return val * (1 << 20); +} +constexpr long double operator"" _KiB(long double val) +{ + return val * (1 << 10); +} + +struct SimpleProfiler : public nvinfer1::IProfiler +{ + struct Record + { + float time{0}; + int count{0}; + }; + + void reportLayerTime(const char* layerName, float ms) noexcept override + { + mProfile[layerName].count++; + mProfile[layerName].time += ms; + if (std::find(mLayerNames.begin(), 
mLayerNames.end(), layerName) == mLayerNames.end()) + { + mLayerNames.push_back(layerName); + } + } + + SimpleProfiler(const char* name, const std::vector& srcProfilers = std::vector()) + : mName(name) + { + for (const auto& srcProfiler : srcProfilers) + { + for (const auto& rec : srcProfiler.mProfile) + { + auto it = mProfile.find(rec.first); + if (it == mProfile.end()) + { + mProfile.insert(rec); + } + else + { + it->second.time += rec.second.time; + it->second.count += rec.second.count; + } + } + } + } + + friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value) + { + out << "========== " << value.mName << " profile ==========" << std::endl; + float totalTime = 0; + std::string layerNameStr = "TensorRT layer name"; + int maxLayerNameLength = std::max(static_cast(layerNameStr.size()), 70); + for (const auto& elem : value.mProfile) + { + totalTime += elem.second.time; + maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size())); + } + + auto old_settings = out.flags(); + auto old_precision = out.precision(); + // Output header + { + out << std::setfill(' ') << std::setw(maxLayerNameLength) << layerNameStr << " "; + out << std::setw(12) << "Runtime, " + << "%" + << " "; + out << std::setw(12) << "Invocations" + << " "; + out << std::setw(12) << "Runtime, ms" << std::endl; + } + for (size_t i = 0; i < value.mLayerNames.size(); i++) + { + const std::string layerName = value.mLayerNames[i]; + auto elem = value.mProfile.at(layerName); + out << std::setw(maxLayerNameLength) << layerName << " "; + out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.time * 100.0F / totalTime) << "%" + << " "; + out << std::setw(12) << elem.count << " "; + out << std::setw(12) << std::fixed << std::setprecision(2) << elem.time << std::endl; + } + out.flags(old_settings); + out.precision(old_precision); + out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl; + + return out; + } 
+ +private: + std::string mName; + std::vector mLayerNames; + std::map mProfile; +}; + +//! Locate path to file, given its filename or filepath suffix and possible dirs it might lie in. +//! Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path. +inline std::string locateFile( + const std::string& filepathSuffix, const std::vector& directories, bool reportError = true) +{ + const int MAX_DEPTH{10}; + bool found{false}; + std::string filepath; + + for (auto& dir : directories) + { + if (!dir.empty() && dir.back() != '/') + { +#ifdef _MSC_VER + filepath = dir + "\\" + filepathSuffix; +#else + filepath = dir + "/" + filepathSuffix; +#endif + } + else + { + filepath = dir + filepathSuffix; + } + + for (int i = 0; i < MAX_DEPTH && !found; i++) + { + const std::ifstream checkFile(filepath); + found = checkFile.is_open(); + if (found) + { + break; + } + + filepath = "../" + filepath; // Try again in parent dir + } + + if (found) + { + break; + } + + filepath.clear(); + } + + // Could not find the file + if (filepath.empty()) + { + const std::string dirList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(), + [](const std::string& a, const std::string& b) { return a + "\n\t" + b; }); + std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << dirList << std::endl; + + if (reportError) + { + std::cout << "&&&& FAILED" << std::endl; + exit(EXIT_FAILURE); + } + } + + return filepath; +} + +inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW) +{ + std::ifstream infile(fileName, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + std::string magic, h, w, max; + infile >> magic >> h >> w >> max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(buffer), inH * inW); +} + +namespace samplesCommon +{ + +// Swaps endianness of an integral type. 
+template ::value, int>::type = 0> +inline T swapEndianness(const T& value) +{ + uint8_t bytes[sizeof(T)]; + for (int i = 0; i < static_cast(sizeof(T)); ++i) + { + bytes[sizeof(T) - 1 - i] = *(reinterpret_cast(&value) + i); + } + return *reinterpret_cast(bytes); +} + +class HostMemory +{ +public: + HostMemory() = delete; + virtual void* data() const noexcept + { + return mData; + } + virtual std::size_t size() const noexcept + { + return mSize; + } + virtual nvinfer1::DataType type() const noexcept + { + return mType; + } + virtual ~HostMemory() {} + +protected: + HostMemory(std::size_t size, nvinfer1::DataType type) + : mData{nullptr} + , mSize(size) + , mType(type) + { + } + void* mData; + std::size_t mSize; + nvinfer1::DataType mType; +}; + +template +class TypedHostMemory : public HostMemory +{ +public: + explicit TypedHostMemory(std::size_t size) + : HostMemory(size, dataType) + { + mData = new ElemType[size]; + }; + ~TypedHostMemory() noexcept override + { + delete[](ElemType*) mData; + } + ElemType* raw() noexcept + { + return static_cast(data()); + } +}; + +using FloatMemory = TypedHostMemory; +using HalfMemory = TypedHostMemory; +using ByteMemory = TypedHostMemory; + +inline void* safeCudaMalloc(size_t memSize) +{ + void* deviceMem; + CHECK(cudaMalloc(&deviceMem, memSize)); + if (deviceMem == nullptr) + { + std::cerr << "Out of memory" << std::endl; + exit(1); + } + return deviceMem; +} + +inline bool isDebug() +{ + return (std::getenv("TENSORRT_DEBUG") ? 
true : false); +} + +struct InferDeleter +{ + template + void operator()(T* obj) const + { + delete obj; + } +}; + +template +using SampleUniquePtr = std::unique_ptr; + +static auto StreamDeleter = [](cudaStream_t* pStream) + { + if (pStream) + { + cudaStreamDestroy(*pStream); + delete pStream; + } + }; + +inline std::unique_ptr makeCudaStream() +{ + std::unique_ptr pStream(new cudaStream_t, StreamDeleter); + if (cudaStreamCreateWithFlags(pStream.get(), cudaStreamNonBlocking) != cudaSuccess) + { + pStream.reset(nullptr); + } + + return pStream; +} + +//! Return vector of indices that puts magnitudes of sequence in descending order. +template +std::vector argMagnitudeSort(Iter begin, Iter end) +{ + std::vector indices(end - begin); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), [&begin](size_t i, size_t j) { return std::abs(begin[j]) < std::abs(begin[i]); }); + return indices; +} + +inline bool readReferenceFile(const std::string& fileName, std::vector& refVector) +{ + std::ifstream infile(fileName); + if (!infile.is_open()) + { + std::cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << std::endl; + return false; + } + std::string line; + while (std::getline(infile, line)) + { + if (line.empty()) + continue; + refVector.push_back(line); + } + infile.close(); + return true; +} + +template +std::vector classify( + const std::vector& refVector, const std::vector& output, const size_t topK) +{ + const auto inds = samplesCommon::argMagnitudeSort(output.cbegin(), output.cend()); + std::vector result; + result.reserve(topK); + for (size_t k = 0; k < topK; ++k) + { + result.push_back(refVector[inds[k]]); + } + return result; +} + +// Returns indices of highest K magnitudes in v. 
+template +std::vector topKMagnitudes(const std::vector& v, const size_t k) +{ + std::vector indices = samplesCommon::argMagnitudeSort(v.cbegin(), v.cend()); + indices.resize(k); + return indices; +} + +template +bool readASCIIFile(const std::string& fileName, const size_t size, std::vector& out) +{ + std::ifstream infile(fileName); + if (!infile.is_open()) + { + std::cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << std::endl; + return false; + } + out.clear(); + out.reserve(size); + out.assign(std::istream_iterator(infile), std::istream_iterator()); + infile.close(); + return true; +} + +template +bool writeASCIIFile(const std::string& fileName, const std::vector& in) +{ + std::ofstream outfile(fileName); + if (!outfile.is_open()) + { + std::cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << std::endl; + return false; + } + for (auto fn : in) + { + outfile << fn << "\n"; + } + outfile.close(); + return true; +} + +inline void print_version() +{ + std::cout << " TensorRT version: " << NV_TENSORRT_MAJOR << "." << NV_TENSORRT_MINOR << "." << NV_TENSORRT_PATCH + << "." 
<< NV_TENSORRT_BUILD << std::endl; +} + +inline std::string getFileType(const std::string& filepath) +{ + return filepath.substr(filepath.find_last_of(".") + 1); +} + +inline std::string toLower(const std::string& inp) +{ + std::string out = inp; + std::transform(out.begin(), out.end(), out.begin(), ::tolower); + return out; +} + +inline float getMaxValue(const float* buffer, int64_t size) +{ + assert(buffer != nullptr); + assert(size > 0); + return *std::max_element(buffer, buffer + size); +} + +inline int32_t calculateSoftmax(float* const prob, int32_t const numDigits) +{ + ASSERT(prob != nullptr); + ASSERT(numDigits == 10); + float sum{0.0F}; + std::transform(prob, prob + numDigits, prob, [&sum](float v) -> float { + sum += exp(v); + return exp(v); + }); + + ASSERT(sum != 0.0F); + std::transform(prob, prob + numDigits, prob, [sum](float v) -> float { return v / sum; }); + int32_t idx = std::max_element(prob, prob + numDigits) - prob; + return idx; +} + +// Ensures that every tensor used by a network has a dynamic range set. +// +// All tensors in a network must have a dynamic range specified if a calibrator is not used. +// This function is just a utility to globally fill in missing scales and zero-points for the entire network. +// +// If a tensor does not have a dyanamic range set, it is assigned inRange or outRange as follows: +// +// * If the tensor is the input to a layer or output of a pooling node, its dynamic range is derived from inRange. +// * Otherwise its dynamic range is derived from outRange. +// +// The default parameter values are intended to demonstrate, for final layers in the network, +// cases where dynamic ranges are asymmetric. +// +// The default parameter values choosen arbitrarily. Range values should be choosen such that +// we avoid underflow or overflow. Also range value should be non zero to avoid uniform zero scale tensor. 
+inline void setAllDynamicRanges(nvinfer1::INetworkDefinition* network, float inRange = 2.0f, float outRange = 4.0f) +{ + // Ensure that all layer inputs have a scale. + for (int i = 0; i < network->getNbLayers(); i++) + { + auto layer = network->getLayer(i); + for (int j = 0; j < layer->getNbInputs(); j++) + { + nvinfer1::ITensor* input{layer->getInput(j)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input != nullptr && !input->dynamicRangeIsSet()) + { + ASSERT(input->setDynamicRange(-inRange, inRange)); + } + } + } + + // Ensure that all layer outputs have a scale. + // Tensors that are also inputs to layers are ingored here + // since the previous loop nest assigned scales to them. + for (int i = 0; i < network->getNbLayers(); i++) + { + auto layer = network->getLayer(i); + for (int j = 0; j < layer->getNbOutputs(); j++) + { + nvinfer1::ITensor* output{layer->getOutput(j)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output != nullptr && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output scales. + if (layer->getType() == nvinfer1::LayerType::kPOOLING) + { + ASSERT(output->setDynamicRange(-inRange, inRange)); + } + else + { + ASSERT(output->setDynamicRange(-outRange, outRange)); + } + } + } + } +} + +inline void setDummyInt8DynamicRanges(const nvinfer1::IBuilderConfig* c, nvinfer1::INetworkDefinition* n) +{ + // Set dummy per-tensor dynamic range if Int8 mode is requested. + if (c->getFlag(nvinfer1::BuilderFlag::kINT8)) + { + sample::gLogWarning << "Int8 calibrator not provided. Generating dummy per-tensor dynamic range. Int8 accuracy " + "is not guaranteed." 
+ << std::endl; + setAllDynamicRanges(n); + } +} + +inline void enableDLA( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, int useDLACore, bool allowGPUFallback = true) +{ + if (useDLACore >= 0) + { + if (builder->getNbDLACores() == 0) + { + std::cerr << "Trying to use DLA core " << useDLACore << " on a platform that doesn't have any DLA cores" + << std::endl; + assert("Error: use DLA core on a platfrom that doesn't have any DLA cores" && false); + } + if (allowGPUFallback) + { + config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); + } + if (!config->getFlag(nvinfer1::BuilderFlag::kINT8)) + { + // User has not requested INT8 Mode. + // By default run in FP16 mode. FP32 mode is not permitted. + config->setFlag(nvinfer1::BuilderFlag::kFP16); + } + config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + config->setDLACore(useDLACore); + } +} + +inline int32_t parseDLA(int32_t argc, char** argv) +{ + for (int32_t i = 1; i < argc; i++) + { + if (strncmp(argv[i], "--useDLACore=", 13) == 0) + { + return std::stoi(argv[i] + 13); + } + } + return -1; +} + +inline uint32_t getElementSize(nvinfer1::DataType t) noexcept +{ + switch (t) + { + case nvinfer1::DataType::kINT32: return 4; + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1; + } + return 0; +} + +inline int64_t volume(nvinfer1::Dims const& dims, int32_t start, int32_t stop) +{ + ASSERT(start >= 0); + ASSERT(start <= stop); + ASSERT(stop <= dims.nbDims); + ASSERT(std::all_of(dims.d + start, dims.d + stop, [](int32_t x) { return x >= 0; })); + return std::accumulate(dims.d + start, dims.d + stop, int64_t{1}, std::multiplies{}); +} + +template +struct PPM +{ + std::string magic, fileName; + int h, w, max; + uint8_t buffer[C * H * W]; +}; + +// New vPPM(variable sized PPM) class with variable dimensions. 
+struct vPPM +{ + std::string magic, fileName; + int h, w, max; + std::vector buffer; +}; + +struct BBox +{ + float x1, y1, x2, y2; +}; + +template +void readPPMFile(const std::string& filename, samplesCommon::PPM& ppm) +{ + ppm.fileName = filename; + std::ifstream infile(filename, std::ifstream::binary); + assert(infile.is_open() && "Attempting to read from a file that is not open."); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + infile.read(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void readPPMFile(const std::string& filename, vPPM& ppm, std::vector& input_dir) +{ + ppm.fileName = filename; + std::ifstream infile(locateFile(filename, input_dir), std::ifstream::binary); + infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max; + infile.seekg(1, infile.cur); + + for (int i = 0; i < ppm.w * ppm.h * 3; ++i) + { + ppm.buffer.push_back(0); + } + + infile.read(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +template +void writePPMFileWithBBox(const std::string& filename, PPM& ppm, const BBox& bbox) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1); + const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1); + const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1); + const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1); + + for (int x = x1; x <= x2; ++x) + { + // bbox top border + ppm.buffer[(y1 * ppm.w + x) * 3] = 255; + ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(y2 * ppm.w + x) * 3] = 255; + ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = y1; y <= y2; ++y) + { + // bbox 
left border + ppm.buffer[(y * ppm.w + x1) * 3] = 255; + ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + x2) * 3] = 255; + ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0; + } + + outfile.write(reinterpret_cast(ppm.buffer), ppm.w * ppm.h * 3); +} + +inline void writePPMFileWithBBox(const std::string& filename, vPPM ppm, std::vector& dets) +{ + std::ofstream outfile("./" + filename, std::ofstream::binary); + assert(!outfile.fail()); + outfile << "P6" + << "\n" + << ppm.w << " " << ppm.h << "\n" + << ppm.max << "\n"; + auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); }; + + for (auto bbox : dets) + { + for (int x = int(bbox.x1); x < int(bbox.x2); ++x) + { + // bbox top border + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y1) * ppm.w + x) * 3 + 2] = 0; + // bbox bottom border + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3] = 255; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 1] = 0; + ppm.buffer[(round(bbox.y2) * ppm.w + x) * 3 + 2] = 0; + } + + for (int y = int(bbox.y1); y < int(bbox.y2); ++y) + { + // bbox left border + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x1)) * 3 + 2] = 0; + // bbox right border + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3] = 255; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 1] = 0; + ppm.buffer[(y * ppm.w + round(bbox.x2)) * 3 + 2] = 0; + } + } + + outfile.write(reinterpret_cast(&ppm.buffer[0]), ppm.w * ppm.h * 3); +} + +class TimerBase +{ +public: + virtual void start() {} + virtual void stop() {} + float microseconds() const noexcept + { + return mMs * 1000.f; + } + float milliseconds() const noexcept + { + return mMs; + } + float seconds() const noexcept + { + return mMs / 1000.f; + } + void reset() 
noexcept + { + mMs = 0.f; + } + +protected: + float mMs{0.0f}; +}; + +class GpuTimer : public TimerBase +{ +public: + explicit GpuTimer(cudaStream_t stream) + : mStream(stream) + { + CHECK(cudaEventCreate(&mStart)); + CHECK(cudaEventCreate(&mStop)); + } + ~GpuTimer() + { + CHECK(cudaEventDestroy(mStart)); + CHECK(cudaEventDestroy(mStop)); + } + void start() override + { + CHECK(cudaEventRecord(mStart, mStream)); + } + void stop() override + { + CHECK(cudaEventRecord(mStop, mStream)); + float ms{0.0f}; + CHECK(cudaEventSynchronize(mStop)); + CHECK(cudaEventElapsedTime(&ms, mStart, mStop)); + mMs += ms; + } + +private: + cudaEvent_t mStart, mStop; + cudaStream_t mStream; +}; // class GpuTimer + +template +class CpuTimer : public TimerBase +{ +public: + using clock_type = Clock; + + void start() override + { + mStart = Clock::now(); + } + void stop() override + { + mStop = Clock::now(); + mMs += std::chrono::duration{mStop - mStart}.count(); + } + +private: + std::chrono::time_point mStart, mStop; +}; // class CpuTimer + +using PreciseCpuTimer = CpuTimer; + +inline std::vector splitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +inline int getC(nvinfer1::Dims const& d) +{ + return d.nbDims >= 3 ? d.d[d.nbDims - 3] : 1; +} + +inline int getH(const nvinfer1::Dims& d) +{ + return d.nbDims >= 2 ? d.d[d.nbDims - 2] : 1; +} + +inline int getW(const nvinfer1::Dims& d) +{ + return d.nbDims >= 1 ? d.d[d.nbDims - 1] : 1; +} + +//! Platform-agnostic wrapper around dynamic libraries. 
+class DynamicLibrary +{ +public: + explicit DynamicLibrary(std::string const& name) + : mLibName{name} + { +#if defined(_WIN32) + mHandle = LoadLibrary(name.c_str()); +#else // defined(_WIN32) + int32_t flags{RTLD_LAZY}; +#if ENABLE_ASAN + // https://github.com/google/sanitizers/issues/89 + // asan doesn't handle module unloading correctly and there are no plans on doing + // so. In order to get proper stack traces, don't delete the shared library on + // close so that asan can resolve the symbols correctly. + flags |= RTLD_NODELETE; +#endif // ENABLE_ASAN + + mHandle = dlopen(name.c_str(), flags); +#endif // defined(_WIN32) + + if (mHandle == nullptr) + { + std::string errorStr{}; +#if !defined(_WIN32) + errorStr = std::string{" due to "} + std::string{dlerror()}; +#endif + throw std::runtime_error("Unable to open library: " + name + errorStr); + } + } + + DynamicLibrary(DynamicLibrary const&) = delete; + DynamicLibrary(DynamicLibrary const&&) = delete; + + //! + //! Retrieve a function symbol from the loaded library. + //! + //! \return the loaded symbol on success + //! \throw std::invalid_argument if loading the symbol failed. + //! + template + std::function symbolAddress(char const* name) + { + if (mHandle == nullptr) + { + throw std::runtime_error("Handle to library is nullptr."); + } + void* ret; +#if defined(_MSC_VER) + ret = static_cast(GetProcAddress(static_cast(mHandle), name)); +#else + ret = dlsym(mHandle, name); +#endif + if (ret == nullptr) + { + std::string const kERROR_MSG(mLibName + ": error loading symbol: " + std::string(name)); + throw std::invalid_argument(kERROR_MSG); + } + return reinterpret_cast(ret); + } + + ~DynamicLibrary() + { + try + { +#if defined(_WIN32) + ASSERT(static_cast(FreeLibrary(static_cast(mHandle)))); +#else + ASSERT(dlclose(mHandle) == 0); +#endif + } + catch (...) 
+ { + sample::gLogError << "Unable to close library: " << mLibName << std::endl; + } + } + +private: + std::string mLibName{}; //!< Name of the DynamicLibrary + void* mHandle{}; //!< Handle to the DynamicLibrary +}; + +inline std::unique_ptr loadLibrary(std::string const& path) +{ + // make_unique not available until C++14 - we still need to support C++11 builds. + return std::unique_ptr(new DynamicLibrary{path}); +} + +inline int32_t getSMVersion() +{ + int32_t deviceIndex = 0; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t major, minor; + CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, deviceIndex)); + CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, deviceIndex)); + + return ((major << 8) | minor); +} + +inline bool isSMSafe() +{ + const int32_t smVersion = getSMVersion(); + return smVersion == 0x0700 || smVersion == 0x0702 || smVersion == 0x0705 || smVersion == 0x0800 + || smVersion == 0x0806 || smVersion == 0x0807; +} + +inline int32_t getMaxPersistentCacheSize() +{ + int32_t deviceIndex{}; + CHECK(cudaGetDevice(&deviceIndex)); + + int32_t maxPersistentL2CacheSize; +#if CUDART_VERSION >= 11030 + CHECK(cudaDeviceGetAttribute(&maxPersistentL2CacheSize, cudaDevAttrMaxPersistingL2CacheSize, deviceIndex)); +#else + maxPersistentL2CacheSize = 0; +#endif + + return maxPersistentL2CacheSize; +} + +inline bool isDataTypeSupported(nvinfer1::DataType dataType) +{ + auto builder = SampleUniquePtr(createBuilder()); + if (!builder) + { + return false; + } + + if ((dataType == nvinfer1::DataType::kINT8 && !builder->platformHasFastInt8()) + || (dataType == nvinfer1::DataType::kHALF && !builder->platformHasFastFp16())) + { + return false; + } + + return true; +} + +class FileLock +{ +public: + FileLock(std::string const& fileName) + : fileName(fileName) + { + std::string lockFileName = fileName + ".lock"; +#ifdef _MSC_VER + sample::gLogVerbose << "Trying to set exclusive file lock " << lockFileName << std::endl; + auto 
startTime = std::chrono::high_resolution_clock::now(); + // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided + lock = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); + if (lock != INVALID_HANDLE_VALUE) + { + float const time + = std::chrono::duration(std::chrono::high_resolution_clock::now() - startTime).count(); + sample::gLogVerbose << "File locked in " << time << " seconds." << std::endl; + } + else + { + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89 + // That means : Function not implemented +#else + fp = fopen(lockFileName.c_str(), "wb+"); + if (!fp) + { + throw std::runtime_error("Cannot open " + lockFileName + "!"); + } + fd = fileno(fp); + sample::gLogVerbose << "Trying to set exclusive file lock " << lockFileName << std::endl; + auto startTime = std::chrono::high_resolution_clock::now(); + auto ret = lockf(fd, F_LOCK, 0); + if (ret != 0) + { + fd = -1; + fclose(fp); + throw std::runtime_error("Failed to lock " + lockFileName + "!"); + } + float const time = std::chrono::duration(std::chrono::high_resolution_clock::now() - startTime).count(); + sample::gLogVerbose << "File locked in " << time << " seconds." << std::endl; +#endif + } + + ~FileLock() + { + std::string lockFileName = fileName + ".lock"; +#ifdef _MSC_VER + if (lock != INVALID_HANDLE_VALUE) + { + sample::gLogVerbose << "Trying to remove exclusive file lock " << lockFileName << std::endl; + auto startTime = std::chrono::high_resolution_clock::now(); + CloseHandle(lock); + float const time + = std::chrono::duration(std::chrono::high_resolution_clock::now() - startTime).count(); + sample::gLogVerbose << "File unlocked in " << time << " seconds." 
<< std::endl; + } +#elif defined(__QNX__) + // We once enabled the file lock on QNX, lockf(F_TLOCK) return -1 and the reported error is + // The error generated was 89 + // That means : Function not implemented +#else + if (fd != -1) + { + sample::gLogVerbose << "Trying to remove exclusive file lock " << lockFileName << std::endl; + auto startTime = std::chrono::high_resolution_clock::now(); + auto ret = lockf(fd, F_ULOCK, 0); + if (ret != 0) + { + sample::gLogVerbose << "Failed to unlock " << lockFileName << "!" << std::endl; + } + else + { + fd = -1; + fclose(fp); + float const time + = std::chrono::duration(std::chrono::high_resolution_clock::now() - startTime).count(); + sample::gLogVerbose << "File unlocked in " << time << " seconds." << std::endl; + } + } +#endif + } + +private: + FileLock() = delete; // no default ctor + FileLock(FileLock const&) = delete; // no copy ctor + FileLock& operator=(FileLock const&) = delete; // no copy assignment + + const std::string fileName; // the file being protected +#ifdef _MSC_VER + HANDLE lock; +#else + FILE* fp; + int32_t fd; +#endif +}; + +inline std::vector loadTimingCacheFile(std::string const& inFileName) +{ + std::unique_ptr fileLock{new samplesCommon::FileLock(inFileName)}; + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) + { + sample::gLogWarning << "Could not read timing cache from: " << inFileName + << ". A new timing cache will be generated and written." 
<< std::endl; + return std::vector(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << inFileName << std::endl; + return content; +} + +inline void saveTimingCacheFile(std::string const& outFileName, nvinfer1::IHostMemory const* blob) +{ + std::unique_ptr fileLock{new samplesCommon::FileLock(outFileName)}; + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) + { + sample::gLogWarning << "Could not write timing cache to: " << outFileName << std::endl; + return; + } + oFile.write((char*) blob->data(), blob->size()); + oFile.close(); + sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << outFileName << std::endl; +} + +inline void updateTimingCacheFile(std::string const& fileName, nvinfer1::ITimingCache const* timingCache) +{ + // Prepare empty timingCache in case that there is no existing file to read + std::unique_ptr builder{createBuilder()}; + std::unique_ptr config{builder->createBuilderConfig()}; + std::unique_ptr fileTimingCache{ + config->createTimingCache(static_cast(nullptr), 0)}; + + std::unique_ptr fileLock{new samplesCommon::FileLock(fileName)}; + std::ifstream iFile(fileName, std::ios::in | std::ios::binary); + if (iFile) + { + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + sample::gLogInfo << "Loaded " << fsize << " bytes of timing cache from " << fileName << std::endl; + fileTimingCache.reset(config->createTimingCache(static_cast(content.data()), content.size())); + if (!fileTimingCache) + { + throw std::runtime_error("Failed to create timingCache from " + fileName + "!"); + } + } + fileTimingCache->combine(*timingCache, false); + 
std::unique_ptr blob{fileTimingCache->serialize()}; + if (!blob) + { + throw std::runtime_error("Failed to serialize ITimingCache!"); + } + std::ofstream oFile(fileName, std::ios::out | std::ios::binary); + if (!oFile) + { + sample::gLogWarning << "Could not write timing cache to: " << fileName << std::endl; + return; + } + oFile.write((char*) blob->data(), blob->size()); + oFile.close(); + sample::gLogInfo << "Saved " << blob->size() << " bytes of timing cache to " << fileName << std::endl; +} + +} // namespace samplesCommon + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + os << "("; + for (int i = 0; i < dims.nbDims; ++i) + { + os << (i ? ", " : "") << dims.d[i]; + } + return os << ")"; +} + +#endif // TENSORRT_COMMON_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/dumpTFWts.py b/Code/TestTRTInterDll/trtinfer_lib/common/dumpTFWts.py new file mode 100644 index 0000000..0b7a012 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/dumpTFWts.py @@ -0,0 +1,124 @@ +#!/usr/bin/python +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to dump TensorFlow weights in TRT v1 and v2 dump format. +# The V1 format is for TensorRT 4.0. The V2 format is for TensorRT 4.0 and later. 
+ +import sys +import struct +import argparse + +try: + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow +except ImportError as err: + sys.stderr.write("""Error: Failed to import module ({})""".format(err)) + sys.exit() + +parser = argparse.ArgumentParser(description="TensorFlow Weight Dumper") + +parser.add_argument( + "-m", + "--model", + required=True, + help="The checkpoint file basename, example basename(model.ckpt-766908.data-00000-of-00001) -> model.ckpt-766908", +) +parser.add_argument("-o", "--output", required=True, help="The weight file to dump all the weights to.") +parser.add_argument("-1", "--wtsv1", required=False, default=False, type=bool, help="Dump the weights in the wts v1.") + +opt = parser.parse_args() + +if opt.wtsv1: + print("Outputting the trained weights in TensorRT's wts v1 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [buffer size] ") +else: + print("Outputting the trained weights in TensorRT's wts v2 format. This format is documented as:") + print("Line 0: ") + print("Line 1-Num: [buffer name] [buffer type] [(buffer shape{e.g. (1, 2, 3)}] ") + +inputbase = opt.model +outputbase = opt.output + + +def float_to_hex(f): + return hex(struct.unpack(" +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! Matching for TRTOptions is defined as follows: +//! +//! If A and B both have longName set, A matches B if and only if A.longName == +//! B.longName and (A.shortName == B.shortName if both have short name set). +//! +//! If A only has shortName set and B only has longName set, then A does not +//! match B. It is assumed that when 2 TRTOptions are compared, one of them is +//! the definition of a TRTOption in the input to getOptions. As such, if the +//! definition only has shortName set, it will never be equal to a TRTOption +//! that does not have shortName set (and same for longName). +//! +//! 
If A and B both have shortName set but B does not have longName set, A +//! matches B if and only if A.shortName == B.shortName. +//! +//! If A has neither long or short name set, A matches B if and only if B has +//! neither long or short name set. +bool matches(const TRTOption& a, const TRTOption& b) +{ + if (!a.longName.empty() && !b.longName.empty()) + { + if (a.shortName && b.shortName) + { + return (a.longName == b.longName) && (a.shortName == b.shortName); + } + return a.longName == b.longName; + } + + // If only one of them is not set, this will return false anyway. + return a.shortName == b.shortName; +} + +//! getTRTOptionIndex returns the index of a TRTOption in a vector of +//! TRTOptions, -1 if not found. +int getTRTOptionIndex(const std::vector& options, const TRTOption& opt) +{ + for (size_t i = 0; i < options.size(); ++i) + { + if (matches(opt, options[i])) + { + return i; + } + } + return -1; +} + +//! validateTRTOption will return a string containing an error message if options +//! contain non-numeric characters, or if there are duplicate option names found. +//! Otherwise, returns the empty string. +std::string validateTRTOption( + const std::set& seenShortNames, const std::set& seenLongNames, const TRTOption& opt) +{ + if (opt.shortName != 0) + { + if (!std::isalnum(opt.shortName)) + { + return "Short name '" + std::to_string(opt.shortName) + "' is non-alphanumeric"; + } + + if (seenShortNames.find(opt.shortName) != seenShortNames.end()) + { + return "Short name '" + std::to_string(opt.shortName) + "' is a duplicate"; + } + } + + if (!opt.longName.empty()) + { + for (const char& c : opt.longName) + { + if (!std::isalnum(c) && c != '-' && c != '_') + { + return "Long name '" + opt.longName + "' contains characters that are not '-', '_', or alphanumeric"; + } + } + + if (seenLongNames.find(opt.longName) != seenLongNames.end()) + { + return "Long name '" + opt.longName + "' is a duplicate"; + } + } + return ""; +} + +//! 
validateTRTOptions will return a string containing an error message if any +//! options contain non-numeric characters, or if there are duplicate option +//! names found. Otherwise, returns the empty string. +std::string validateTRTOptions(const std::vector& options) +{ + std::set seenShortNames; + std::set seenLongNames; + for (size_t i = 0; i < options.size(); ++i) + { + const std::string errMsg = validateTRTOption(seenShortNames, seenLongNames, options[i]); + if (!errMsg.empty()) + { + return "Error '" + errMsg + "' at TRTOption " + std::to_string(i); + } + + seenShortNames.insert(options[i].shortName); + seenLongNames.insert(options[i].longName); + } + return ""; +} + +//! parseArgs parses an argument list and returns a TRTParsedArgs with the +//! fields set accordingly. Assumes that options is validated. +//! ErrMsg will be set if: +//! - an argument is null +//! - an argument is empty +//! - an argument does not have option (i.e. "-" and "--") +//! - a short argument has more than 1 character +//! - the last argument in the list requires a value +TRTParsedArgs parseArgs(int argc, const char* const* argv, const std::vector& options) +{ + TRTParsedArgs parsedArgs; + parsedArgs.values.resize(options.size()); + + for (int i = 1; i < argc; ++i) // index of current command-line argument + { + if (argv[i] == nullptr) + { + return TRTParsedArgs{"Null argument at index " + std::to_string(i)}; + } + + const std::string argStr(argv[i]); + if (argStr.empty()) + { + return TRTParsedArgs{"Empty argument at index " + std::to_string(i)}; + } + + // No starting hyphen means it is a positional argument + if (argStr[0] != '-') + { + parsedArgs.positionalArgs.push_back(argStr); + continue; + } + + if (argStr == "-" || argStr == "--") + { + return TRTParsedArgs{"Argument does not specify an option at index " + std::to_string(i)}; + } + + // If only 1 hyphen, char after is the flag. 
+ TRTOption opt{' ', "", false, ""}; + std::string value; + if (argStr[1] != '-') + { + // Must only have 1 char after the hyphen + if (argStr.size() > 2) + { + return TRTParsedArgs{"Short arg contains more than 1 character at index " + std::to_string(i)}; + } + opt.shortName = argStr[1]; + } + else + { + opt.longName = argStr.substr(2); + + // We need to support --foo=bar syntax, so look for '=' + const size_t eqIndex = opt.longName.find('='); + if (eqIndex < opt.longName.size()) + { + value = opt.longName.substr(eqIndex + 1); + opt.longName = opt.longName.substr(0, eqIndex); + } + } + + const int idx = getTRTOptionIndex(options, opt); + if (idx < 0) + { + continue; + } + + if (options[idx].valueRequired) + { + if (!value.empty()) + { + parsedArgs.values[idx].second.push_back(value); + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + continue; + } + + if (i + 1 >= argc) + { + return TRTParsedArgs{"Last argument requires value, but none given"}; + } + + const std::string nextArg(argv[i + 1]); + if (nextArg.size() >= 1 && nextArg[0] == '-') + { + sample::gLogWarning << "Warning: Using '" << nextArg << "' as a value for '" << argStr + << "', Should this be its own flag?" 
<< std::endl; + } + + parsedArgs.values[idx].second.push_back(nextArg); + i += 1; // Next argument already consumed + + parsedArgs.values[idx].first = parsedArgs.values[idx].second.size(); + } + else + { + parsedArgs.values[idx].first += 1; + } + } + return parsedArgs; +} + +TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector& options) +{ + const std::string errMsg = validateTRTOptions(options); + if (!errMsg.empty()) + { + return TRTParsedArgs{errMsg}; + } + return parseArgs(argc, argv, options); +} +} // namespace utility +} // namespace nvinfer1 diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/getOptions.h b/Code/TestTRTInterDll/trtinfer_lib/common/getOptions.h new file mode 100644 index 0000000..e846051 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/getOptions.h @@ -0,0 +1,128 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_GET_OPTIONS_H +#define TRT_GET_OPTIONS_H + +#include +#include +#include + +namespace nvinfer1 +{ +namespace utility +{ + +//! TRTOption defines a command line option. At least 1 of shortName and longName +//! must be defined. +//! If bool initialization is undefined behavior on your system, valueRequired +//! must also be explicitly defined. +//! helpText is optional. 
struct TRTOption
{
    char shortName;        //!< Option name in short (single hyphen) form (i.e. -a, -b)
    std::string longName;  //!< Option name in long (double hyphen) form (i.e. --foo, --bar)
    bool valueRequired;    //!< True if a value is needed for an option (i.e. -N 4, --foo bar)
    std::string helpText;  //!< Text to show when printing out the command usage
};

//! TRTParsedArgs is returned by getOptions after it has parsed a command line
//! argument list (argv).
//!
//! errMsg is a string containing an error message if any errors occurred. If it
//! is empty, no errors occurred.
//!
//! values stores a vector of pairs for each option (ordered by order in the
//! input). Each pair contains an int (the number of occurrences) and a vector
//! of strings (a list of values). The user should know which of these to use,
//! and which options required values. For non-value options, only occurrences is
//! populated. For value-required options, occurrences == # of values. Values do
//! not need to be unique.
//!
//! positionalArgs stores additional arguments that are passed in without an
//! option (these must not start with a hyphen).
//!
//! NOTE(review): the template arguments of `values` were garbled in extraction and have
//! been reconstructed from the documented "(occurrences, values)" pair layout — confirm
//! against the upstream header.
struct TRTParsedArgs
{
    std::string errMsg;
    std::vector<std::pair<int, std::vector<std::string>>> values;
    std::vector<std::string> positionalArgs;
};

//! Parse the input arguments passed to main() and extract options as well as
//! positional arguments.
//!
//! Options are supposed to be passed to main() with a preceding hyphen '-'.
//!
//! If there is a single preceding hyphen, there should be exactly 1 character
//! after the hyphen, which is interpreted as the option.
//!
//! If there are 2 preceding hyphens, the entire argument (without the hyphens)
//! is interpreted as the option.
//!
//! If the option requires a value, the next argument is used as the value.
//!
//! Positional arguments must not start with a hyphen.
//!
//! If an argument requires a value, the next argument is interpreted as the
//! value, even if it is the form of a valid option (i.e. --foo --bar will store
//! "--bar" as a value for option "foo" if "foo" requires a value).
//! We also support --name=value syntax. In this case, 'value' would be used as
//! the value, NOT the next argument.
//!
//! For options:
//!   { { 'a', "", false },
//!     { 'b', "", false },
//!     { 0, "cee", false },
//!     { 'd', "", true },
//!     { 'e', "", true },
//!     { 'f', "foo", true } }
//!
//! ./main hello world -a -a --cee -d 12 -f 34
//! and
//! ./main hello world -a -a --cee -d 12 --foo 34
//!
//! will result in:
//!
//! TRTParsedArgs {
//!      errMsg: "",
//!      values: { { 2, {} },
//!                { 0, {} },
//!                { 1, {} },
//!                { 1, {"12"} },
//!                { 0, {} },
//!                { 1, {"34"} } }
//!      positionalArgs: {"hello", "world"},
//! }
//!
//! Non-POSIX behavior:
//!     - Does not support "-abcde" as a shorthand for "-a -b -c -d -e". Each
//!       option must have its own hyphen prefix.
//!     - Does not support -e12 as a shorthand for "-e 12". Values MUST be
//!       whitespace-separated from the option it is for.
//!
//! @param[in] argc The number of arguments passed to main (including the
//!            file name, which is disregarded)
//! @param[in] argv The arguments passed to main (including the file name,
//!            which is disregarded)
//! @param[in] options List of TRTOptions to parse
//! @return TRTParsedArgs. See TRTParsedArgs documentation for descriptions of
//!         the fields.
TRTParsedArgs getOptions(int argc, const char* const* argv, const std::vector<TRTOption>& options);
} // namespace utility
} // namespace nvinfer1

#endif // TRT_GET_OPTIONS_H
diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/getopt.c b/Code/TestTRTInterDll/trtinfer_lib/common/getopt.c
new file mode 100644
index 0000000..c1da08b
--- /dev/null
+++ b/Code/TestTRTInterDll/trtinfer_lib/common/getopt.c
@@ -0,0 +1,568 @@
/*	$OpenBSD: getopt_long.c,v 1.23 2007/10/31 12:34:57 chl Exp $	*/
/*	$NetBSD: getopt_long.c,v 1.15 2002/01/31 22:43:40 tv Exp $	*/

/*
 * Copyright (c) 2002 Todd C. Miller
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Sponsored in part by the Defense Advanced Research Projects
 * Agency (DARPA) and Air Force Research Laboratory, Air Force
 * Materiel Command, USAF, under agreement number F39502-99-1-0512.
 */
/*-
 * Copyright (c) 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Dieter Baron and Thomas Klausner.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "getoptWin.h"

/* NOTE(review): the six system headers below were garbled in extraction and have been
 * reconstructed from usage (errno, vfprintf/fprintf, getenv, strchr/strncmp/strlen,
 * va_list, Windows __argv) — confirm against the upstream file. */
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>

#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */

#ifdef REPLACE_GETOPT
int opterr = 1;   /* if error message should be printed */
int optind = 1;   /* index into parent argv vector */
int optopt = '?'; /* character checked for validity */
#undef optreset   /* see getopt.h */
#define optreset __mingw_optreset
int optreset; /* reset getopt */
char* optarg; /* argument associated with option */
#endif

#define PRINT_ERROR ((opterr) && (*options != ':'))

#define FLAG_PERMUTE 0x01  /* permute non-options to the end of argv */
#define FLAG_ALLARGS 0x02  /* treat non-options as args to option "-1" */
#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */

/* return values */
#define BADCH (int) '?'
#define BADARG ((*options == ':') ? (int) ':' : (int) '?')
#define INORDER (int) 1

#ifndef __CYGWIN__
#define __progname __argv[0]
#else
extern char __declspec(dllimport) * __progname;
#endif

#ifdef __CYGWIN__
static char EMSG[] = "";
#else
#define EMSG ""
#endif

static int getopt_internal(int, char* const*, char const*, const struct option*, int*, int);
static int parse_long_options(char* const*, char const*, const struct option*, int*, int);
static int gcd(int, int);
static void permute_args(int, int, int, char* const*);

static char* place = EMSG; /* option letter processing */

/* XXX: set optreset to 1 rather than these two */
static int nonopt_start = -1; /* first non option argument (for permute) */
static int nonopt_end = -1;   /* first option after non options (for permute) */

/* Error messages */
static char const recargchar[] = "option requires an argument -- %c";
static char const recargstring[] = "option requires an argument -- %s";
static char const ambig[] = "ambiguous option -- %.*s";
static char const noarg[] = "option doesn't take an argument -- %.*s";
static char const illoptchar[] = "unknown option -- %c";
static char const illoptstring[] = "unknown option -- %s";

/* Print "<progname>: <formatted message>\n" to stderr (va_list form). */
static void _vwarnx(char const* fmt, va_list ap)
{
    (void) fprintf(stderr, "%s: ", __progname);
    if (fmt != NULL)
        (void) vfprintf(stderr, fmt, ap);
    (void) fprintf(stderr, "\n");
}

/* Print "<progname>: <formatted message>\n" to stderr (varargs form). */
static void warnx(char const* fmt, ...)
{
    va_list ap;
    va_start(ap, fmt);
    _vwarnx(fmt, ap);
    va_end(ap);
}

/*
 * Compute the greatest common divisor of a and b.
 */
static int gcd(int a, int b)
{
    int c;

    c = a % b;
    while (c != 0)
    {
        a = b;
        b = c;
        c = a % b;
    }

    return (b);
}

/*
 * Exchange the block from nonopt_start to nonopt_end with the block
 * from nonopt_end to opt_end (keeping the same order of arguments
 * in each block).
 */
static void permute_args(int panonopt_start, int panonopt_end, int opt_end, char* const* nargv)
{
    int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
    char* swap;

    /*
     * compute lengths of blocks and number and size of cycles
     */
    nnonopts = panonopt_end - panonopt_start;
    nopts = opt_end - panonopt_end;
    ncycle = gcd(nnonopts, nopts);
    cyclelen = (opt_end - panonopt_start) / ncycle;

    for (i = 0; i < ncycle; i++)
    {
        cstart = panonopt_end + i;
        pos = cstart;
        for (j = 0; j < cyclelen; j++)
        {
            if (pos >= panonopt_end)
                pos -= nnonopts;
            else
                pos += nopts;
            swap = nargv[pos];
            /* LINTED const cast */
            ((char**) nargv)[pos] = nargv[cstart];
            /* LINTED const cast */
            ((char**) nargv)[cstart] = swap;
        }
    }
}

/*
 * parse_long_options --
 *	Parse long options in argc/argv argument vector.
 * Returns -1 if short_too is set and the option does not match long_options.
 */
static int parse_long_options(
    char* const* nargv, char const* options, const struct option* long_options, int* idx, int short_too)
{
    char *current_argv, *has_equal;
    size_t current_argv_len;
    int i, ambiguous, match;

#define IDENTICAL_INTERPRETATION(_x, _y)                                                                               \
    (long_options[(_x)].has_arg == long_options[(_y)].has_arg && long_options[(_x)].flag == long_options[(_y)].flag    \
        && long_options[(_x)].val == long_options[(_y)].val)

    current_argv = place;
    match = -1;
    ambiguous = 0;

    optind++;

    if ((has_equal = strchr(current_argv, '=')) != NULL)
    {
        /* argument found (--option=arg) */
        current_argv_len = has_equal - current_argv;
        has_equal++;
    }
    else
        current_argv_len = strlen(current_argv);

    for (i = 0; long_options[i].name; i++)
    {
        /* find matching long option */
        if (strncmp(current_argv, long_options[i].name, current_argv_len))
            continue;

        if (strlen(long_options[i].name) == current_argv_len)
        {
            /* exact match */
            match = i;
            ambiguous = 0;
            break;
        }
        /*
         * If this is a known short option, don't allow
         * a partial match of a single character.
         */
        if (short_too && current_argv_len == 1)
            continue;

        if (match == -1) /* partial match */
            match = i;
        else if (!IDENTICAL_INTERPRETATION(i, match))
            ambiguous = 1;
    }
    if (ambiguous)
    {
        /* ambiguous abbreviation */
        if (PRINT_ERROR)
            warnx(ambig, (int) current_argv_len, current_argv);
        optopt = 0;
        return (BADCH);
    }
    if (match != -1)
    { /* option found */
        if (long_options[match].has_arg == no_argument && has_equal)
        {
            if (PRINT_ERROR)
                warnx(noarg, (int) current_argv_len, current_argv);
            /*
             * XXX: GNU sets optopt to val regardless of flag
             */
            if (long_options[match].flag == NULL)
                optopt = long_options[match].val;
            else
                optopt = 0;
            return (BADARG);
        }
        if (long_options[match].has_arg == required_argument || long_options[match].has_arg == optional_argument)
        {
            if (has_equal)
                optarg = has_equal;
            else if (long_options[match].has_arg == required_argument)
            {
                /*
                 * optional argument doesn't use next nargv
                 */
                optarg = nargv[optind++];
            }
        }
        if ((long_options[match].has_arg == required_argument) && (optarg == NULL))
        {
            /*
             * Missing argument; leading ':' indicates no error
             * should be generated.
             */
            if (PRINT_ERROR)
                warnx(recargstring, current_argv);
            /*
             * XXX: GNU sets optopt to val regardless of flag
             */
            if (long_options[match].flag == NULL)
                optopt = long_options[match].val;
            else
                optopt = 0;
            --optind;
            return (BADARG);
        }
    }
    else
    { /* unknown option */
        if (short_too)
        {
            --optind;
            return (-1);
        }
        if (PRINT_ERROR)
            warnx(illoptstring, current_argv);
        optopt = 0;
        return (BADCH);
    }
    if (idx)
        *idx = match;
    if (long_options[match].flag)
    {
        *long_options[match].flag = long_options[match].val;
        return (0);
    }
    else
        return (long_options[match].val);
#undef IDENTICAL_INTERPRETATION
}

/*
 * getopt_internal --
 *	Parse argc/argv argument vector.  Called by user level routines.
 */
/*
 * Core scanner shared by getopt(), getopt_long() and getopt_long_only().
 * 'flags' selects GNU-style behaviour: FLAG_PERMUTE (reorder argv so
 * non-options end up last), FLAG_ALLARGS (return non-options as argument
 * of option 1), FLAG_LONGONLY (accept "-foo" as a long option).
 * State is carried between calls in the optind/optarg/optreset/place
 * globals and the file-scope nonopt_start/nonopt_end bookkeeping.
 */
static int getopt_internal(
    int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx, int flags)
{
    char const* oli; /* option letter list index */
    int optchar, short_too;
    /* cached across calls; re-read from the environment when optreset is set */
    static int posixly_correct = -1;

    if (options == NULL)
        return (-1);

    /*
     * XXX Some GNU programs (like cvs) set optind to 0 instead of
     * XXX using optreset. Work around this braindamage.
     */
    if (optind == 0)
        optind = optreset = 1;

    /*
     * Disable GNU extensions if POSIXLY_CORRECT is set or options
     * string begins with a '+'.
     *
     * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or
     * optreset != 0 for GNU compatibility.
     */
    if (posixly_correct == -1 || optreset != 0)
        posixly_correct = (getenv("POSIXLY_CORRECT") != NULL);
    if (*options == '-')
        flags |= FLAG_ALLARGS;
    else if (posixly_correct || *options == '+')
        flags &= ~FLAG_PERMUTE;
    if (*options == '+' || *options == '-')
        options++;

    optarg = NULL;
    if (optreset)
        nonopt_start = nonopt_end = -1;
start:
    if (optreset || !*place)
    { /* update scanning pointer */
        optreset = 0;
        if (optind >= nargc)
        { /* end of argument vector */
            place = EMSG;
            if (nonopt_end != -1)
            {
                /* do permutation, if we have to */
                permute_args(nonopt_start, nonopt_end, optind, nargv);
                optind -= nonopt_end - nonopt_start;
            }
            else if (nonopt_start != -1)
            {
                /*
                 * If we skipped non-options, set optind
                 * to the first of them.
                 */
                optind = nonopt_start;
            }
            nonopt_start = nonopt_end = -1;
            return (-1);
        }
        if (*(place = nargv[optind]) != '-' || (place[1] == '\0' && strchr(options, '-') == NULL))
        {
            place = EMSG; /* found non-option */
            if (flags & FLAG_ALLARGS)
            {
                /*
                 * GNU extension:
                 * return non-option as argument to option 1
                 */
                optarg = nargv[optind++];
                return (INORDER);
            }
            if (!(flags & FLAG_PERMUTE))
            {
                /*
                 * If no permutation wanted, stop parsing
                 * at first non-option.
                 */
                return (-1);
            }
            /* do permutation */
            if (nonopt_start == -1)
                nonopt_start = optind;
            else if (nonopt_end != -1)
            {
                permute_args(nonopt_start, nonopt_end, optind, nargv);
                nonopt_start = optind - (nonopt_end - nonopt_start);
                nonopt_end = -1;
            }
            optind++;
            /* process next argument */
            goto start;
        }
        if (nonopt_start != -1 && nonopt_end == -1)
            nonopt_end = optind;

        /*
         * If we have "-" do nothing, if "--" we are done.
         */
        if (place[1] != '\0' && *++place == '-' && place[1] == '\0')
        {
            optind++;
            place = EMSG;
            /*
             * We found an option (--), so if we skipped
             * non-options, we have to permute.
             */
            if (nonopt_end != -1)
            {
                permute_args(nonopt_start, nonopt_end, optind, nargv);
                optind -= nonopt_end - nonopt_start;
            }
            nonopt_start = nonopt_end = -1;
            return (-1);
        }
    }

    /*
     * Check long options if:
     *  1) we were passed some
     *  2) the arg is not just "-"
     *  3) either the arg starts with -- we are getopt_long_only()
     */
    if (long_options != NULL && place != nargv[optind] && (*place == '-' || (flags & FLAG_LONGONLY)))
    {
        short_too = 0;
        if (*place == '-')
            place++; /* --foo long option */
        else if (*place != ':' && strchr(options, *place) != NULL)
            short_too = 1; /* could be short option too */

        optchar = parse_long_options(nargv, options, long_options, idx, short_too);
        if (optchar != -1)
        {
            place = EMSG;
            return (optchar);
        }
    }

    if ((optchar = (int) *place++) == (int) ':' || (optchar == (int) '-' && *place != '\0')
        || (oli = strchr(options, optchar)) == NULL)
    {
        /*
         * If the user specified "-" and '-' isn't listed in
         * options, return -1 (non-option) as per POSIX.
         * Otherwise, it is an unknown option character (or ':').
         */
        if (optchar == (int) '-' && *place == '\0')
            return (-1);
        if (!*place)
            ++optind;
        if (PRINT_ERROR)
            warnx(illoptchar, optchar);
        optopt = optchar;
        return (BADCH);
    }
    if (long_options != NULL && optchar == 'W' && oli[1] == ';')
    {
        /* -W long-option  (GNU "W;" extension: -W foo == --foo) */
        if (*place) /* no space */
            /* NOTHING */;
        else if (++optind >= nargc)
        { /* no arg */
            place = EMSG;
            if (PRINT_ERROR)
                warnx(recargchar, optchar);
            optopt = optchar;
            return (BADARG);
        }
        else /* white space */
            place = nargv[optind];
        optchar = parse_long_options(nargv, options, long_options, idx, 0);
        place = EMSG;
        return (optchar);
    }
    if (*++oli != ':')
    { /* doesn't take argument */
        if (!*place)
            ++optind;
    }
    else
    { /* takes (optional) argument */
        optarg = NULL;
        if (*place) /* no white space */
            optarg = place;
        else if (oli[1] != ':')
        { /* arg not optional */
            if (++optind >= nargc)
            { /* no arg */
                place = EMSG;
                if (PRINT_ERROR)
                    warnx(recargchar, optchar);
                optopt = optchar;
                return (BADARG);
            }
            else
                optarg = nargv[optind];
        }
        place = EMSG;
        ++optind;
    }
    /* dump back option letter */
    return (optchar);
}

#ifdef REPLACE_GETOPT
/*
 * getopt --
 *	Parse argc/argv argument vector.
 *
 * [eventually this will replace the BSD getopt]
 */
int getopt(int nargc, char* const* nargv, char const* options)
{

    /*
     * We don't pass FLAG_PERMUTE to getopt_internal() since
     * the BSD getopt(3) (unlike GNU) has never done this.
     *
     * Furthermore, since many privileged programs call getopt()
     * before dropping privileges it makes sense to keep things
     * as simple (and bug-free) as possible.
     */
    return (getopt_internal(nargc, nargv, options, NULL, NULL, 0));
}
#endif /* REPLACE_GETOPT */

/*
 * getopt_long --
 * Parse argc/argv argument vector.
 */
/* Thin wrapper: long-option parsing with GNU-style argv permutation. */
int getopt_long(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx)
{

    return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE));
}

/*
 * getopt_long_only --
 * Parse argc/argv argument vector.
 */
/* As getopt_long(), but also accepts single-dash "-foo" as a long option. */
int getopt_long_only(int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx)
{

    return (getopt_internal(nargc, nargv, options, long_options, idx, FLAG_PERMUTE | FLAG_LONGONLY));
}
diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/getoptWin.h b/Code/TestTRTInterDll/trtinfer_lib/common/getoptWin.h
new file mode 100644
index 0000000..7e1cf1b
--- /dev/null
+++ b/Code/TestTRTInterDll/trtinfer_lib/common/getoptWin.h
@@ -0,0 +1,124 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __GETOPT_H__
/**
 * DISCLAIMER
 * This file has no copyright assigned and is placed in the Public Domain.
 * This file is a part of the w64 mingw-runtime package.
 *
 * The w64 mingw-runtime package and its code is distributed in the hope that it
 * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR
 * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to
 * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

#define __GETOPT_H__

/* All the headers include this file. */
/* NOTE(review): the include target was lost in extraction — the upstream
 * mingw getopt.h includes <crtdefs.h> here; confirm against the original. */
#include

/* Export/import decoration when this getopt port is built as a DLL;
 * expands to nothing for a static build. */
#if defined(WINGETOPT_SHARED_LIB)
#if defined(BUILDING_WINGETOPT_DLL)
#define WINGETOPT_API __declspec(dllexport)
#else
#define WINGETOPT_API __declspec(dllimport)
#endif
#else
#define WINGETOPT_API
#endif

#ifdef __cplusplus
extern "C"
{
#endif

    WINGETOPT_API extern int optind; /* index of first non-option in argv */
    WINGETOPT_API extern int optopt; /* single option character, as parsed */
    WINGETOPT_API extern int opterr; /* flag to enable built-in diagnostics... */
    /* (user may set to zero, to suppress) */

    WINGETOPT_API extern char* optarg; /* pointer to argument of current option */

    extern int getopt(int nargc, char* const* nargv, char const* options);

#ifdef _BSD_SOURCE
/*
 * BSD adds the non-standard `optreset' feature, for reinitialisation
 * of `getopt' parsing. We support this feature, for applications which
 * proclaim their BSD heritage, before including this header; however,
 * to maintain portability, developers are advised to avoid it.
 */
#define optreset __mingw_optreset
    extern int optreset;
#endif
#ifdef __cplusplus
}
#endif
/*
 * POSIX requires the `getopt' API to be specified in `unistd.h';
 * thus, `unistd.h' includes this header. However, we do not want
 * to expose the `getopt_long' or `getopt_long_only' APIs, when
 * included in this manner. Thus, close the standard __GETOPT_H__
 * declarations block, and open an additional __GETOPT_LONG_H__
 * specific block, only when *not* __UNISTD_H_SOURCED__, in which
 * to declare the extended API.
 */
#endif /* !defined(__GETOPT_H__) */

#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__)
#define __GETOPT_LONG_H__

#ifdef __cplusplus
extern "C"
{
#endif

    struct option /* specification for a long form option... */
    {
        char const* name; /* option name, without leading hyphens */
        int has_arg;      /* does it take an argument? */
        int* flag;        /* where to save its status, or NULL */
        int val;          /* its associated status value */
    };

    enum /* permitted values for its `has_arg' field... */
    {
        no_argument = 0,   /* option never takes an argument */
        required_argument, /* option always requires an argument */
        optional_argument  /* option may take an argument */
    };

    extern int getopt_long(
        int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx);
    extern int getopt_long_only(
        int nargc, char* const* nargv, char const* options, const struct option* long_options, int* idx);
/*
 * Previous MinGW implementation had...
 */
#ifndef HAVE_DECL_GETOPT
/*
 * ...for the long form API only; keep this for compatibility.
 */
#define HAVE_DECL_GETOPT 1
#endif

#ifdef __cplusplus
}
#endif

#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */
diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/half.h b/Code/TestTRTInterDll/trtinfer_lib/common/half.h
new file mode 100644
index 0000000..5513e74
--- /dev/null
+++ b/Code/TestTRTInterDll/trtinfer_lib/common/half.h
@@ -0,0 +1,4303 @@
// half - IEEE 754-based half-precision floating point library.
//
// Copyright (c) 2012-2017 Christian Rau
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
// Software.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Version 1.12.0 + +/// \file +/// Main header file for half precision functionality. + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +/// Combined gcc version number. 
+#define HALF_GNUC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// check C++11 language features +#if defined(__clang__) // clang +#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +/*#elif defined(__INTEL_COMPILER) //Intel C++ + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? + #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? + #define HALF_ENABLE_CPP11_CONSTEXPR 1 + #endif + #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? + #define HALF_ENABLE_CPP11_NOEXCEPT 1 + #endif + #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
+ #define HALF_ENABLE_CPP11_LONG_LONG 1 + #endif*/ +#elif defined(__GNUC__) // gcc +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#elif defined(_MSC_VER) // Visual C++ +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_POP_WARNINGS 1 +#pragma warning(push) +#pragma warning(disable : 4099 4127 4146) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined(_LIBCPP_VERSION) // libc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#elif defined(__GLIBCXX__) 
// libstdc++ +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#else +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#endif +#elif defined(_CPPLIB_VER) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#endif +#if _CPPLIB_VER >= 610 +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#endif +#endif +#undef HALF_GNUC_VERSION + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#else +#define HALF_CONSTEXPR +#define HALF_CONSTEXPR_CONST const +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +/// Default rounding 
mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as +/// well as for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including +/// half.hpp) to one of the standard rounding modes using their respective constants or the equivalent values of +/// `std::float_round_style`: +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest (default) +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with +/// overflows set to infinity) and is the fastest rounding mode possible. It can even be set to +/// `std::numeric_limits::round_style` to synchronize the rounding mode with that of the underlying +/// single-precision implementation. +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Tie-breaking behaviour for round to nearest. +/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this +/// is defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way +/// cases (and thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more +/// IEEE-conformant behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN +#define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. 
+/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow +/// of an operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ +class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS +/// Library-defined half-precision literals. +/// Import this namespace to enable half-precision floating point literals: +/// ~~~~{.cpp} +/// using namespace half_float::literal; +/// half_float::half = 4.2_h; +/// ~~~~ +namespace literal +{ +half operator"" _h(long double); +} +#endif + +/// \internal +/// \brief Implementation details. +namespace detail +{ +#if HALF_ENABLE_CPP11_TYPE_TRAITS +/// Conditional type. +template +struct conditional : std::conditional +{ +}; + +/// Helper for tag dispatching. +template +struct bool_type : std::integral_constant +{ +}; +using std::false_type; +using std::true_type; + +/// Type traits for floating point types. +template +struct is_float : std::is_floating_point +{ +}; +#else +/// Conditional type. 
+template +struct conditional +{ + typedef T type; +}; +template +struct conditional +{ + typedef F type; +}; + +/// Helper for tag dispatching. +template +struct bool_type +{ +}; +typedef bool_type true_type; +typedef bool_type false_type; + +/// Type traits for floating point types. +template +struct is_float : false_type +{ +}; +template +struct is_float : is_float +{ +}; +template +struct is_float : is_float +{ +}; +template +struct is_float : is_float +{ +}; +template <> +struct is_float : true_type +{ +}; +template <> +struct is_float : true_type +{ +}; +template <> +struct is_float : true_type +{ +}; +#endif + +/// Type traits for floating point bits. +template +struct bits +{ + typedef unsigned char type; +}; +template +struct bits : bits +{ +}; +template +struct bits : bits +{ +}; +template +struct bits : bits +{ +}; + +#if HALF_ENABLE_CPP11_CSTDINT +/// Unsigned integer of (at least) 16 bits width. +typedef std::uint_least16_t uint16; + +/// Unsigned integer of (at least) 32 bits width. +template <> +struct bits +{ + typedef std::uint_least32_t type; +}; + +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits +{ + typedef std::uint_least64_t type; +}; +#else +/// Unsigned integer of (at least) 16 bits width. +typedef unsigned short uint16; + +/// Unsigned integer of (at least) 32 bits width. +template <> +struct bits : conditional::digits >= 32, unsigned int, unsigned long> +{ +}; + +#if HALF_ENABLE_CPP11_LONG_LONG +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits : conditional::digits >= 64, unsigned long, unsigned long long> +{ +}; +#else +/// Unsigned integer of (at least) 64 bits width. +template <> +struct bits +{ + typedef unsigned long type; +}; +#endif +#endif + +/// Tag type for binary construction. +struct binary_t +{ +}; + +/// Tag for binary construction. +HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + +/// Temporary half-precision expression. 
+/// This class represents a half-precision expression which just stores a single-precision value internally. +struct expr +{ + /// Conversion constructor. + /// \param f single-precision value to convert + explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + HALF_CONSTEXPR operator float() const HALF_NOEXCEPT + { + return value_; + } + +private: + /// Internal expression value stored in single-precision. + float value_; +}; + +/// SFINAE helper for generic half-precision functions. +/// This class template has to be specialized for each valid combination of argument types to provide a corresponding +/// `type` member equivalent to \a T. +/// \tparam T type to return +template +struct enable +{ +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; +template +struct enable +{ + typedef T type; +}; + +/// Return type for specialized generic 2-argument half-precision functions. +/// This class template has to be specialized for each valid combination of argument types to provide a corresponding +/// `type` member denoting the appropriate return type. 
+/// \tparam T first argument type +/// \tparam U first argument type +template +struct result : enable +{ +}; +template <> +struct result +{ + typedef half type; +}; + +/// \name Classification helpers +/// \{ + +/// Check for infinity. +/// \tparam T argument type (builtin floating point type) +/// \param arg value to query +/// \retval true if infinity +/// \retval false else +template +bool builtin_isinf(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::isinf(arg); +#elif defined(_MSC_VER) + return !::_finite(static_cast(arg)) && !::_isnan(static_cast(arg)); +#else + return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); +#endif +} + +/// Check for NaN. +/// \tparam T argument type (builtin floating point type) +/// \param arg value to query +/// \retval true if not a number +/// \retval false else +template +bool builtin_isnan(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::isnan(arg); +#elif defined(_MSC_VER) + return ::_isnan(static_cast(arg)) != 0; +#else + return arg != arg; +#endif +} + +/// Check sign. +/// \tparam T argument type (builtin floating point type) +/// \param arg value to query +/// \retval true if signbit set +/// \retval false else +template +bool builtin_signbit(T arg) +{ +#if HALF_ENABLE_CPP11_CMATH + return std::signbit(arg); +#else + return arg < T() || (arg == T() && T(1) / arg < T()); +#endif +} + +/// \} +/// \name Conversion +/// \{ + +/// Convert IEEE single-precision to half-precision. +/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \param value single-precision value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(float value, true_type) +{ + typedef bits::type uint32; + uint32 bits; // = *reinterpret_cast(&value); //violating strict aliasing! 
+ std::memcpy(&bits, &value, sizeof(float)); + /* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + */ + static const uint16 base_table[512] = {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, + 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 
0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, + 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, + 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8000, 0x8000, + 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, + 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 0xC000, 0xC400, 0xC800, + 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00}; + static const unsigned char shift_table[512] = {24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13}; + uint16 hbits = base_table[bits >> 23] + static_cast((bits & 0x7FFFFF) >> shift_table[bits >> 23]); + if (R == std::round_to_nearest) + hbits += (((bits & 0x7FFFFF) >> (shift_table[bits >> 23] - 1)) | (((bits >> 23) & 0xFF) == 102)) + & ((hbits & 0x7C00) != 0x7C00) +#if HALF_ROUND_TIES_TO_EVEN + & (((((static_cast(1) << (shift_table[bits >> 23] - 1)) - 1) & bits) != 0) | hbits) +#endif + ; + else if (R == std::round_toward_zero) + hbits -= ((hbits & 0x7FFF) == 0x7C00) & ~shift_table[bits >> 23]; + else if (R == std::round_toward_infinity) + hbits += ((((bits & 0x7FFFFF & ((static_cast(1) << 
(shift_table[bits >> 23])) - 1)) != 0) + | (((bits >> 23) <= 102) & ((bits >> 23) != 0))) + & (hbits < 0x7C00)) + - ((hbits == 0xFC00) & ((bits >> 23) != 511)); + else if (R == std::round_toward_neg_infinity) + hbits += ((((bits & 0x7FFFFF & ((static_cast(1) << (shift_table[bits >> 23])) - 1)) != 0) + | (((bits >> 23) <= 358) & ((bits >> 23) != 256))) + & (hbits < 0xFC00) & (hbits >> 15)) + - ((hbits == 0x7C00) & ((bits >> 23) != 255)); + return hbits; +} + +/// Convert IEEE double-precision to half-precision. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \param value double-precision value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(double value, true_type) +{ + typedef bits::type uint32; + typedef bits::type uint64; + uint64 bits; // = *reinterpret_cast(&value); //violating strict aliasing! + std::memcpy(&bits, &value, sizeof(double)); + uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF; + uint16 hbits = (hi >> 16) & 0x8000; + hi &= 0x7FFFFFFF; + int exp = hi >> 20; + if (exp == 2047) + return hbits | 0x7C00 | (0x3FF & -static_cast((bits & 0xFFFFFFFFFFFFF) != 0)); + if (exp > 1038) + { + if (R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits >> 15); + if (R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits >> 15); + return hbits | 0x7BFF + (R != std::round_toward_zero); + } + int g, s = lo != 0; + if (exp > 1008) + { + g = (hi >> 9) & 1; + s |= (hi & 0x1FF) != 0; + hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF); + } + else if (exp > 997) + { + int i = 1018 - exp; + hi = (hi & 0xFFFFF) | 0x100000; + g = (hi >> i) & 1; + s |= (hi & ((1L << i) - 1)) != 0; + hbits |= hi >> (i + 1); + } + else + { + g = 0; + s |= hi != 0; + } + if (R == std::round_to_nearest) +#if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s | hbits); +#else + hbits += g; +#endif + else if (R == std::round_toward_infinity) + hbits += ~(hbits >> 15) & (s | g); + else if (R == 
std::round_toward_neg_infinity) + hbits += (hbits >> 15) & (g | s); + return hbits; +} + +/// Convert non-IEEE floating point to half-precision. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam T source type (builtin floating point type) +/// \param value floating point value +/// \return binary representation of half-precision value +template +uint16 float2half_impl(T value, ...) +{ + uint16 hbits = static_cast(builtin_signbit(value)) << 15; + if (value == T()) + return hbits; + if (builtin_isnan(value)) + return hbits | 0x7FFF; + if (builtin_isinf(value)) + return hbits | 0x7C00; + int exp; + std::frexp(value, &exp); + if (exp > 16) + { + if (R == std::round_toward_infinity) + return hbits | (0x7C00 - (hbits >> 15)); + else if (R == std::round_toward_neg_infinity) + return hbits | (0x7BFF + (hbits >> 15)); + return hbits | (0x7BFF + (R != std::round_toward_zero)); + } + if (exp < -13) + value = std::ldexp(value, 24); + else + { + value = std::ldexp(value, 11 - exp); + hbits |= ((exp + 13) << 10); + } + T ival, frac = std::modf(value, &ival); + hbits += static_cast(std::abs(static_cast(ival))); + if (R == std::round_to_nearest) + { + frac = std::abs(frac); +#if HALF_ROUND_TIES_TO_EVEN + hbits += (frac > T(0.5)) | ((frac == T(0.5)) & hbits); +#else + hbits += frac >= T(0.5); +#endif + } + else if (R == std::round_toward_infinity) + hbits += frac > T(); + else if (R == std::round_toward_neg_infinity) + hbits += frac < T(); + return hbits; +} + +/// Convert floating point to half-precision. 
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam T source type (builtin floating point type) +/// \param value floating point value +/// \return binary representation of half-precision value +template +uint16 float2half(T value) +{ + return float2half_impl( + value, bool_type < std::numeric_limits::is_iec559 && sizeof(typename bits::type) == sizeof(T) > ()); +} + +/// Convert integer to half-precision floating point. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam S `true` if value negative, `false` else +/// \tparam T type to convert (builtin integer type) +/// \param value non-negative integral value +/// \return binary representation of half-precision value +template +uint16 int2half_impl(T value) +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_integral::value, "int to half conversion only supports builtin integer types"); +#endif + if (S) + value = -value; + uint16 bits = S << 15; + if (value > 0xFFFF) + { + if (R == std::round_toward_infinity) + bits |= 0x7C00 - S; + else if (R == std::round_toward_neg_infinity) + bits |= 0x7BFF + S; + else + bits |= 0x7BFF + (R != std::round_toward_zero); + } + else if (value) + { + uint32_t m = value, exp = 24; + for (; m < 0x400; m <<= 1, --exp) + ; + for (; m > 0x7FF; m >>= 1, ++exp) + ; + bits |= (exp << 10) + m; + if (exp > 24) + { + if (R == std::round_to_nearest) + bits += (value >> (exp - 25)) & 1 +#if HALF_ROUND_TIES_TO_EVEN + & (((((1 << (exp - 25)) - 1) & value) != 0) | bits) +#endif + ; + else if (R == std::round_toward_infinity) + bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & !S; + else if (R == std::round_toward_neg_infinity) + bits += ((value & ((1 << (exp - 24)) - 1)) != 0) & S; + } + } + return bits; +} + +/// Convert integer to half-precision floating point. 
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam T type to convert (builtin integer type) +/// \param value integral value +/// \return binary representation of half-precision value +template +uint16 int2half(T value) +{ + return (value < 0) ? int2half_impl(value) : int2half_impl(value); +} + +/// Convert half-precision to IEEE single-precision. +/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). +/// \param value binary representation of half-precision value +/// \return single-precision value +inline float half2float_impl(uint16 value, float, true_type) +{ + typedef bits::type uint32; + /* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } + */ + static const uint32 mantissa_table[2048] = {0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, + 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, + 0x35700000, 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, + 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 0x36000000, + 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, + 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 0x36400000, 0x36440000, 0x36480000, + 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, + 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, + 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, + 0x369C0000, 0x369E0000, 0x36A00000, 0x36A20000, 0x36A40000, 
0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, + 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, + 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 0x36E00000, 0x36E20000, + 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, + 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, + 0x370D0000, 0x370E0000, 0x370F0000, 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, + 0x371F0000, 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 0x37300000, + 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 0x37400000, 0x37410000, 0x37420000, + 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, + 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, + 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, + 0x375E0000, 0x375F0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, + 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 
0x37760000, 0x37770000, 0x37780000, + 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, + 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, + 0x378E8000, 0x378F0000, 0x378F8000, 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, + 0x37978000, 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, + 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 0x37A00000, + 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, + 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 0x37A80000, 0x37A88000, 0x37A90000, + 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, + 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, + 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, + 0x37B70000, 0x37B78000, 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, + 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, + 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 0x37C80000, 0x37C88000, + 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 
0x37CC8000, 0x37CD0000, + 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, + 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, + 0x37D68000, 0x37D70000, 0x37D78000, 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, + 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, + 0x37DF8000, 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, + 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 0x37E80000, + 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, + 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 0x37F00000, 0x37F08000, 0x37F10000, + 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, + 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, + 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, + 0x37FF0000, 0x37FF8000, 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, + 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, + 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 0x38080000, 0x38084000, + 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, + 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, + 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 
0x380F0000, + 0x380F4000, 0x380F8000, 0x380FC000, 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, + 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, + 0x3813C000, 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, + 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 0x38180000, + 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, + 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 0x381C0000, 0x381C4000, 0x381C8000, + 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, + 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, + 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, + 0x38238000, 0x3823C000, 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, + 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, + 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 0x382C0000, 0x382C4000, + 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, + 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 0x38300000, 0x38304000, 0x38308000, 0x3830C000, + 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, + 0x38334000, 0x38338000, 0x3833C000, 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, + 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, + 
0x3837C000, 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, + 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 0x383C0000, + 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, + 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 0x38400000, 0x38404000, 0x38408000, + 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, + 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, + 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, + 0x38478000, 0x3847C000, 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, + 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, + 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 0x38500000, 0x38504000, + 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, + 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 0x38540000, 0x38544000, 0x38548000, 0x3854C000, + 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, + 0x38574000, 0x38578000, 0x3857C000, 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, + 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, + 0x385BC000, 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, + 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 0x38600000, + 0x38604000, 
0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, + 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 0x38640000, 0x38644000, 0x38648000, + 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, + 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, + 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, + 0x386B8000, 0x386BC000, 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, + 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, + 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 0x38740000, 0x38744000, + 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, + 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 0x38780000, 0x38784000, 0x38788000, 0x3878C000, + 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, + 0x387B4000, 0x387B8000, 0x387BC000, 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, + 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, + 0x387FC000, 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 0x38020000, + 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 0x38040000, 0x38042000, 0x38044000, + 0x38046000, 0x38048000, 
0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, + 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, + 0x3807C000, 0x3807E000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, + 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, + 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 0x380C0000, 0x380C2000, + 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, + 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, + 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, + 0x380FA000, 0x380FC000, 0x380FE000, 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, + 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, + 0x3811E000, 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 0x38140000, + 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 0x38160000, 0x38162000, 0x38164000, + 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, + 0x3818A000, 0x3818C000, 0x3818E000, 
0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, + 0x3819C000, 0x3819E000, 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, + 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, + 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 0x381E0000, 0x381E2000, + 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, + 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, + 0x3821A000, 0x3821C000, 0x3821E000, 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, + 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, + 0x3823E000, 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 0x38260000, + 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 0x38280000, 0x38282000, 0x38284000, + 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, + 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, + 0x382BC000, 0x382BE000, 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, + 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 
0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, + 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, + 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, + 0x3833A000, 0x3833C000, 0x3833E000, 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, + 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, + 0x3835E000, 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 0x38380000, + 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 0x383A0000, 0x383A2000, 0x383A4000, + 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, + 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, + 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, + 0x383DC000, 0x383DE000, 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, + 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, + 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 
0x3841C000, 0x3841E000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, + 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, + 0x3845A000, 0x3845C000, 0x3845E000, 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, + 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, + 0x3847E000, 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 0x384A0000, + 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, + 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 0x384C0000, 0x384C2000, 0x384C4000, + 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, + 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, + 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, + 0x384FC000, 0x384FE000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, + 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, + 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, + 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 0x38560000, 
0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, + 0x3857A000, 0x3857C000, 0x3857E000, 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, + 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, + 0x3859E000, 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, + 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 0x385C0000, + 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, + 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 0x385E0000, 0x385E2000, 0x385E4000, + 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, + 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, + 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, + 0x3861C000, 0x3861E000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, + 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, + 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, + 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, + 0x3869A000, 0x3869C000, 0x3869E000, 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 
0x386A8000, 0x386AA000, + 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, + 0x386BE000, 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, + 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 0x386E0000, + 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, + 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 0x38700000, 0x38702000, 0x38704000, + 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, + 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, + 0x3873C000, 0x3873E000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, + 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, + 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, + 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, + 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, + 0x387BA000, 0x387BC000, 0x387BE000, 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, + 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, + 0x387DE000, 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 
0x387EE000, + 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000}; + static const uint32 exponent_table[64] = {0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, + 0x07800000, 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, + 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 0x80000000, + 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 0x88000000, 0x88800000, 0x89000000, + 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, + 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000}; + static const unsigned short offset_table[64] = {0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024}; + uint32 bits = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + exponent_table[value >> 10]; + // return *reinterpret_cast(&bits); //violating strict aliasing! + float out; + std::memcpy(&out, &bits, sizeof(float)); + return out; +} + +/// Convert half-precision to IEEE double-precision. 
+/// \param value binary representation of half-precision value +/// \return double-precision value +inline double half2float_impl(uint16 value, double, true_type) +{ + typedef bits::type uint32; + typedef bits::type uint64; + uint32 hi = static_cast(value & 0x8000) << 16; + int abs = value & 0x7FFF; + if (abs) + { + hi |= 0x3F000000 << static_cast(abs >= 0x7C00); + for (; abs < 0x400; abs <<= 1, hi -= 0x100000) + ; + hi += static_cast(abs) << 10; + } + uint64 bits = static_cast(hi) << 32; + // return *reinterpret_cast(&bits); //violating strict aliasing! + double out; + std::memcpy(&out, &bits, sizeof(double)); + return out; +} + +/// Convert half-precision to non-IEEE floating point. +/// \tparam T type to convert to (builtin integer type) +/// \param value binary representation of half-precision value +/// \return floating point value +template +T half2float_impl(uint16 value, T, ...) +{ + T out; + int abs = value & 0x7FFF; + if (abs > 0x7C00) + out = std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : T(); + else if (abs == 0x7C00) + out = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : std::numeric_limits::max(); + else if (abs > 0x3FF) + out = std::ldexp(static_cast((abs & 0x3FF) | 0x400), (abs >> 10) - 25); + else + out = std::ldexp(static_cast(abs), -24); + return (value & 0x8000) ? -out : out; +} + +/// Convert half-precision to floating point. +/// \tparam T type to convert to (builtin integer type) +/// \param value binary representation of half-precision value +/// \return floating point value +template +T half2float(uint16 value) +{ + return half2float_impl( + value, T(), bool_type < std::numeric_limits::is_iec559 && sizeof(typename bits::type) == sizeof(T) > ()); +} + +/// Convert half-precision floating point to integer. 
+/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam E `true` for round to even, `false` for round away from zero +/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign +/// bits) \param value binary representation of half-precision value \return integral value +template +T half2int_impl(uint16 value) +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_integral::value, "half to int conversion only supports builtin integer types"); +#endif + uint32_t e = value & 0x7FFF; + if (e >= 0x7C00) + return (value & 0x8000) ? std::numeric_limits::min() : std::numeric_limits::max(); + if (e < 0x3800) + { + if (R == std::round_toward_infinity) + return T(~(value >> 15) & (e != 0)); + else if (R == std::round_toward_neg_infinity) + return -T(value > 0x8000); + return T(); + } + uint32_t m = (value & 0x3FF) | 0x400; + e >>= 10; + if (e < 25) + { + if (R == std::round_to_nearest) + m += (1 << (24 - e)) - (~(m >> (25 - e)) & E); + else if (R == std::round_toward_infinity) + m += ((value >> 15) - 1) & ((1 << (25 - e)) - 1U); + else if (R == std::round_toward_neg_infinity) + m += -(value >> 15) & ((1 << (25 - e)) - 1U); + m >>= 25 - e; + } + else + m <<= e - 25; + return (value & 0x8000) ? -static_cast(m) : static_cast(m); +} + +/// Convert half-precision floating point to integer. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign +/// bits) \param value binary representation of half-precision value \return integral value +template +T half2int(uint16 value) +{ + return half2int_impl(value); +} + +/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero. 
+/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign +/// bits) \param value binary representation of half-precision value \return integral value +template +T half2int_up(uint16 value) +{ + return half2int_impl(value); +} + +/// Round half-precision number to nearest integer value. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \tparam E `true` for round to even, `false` for round away from zero +/// \param value binary representation of half-precision value +/// \return half-precision bits for nearest integral value +template +uint16 round_half_impl(uint16 value) +{ + uint32_t e = value & 0x7FFF; + uint16 result = value; + if (e < 0x3C00) + { + result &= 0x8000; + if (R == std::round_to_nearest) + result |= 0x3C00U & -(e >= (0x3800 + E)); + else if (R == std::round_toward_infinity) + result |= 0x3C00U & -(~(value >> 15) & (e != 0)); + else if (R == std::round_toward_neg_infinity) + result |= 0x3C00U & -(value > 0x8000); + } + else if (e < 0x6400) + { + e = 25 - (e >> 10); + uint32_t mask = (1 << e) - 1; + if (R == std::round_to_nearest) + result += (1 << (e - 1)) - (~(result >> e) & E); + else if (R == std::round_toward_infinity) + result += mask & ((value >> 15) - 1); + else if (R == std::round_toward_neg_infinity) + result += mask & -(value >> 15); + result &= ~mask; + } + return result; +} + +/// Round half-precision number to nearest integer value. +/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding +/// \param value binary representation of half-precision value +/// \return half-precision bits for nearest integral value +template +uint16 round_half(uint16 value) +{ + return round_half_impl(value); +} + +/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero. 
+/// \param value binary representation of half-precision value +/// \return half-precision bits for nearest integral value +inline uint16 round_half_up(uint16 value) +{ + return round_half_impl(value); +} +/// \} + +struct functions; +template +struct unary_specialized; +template +struct binary_specialized; +template +struct half_caster; +} // namespace detail + +/// Half-precision floating point type. +/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and +/// conversions. It is implicitly convertible to single-precision floating point, which makes artihmetic expressions and +/// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations +/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to +/// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic +/// expressions are kept in single-precision as long as possible (while of course still maintaining a strong +/// half-precision type). +/// +/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and +/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which +/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the +/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not neccessarily have to be +/// of exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will +/// most probably not ivolve any additional "magic" or padding beyond the simple binary representation of the underlying +/// 16-bit IEEE number, even if not strictly guaranteed by the standard. 
But even then it only has an actual size of 16 +/// bits if your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the +/// case on nearly any reasonable platform. +/// +/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable +/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation. +class half +{ + friend struct detail::functions; + friend struct detail::unary_specialized; + friend struct detail::binary_specialized; + template + friend struct detail::half_caster; + friend class std::numeric_limits; +#if HALF_ENABLE_CPP11_HASH + friend struct std::hash; +#endif +#if HALF_ENABLE_CPP11_USER_LITERALS + friend half literal::operator"" _h(long double); +#endif + +public: + /// Default constructor. + /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics + /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics. + HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {} + + /// Copy constructor. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + half(detail::expr rhs) + : data_(detail::float2half(static_cast(rhs))) + { + } + + /// Conversion constructor. + /// \param rhs float to convert + explicit half(float rhs) + : data_(detail::float2half(rhs)) + { + } + + /// Conversion to single-precision. + /// \return single precision value representing expression value + operator float() const + { + return detail::half2float(data_); + } + + /// Assignment operator. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + /// \return reference to this half + half& operator=(detail::expr rhs) + { + return *this = static_cast(rhs); + } + + /// Arithmetic assignment. 
+ /// \tparam T type of concrete half expression + /// \param rhs half expression to add + /// \return reference to this half + template + typename detail::enable::type operator+=(T rhs) + { + return *this += static_cast(rhs); + } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to subtract + /// \return reference to this half + template + typename detail::enable::type operator-=(T rhs) + { + return *this -= static_cast(rhs); + } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to multiply with + /// \return reference to this half + template + typename detail::enable::type operator*=(T rhs) + { + return *this *= static_cast(rhs); + } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to divide by + /// \return reference to this half + template + typename detail::enable::type operator/=(T rhs) + { + return *this /= static_cast(rhs); + } + + /// Assignment operator. + /// \param rhs single-precision value to copy from + /// \return reference to this half + half& operator=(float rhs) + { + data_ = detail::float2half(rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to add + /// \return reference to this half + half& operator+=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) + rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to subtract + /// \return reference to this half + half& operator-=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) - rhs); + return *this; + } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + half& operator*=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) * rhs); + return *this; + } + + /// Arithmetic assignment. 
+ /// \param rhs single-precision value to divide by + /// \return reference to this half + half& operator/=(float rhs) + { + data_ = detail::float2half(detail::half2float(data_) / rhs); + return *this; + } + + /// Prefix increment. + /// \return incremented half value + half& operator++() + { + return *this += 1.0f; + } + + /// Prefix decrement. + /// \return decremented half value + half& operator--() + { + return *this -= 1.0f; + } + + /// Postfix increment. + /// \return non-incremented half value + half operator++(int) + { + half out(*this); + ++*this; + return out; + } + + /// Postfix decrement. + /// \return non-decremented half value + half operator--(int) + { + half out(*this); + --*this; + return out; + } + +private: + /// Rounding mode to use + static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + + /// Constructor. + /// \param bits binary representation to set half to + HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} + + /// Internal binary representation + detail::uint16 data_; +}; + +#if HALF_ENABLE_CPP11_USER_LITERALS +namespace literal +{ +/// Half literal. +/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due +/// to rather involved conversions. +/// \param value literal value +/// \return half with given value (if representable) +inline half operator"" _h(long double value) +{ + return half(detail::binary, detail::float2half(value)); +} +} // namespace literal +#endif + +namespace detail +{ +/// Wrapper implementing unspecialized half-precision functions. +struct functions +{ + /// Addition implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision sum stored in single-precision + static expr plus(float x, float y) + { + return expr(x + y); + } + + /// Subtraction implementation. 
+ /// \param x first operand + /// \param y second operand + /// \return Half-precision difference stored in single-precision + static expr minus(float x, float y) + { + return expr(x - y); + } + + /// Multiplication implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision product stored in single-precision + static expr multiplies(float x, float y) + { + return expr(x * y); + } + + /// Division implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision quotient stored in single-precision + static expr divides(float x, float y) + { + return expr(x / y); + } + + /// Output implementation. + /// \param out stream to write to + /// \param arg value to write + /// \return reference to stream + template + static std::basic_ostream& write(std::basic_ostream& out, float arg) + { + return out << arg; + } + + /// Input implementation. + /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template + static std::basic_istream& read(std::basic_istream& in, half& arg) + { + float f; + if (in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr fmod(float x, float y) + { + return expr(std::fmod(x, y)); + } + + /// Remainder implementation. 
+ /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr remainder(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = std::fmod(ax, ay + ay); + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + if (ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); +#endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + static expr remquo(float x, float y, int* quo) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); +#else + if (builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast(sign ^ builtin_signbit(y)); + float ax = std::fabs(x), ay = std::fabs(y); + if (ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits::quiet_NaN()); + if (ay >= 65536.0f) + return expr(x); + if (ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = std::fmod(ax, 8.0f * ay); + int cquo = 0; + if (ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if (ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if (ax > y2) + { + ax -= ay; + ++cquo; + if (ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); +#endif + } + + /// Positive difference implementation. 
+ /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + static expr fdim(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); +#else + return expr((x <= y) ? 0.0f : (x - y)); +#endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + static expr fma(float x, float y, float z) + { +#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); +#else + return expr(x * y + z); +#endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + static half nanh() + { + return half(binary, 0x7FFF); + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp(float arg) + { + return expr(std::exp(arg)); + } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr expm1(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); +#else + return expr(static_cast(std::exp(static_cast(arg)) - 1.0)); +#endif + } + + /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr exp2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); +#else + return expr(static_cast(std::exp(arg * 0.69314718055994530941723212145818))); +#endif + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log(float arg) + { + return expr(std::log(arg)); + } + + /// Common logarithm implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr log10(float arg) + { + return expr(std::log10(arg)); + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log1p(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log1p(arg)); +#else + return expr(static_cast(std::log(1.0 + arg))); +#endif + } + + /// Binary logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr log2(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::log2(arg)); +#else + return expr(static_cast(std::log(static_cast(arg)) * 1.4426950408889634073599246810019)); +#endif + } + + /// Square root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sqrt(float arg) + { + return expr(std::sqrt(arg)); + } + + /// Cubic root implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cbrt(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::cbrt(arg)); +#else + if (builtin_isnan(arg) || builtin_isinf(arg)) + return expr(arg); + return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0 / 3.0)) + : static_cast(std::pow(static_cast(arg), 1.0 / 3.0))); +#endif + } + + /// Hypotenuse implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr hypot(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::hypot(x, y)); +#else + return expr((builtin_isinf(x) || builtin_isinf(y)) + ? std::numeric_limits::infinity() + : static_cast(std::sqrt(static_cast(x) * x + static_cast(y) * y))); +#endif + } + + /// Power implementation. 
+ /// \param base value to exponentiate + /// \param exp power to expontiate to + /// \return function value stored in single-preicision + static expr pow(float base, float exp) + { + return expr(std::pow(base, exp)); + } + + /// Sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sin(float arg) + { + return expr(std::sin(arg)); + } + + /// Cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr cos(float arg) + { + return expr(std::cos(arg)); + } + + /// Tan implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tan(float arg) + { + return expr(std::tan(arg)); + } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asin(float arg) + { + return expr(std::asin(arg)); + } + + /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acos(float arg) + { + return expr(std::acos(arg)); + } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atan(float arg) + { + return expr(std::atan(arg)); + } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-preicision + static expr atan2(float x, float y) + { + return expr(std::atan2(x, y)); + } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr sinh(float arg) + { + return expr(std::sinh(arg)); + } + + /// Hyperbolic cosine implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr cosh(float arg) + { + return expr(std::cosh(arg)); + } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tanh(float arg) + { + return expr(std::tanh(arg)); + } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr asinh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); +#else + return expr((arg == -std::numeric_limits::infinity()) + ? arg + : static_cast(std::log(arg + std::sqrt(arg * arg + 1.0)))); +#endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr acosh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); +#else + return expr((arg < -1.0f) ? std::numeric_limits::quiet_NaN() + : static_cast(std::log(arg + std::sqrt(arg * arg - 1.0)))); +#endif + } + + /// Hyperbolic area tangent implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr atanh(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); +#else + return expr(static_cast(0.5 * std::log((1.0 + arg) / (1.0 - arg)))); +#endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr erf(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); +#else + return expr(static_cast(erf(static_cast(arg)))); +#endif + } + + /// Complementary implementation. 
+ /// \param arg function argument + /// \return function value stored in single-preicision + static expr erfc(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); +#else + return expr(static_cast(1.0 - erf(static_cast(arg)))); +#endif + } + + /// Gamma logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr lgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); +#else + if (builtin_isinf(arg)) + return expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::infinity()); + return expr(static_cast(1.1447298858494001741434273513531 + - std::log(std::abs(std::sin(3.1415926535897932384626433832795 * f))) - lgamma(1.0 - arg))); + } + return expr(static_cast(lgamma(static_cast(arg)))); +#endif + } + + /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-preicision + static expr tgamma(float arg) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); +#else + if (arg == 0.0f) + return builtin_signbit(arg) ? expr(-std::numeric_limits::infinity()) + : expr(std::numeric_limits::infinity()); + if (arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if (f == 0.0f) + return expr(std::numeric_limits::quiet_NaN()); + double value = 3.1415926535897932384626433832795 + / (std::sin(3.1415926535897932384626433832795 * f) * std::exp(lgamma(1.0 - arg))); + return expr(static_cast((std::fmod(i, 2.0f) == 0.0f) ? -value : value)); + } + if (builtin_isinf(arg)) + return expr(arg); + return expr(static_cast(std::exp(lgamma(static_cast(arg))))); +#endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + static half floor(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Ceiling implementation. 
+ /// \param arg value to round + /// \return rounded value + static half ceil(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Truncation implementation. + /// \param arg value to round + /// \return rounded value + static half trunc(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half round(half arg) + { + return half(binary, round_half_up(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half rint(half arg) + { + return half(binary, round_half(arg.data_)); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lrint(half arg) + { + return detail::half2int(arg.data_); + } + +#if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llround(half arg) + { + return detail::half2int_up(arg.data_); + } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llrint(half arg) + { + return detail::half2int(arg.data_); + } +#endif + + /// Decompression implementation. + /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significant + static half frexp(half arg, int* exp) + { + int m = arg.data_ & 0x7FFF, e = -14; + if (m >= 0x7C00 || !m) + return *exp = 0, arg; + for (; m < 0x400; m <<= 1, --e) + ; + return *exp = e + (m >> 10), half(binary, (arg.data_ & 0x8000) | 0x3800 | (m & 0x3FF)); + } + + /// Decompression implementation. 
+ /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + static half modf(half arg, half* iptr) + { + uint32_t e = arg.data_ & 0x7FFF; + if (e >= 0x6400) + return *iptr = arg, half(binary, arg.data_ & (0x8000U | -(e > 0x7C00))); + if (e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + uint32_t mask = (1 << (25 - e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if (!m) + return half(binary, arg.data_ & 0x8000); + for (; m < 0x400; m <<= 1, --e) + ; + return half(binary, static_cast((arg.data_ & 0x8000) | (e << 10) | (m & 0x3FF))); + } + + /// Scaling implementation. + /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + static half scalbln(half arg, long exp) + { + uint32_t m = arg.data_ & 0x7FFF; + if (m >= 0x7C00 || !m) + return arg; + for (; m < 0x400; m <<= 1, --exp) + ; + exp += m >> 10; + uint16 value = arg.data_ & 0x8000; + if (exp > 30) + { + if (half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if (half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value >> 15); + else if (half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value >> 15); + else + value |= 0x7C00; + } + else if (exp > 0) + value |= (exp << 10) | (m & 0x3FF); + else if (exp > -11) + { + m = (m & 0x3FF) | 0x400; + if (half::round_style == std::round_to_nearest) + { + m += 1 << -exp; +#if HALF_ROUND_TIES_TO_EVEN + m -= (m >> (1 - exp)) & 1; +#endif + } + else if (half::round_style == std::round_toward_infinity) + m += ((value >> 15) - 1) & ((1 << (1 - exp)) - 1U); + else if (half::round_style == std::round_toward_neg_infinity) + m += -(value >> 15) & ((1 << (1 - exp)) - 1U); + value |= m >> (1 - exp); + } + else if (half::round_style == std::round_toward_infinity) + value -= (value >> 15) - 1; + else if (half::round_style == std::round_toward_neg_infinity) + value += value >> 15; 
+ return half(binary, value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static int ilogb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return FP_ILOGB0; + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + return exp; + } + if (abs > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static half logb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if (!abs) + return half(binary, 0xFC00); + if (abs < 0x7C00) + { + int exp = (abs >> 10) - 15; + if (abs < 0x400) + for (; abs < 0x200; abs <<= 1, --exp) + ; + uint16 bits = (exp < 0) << 15; + if (exp) + { + uint32_t m = std::abs(exp) << 6, e = 18; + for (; m < 0x400; m <<= 1, --e) + ; + bits |= (e << 10) + m; + } + return half(binary, bits); + } + if (abs > 0x7C00) + return arg; + return half(binary, 0x7C00); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if (fabs > 0x7C00) + return from; + if (tabs > 0x7C00 || from.data_ == to.data_ || !(fabs | tabs)) + return to; + if (!fabs) + return half(binary, (to.data_ & 0x8000) + 1); + bool lt = ((fabs == from.data_) ? static_cast(fabs) : -static_cast(fabs)) + < ((tabs == to.data_) ? static_cast(tabs) : -static_cast(tabs)); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lt)) << 1) - 1); + } + + /// Enumeration implementation. 
+ /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nexttoward(half from, long double to) + { + if (isnan(from)) + return from; + long double lfrom = static_cast(from); + if (builtin_isnan(to) || lfrom == to) + return half(static_cast(to)); + if (!(from.data_ & 0x7FFF)) + return half(binary, (static_cast(builtin_signbit(to)) << 15) + 1); + return half(binary, from.data_ + (((from.data_ >> 15) ^ static_cast(lfrom < to)) << 1) - 1); + } + + /// Sign implementation + /// \param x first operand + /// \param y second operand + /// \return composed value + static half copysign(half x, half y) + { + return half(binary, x.data_ ^ ((x.data_ ^ y.data_) & 0x8000)); + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static int fpclassify(half arg) + { + uint32_t abs = arg.data_ & 0x7FFF; + return abs + ? ((abs > 0x3FF) ? ((abs >= 0x7C00) ? ((abs > 0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) : FP_SUBNORMAL) + : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + static bool isfinite(half arg) + { + return (arg.data_ & 0x7C00) != 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static bool isinf(half arg) + { + return (arg.data_ & 0x7FFF) == 0x7C00; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + static bool isnan(half arg) + { + return (arg.data_ & 0x7FFF) > 0x7C00; + } + + /// Classification implementation. 
+ /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + static bool isnormal(half arg) + { + return ((arg.data_ & 0x7C00) != 0) & ((arg.data_ & 0x7C00) != 0x7C00); + } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + static bool signbit(half arg) + { + return (arg.data_ & 0x8000) != 0; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + static bool isequal(half x, half y) + { + return (x.data_ == y.data_ || !((x.data_ | y.data_) & 0x7FFF)) && !isnan(x); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + static bool isnotequal(half x, half y) + { + return (x.data_ != y.data_ && ((x.data_ | y.data_) & 0x7FFF)) || isnan(x); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + static bool isgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + static bool isgreaterequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) >= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. 
+ /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + static bool isless(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + static bool islessequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs <= 0x7C00 && yabs <= 0x7C00 + && (((xabs == x.data_) ? xabs : -xabs) <= ((yabs == y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if either \a x > \a y nor \a x < \a y + /// \retval false else + static bool islessgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00 || yabs > 0x7C00) + return false; + int a = (xabs == x.data_) ? xabs : -xabs, b = (yabs == y.data_) ? yabs : -yabs; + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + static bool isunordered(half x, half y) + { + return isnan(x) || isnan(y); + } + +private: + static double erf(double arg) + { + if (builtin_isinf(arg)) + return (arg < 0.0) ? -1.0 : 1.0; + double x2 = arg * arg, ax2 = 0.147 * x2, + value = std::sqrt(1.0 - std::exp(-x2 * (1.2732395447351626861510701069801 + ax2) / (1.0 + ax2))); + return builtin_signbit(arg) ? 
-value : value; + } + + static double lgamma(double arg) + { + double v = 1.0; + for (; arg < 8.0; ++arg) + v *= arg; + double w = 1.0 / (arg * arg); + return (((((((-0.02955065359477124183006535947712 * w + 0.00641025641025641025641025641026) * w + + -0.00191752691752691752691752691753) + * w + + 8.4175084175084175084175084175084e-4) + * w + + -5.952380952380952380952380952381e-4) + * w + + 7.9365079365079365079365079365079e-4) + * w + + -0.00277777777777777777777777777778) + * w + + 0.08333333333333333333333333333333) + / arg + + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg - 0.5) * std::log(arg); + } +}; + +/// Wrapper for unary half-precision functions needing specialization for individual argument types. +/// \tparam T argument type +template +struct unary_specialized +{ + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + static HALF_CONSTEXPR half negate(half arg) + { + return half(binary, arg.data_ ^ 0x8000); + } + + /// Absolute value implementation. + /// \param arg function argument + /// \return absolute value + static half fabs(half arg) + { + return half(binary, arg.data_ & 0x7FFF); + } +}; +template <> +struct unary_specialized +{ + static HALF_CONSTEXPR expr negate(float arg) + { + return expr(-arg); + } + static expr fabs(float arg) + { + return expr(std::fabs(arg)); + } +}; + +/// Wrapper for binary half-precision functions needing specialization for individual argument types. +/// \tparam T first argument type +/// \tparam U first argument type +template +struct binary_specialized +{ + /// Minimum implementation. + /// \param x first operand + /// \param y second operand + /// \return minimum value + static expr fmin(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmin(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::min(x, y)); +#endif + } + + /// Maximum implementation. 
+ /// \param x first operand + /// \param y second operand + /// \return maximum value + static expr fmax(float x, float y) + { +#if HALF_ENABLE_CPP11_CMATH + return expr(std::fmax(x, y)); +#else + if (builtin_isnan(x)) + return expr(y); + if (builtin_isnan(y)) + return expr(x); + return expr(std::max(x, y)); +#endif + } +}; +template <> +struct binary_specialized +{ + static half fmin(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) > ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } + static half fmax(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if (xabs > 0x7C00) + return y; + if (yabs > 0x7C00) + return x; + return (((xabs == x.data_) ? xabs : -xabs) < ((yabs == y.data_) ? yabs : -yabs)) ? y : x; + } +}; + +/// Helper class for half casts. +/// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member +/// function and a corresponding `type` member denoting its return type. 
+/// \tparam T destination type +/// \tparam U source type +/// \tparam R rounding mode to use +template +struct half_caster +{ +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); +#endif + + static half cast(U arg) + { + return cast_impl(arg, is_float()); + }; + +private: + static half cast_impl(U arg, true_type) + { + return half(binary, float2half(arg)); + } + static half cast_impl(U arg, false_type) + { + return half(binary, int2half(arg)); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(half arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(half arg, true_type) + { + return half2float(arg.data_); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); +#endif + + static T cast(expr arg) + { + return cast_impl(arg, is_float()); + } + +private: + static T cast_impl(float arg, true_type) + { + return static_cast(arg); + } + static T cast_impl(half arg, false_type) + { + return half2int(arg.data_); + } +}; +template +struct half_caster +{ + static half cast(half arg) + { + return arg; + } +}; +template +struct half_caster : half_caster +{ +}; + +/// \name Comparison operators +/// \{ + +/// Comparison for equality. +/// \param x first operand +/// \param y second operand +/// \retval true if operands equal +/// \retval false else +template +typename enable::type operator==(T x, U y) +{ + return functions::isequal(x, y); +} + +/// Comparison for inequality. 
+/// \param x first operand +/// \param y second operand +/// \retval true if operands not equal +/// \retval false else +template +typename enable::type operator!=(T x, U y) +{ + return functions::isnotequal(x, y); +} + +/// Comparison for less than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +template +typename enable::type operator<(T x, U y) +{ + return functions::isless(x, y); +} + +/// Comparison for greater than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +template +typename enable::type operator>(T x, U y) +{ + return functions::isgreater(x, y); +} + +/// Comparison for less equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +template +typename enable::type operator<=(T x, U y) +{ + return functions::islessequal(x, y); +} + +/// Comparison for greater equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +template +typename enable::type operator>=(T x, U y) +{ + return functions::isgreaterequal(x, y); +} + +/// \} +/// \name Arithmetic operators +/// \{ + +/// Add halfs. +/// \param x left operand +/// \param y right operand +/// \return sum of half expressions +template +typename enable::type operator+(T x, U y) +{ + return functions::plus(x, y); +} + +/// Subtract halfs. +/// \param x left operand +/// \param y right operand +/// \return difference of half expressions +template +typename enable::type operator-(T x, U y) +{ + return functions::minus(x, y); +} + +/// Multiply halfs. +/// \param x left operand +/// \param y right operand +/// \return product of half expressions +template +typename enable::type operator*(T x, U y) +{ + return functions::multiplies(x, y); +} + +/// Divide halfs. 
+/// \param x left operand +/// \param y right operand +/// \return quotient of half expressions +template +typename enable::type operator/(T x, U y) +{ + return functions::divides(x, y); +} + +/// Identity. +/// \param arg operand +/// \return uncahnged operand +template +HALF_CONSTEXPR typename enable::type operator+(T arg) +{ + return arg; +} + +/// Negation. +/// \param arg operand +/// \return negated operand +template +HALF_CONSTEXPR typename enable::type operator-(T arg) +{ + return unary_specialized::negate(arg); +} + +/// \} +/// \name Input and output +/// \{ + +/// Output operator. +/// \param out output stream to write into +/// \param arg half expression to write +/// \return reference to output stream +template +typename enable&, T>::type operator<<(std::basic_ostream& out, T arg) +{ + return functions::write(out, arg); +} + +/// Input operator. +/// \param in input stream to read from +/// \param arg half to read into +/// \return reference to input stream +template +std::basic_istream& operator>>(std::basic_istream& in, half& arg) +{ + return functions::read(in, arg); +} + +/// \} +/// \name Basic mathematical operations +/// \{ + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } +inline half abs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr abs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Absolute value. +/// \param arg operand +/// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } +inline half fabs(half arg) +{ + return unary_specialized::fabs(arg); +} +inline expr fabs(expr arg) +{ + return unary_specialized::fabs(arg); +} + +/// Remainder of division. +/// \param x first operand +/// \param y second operand +/// \return remainder of floating point division. 
// Forwarders to the functions:: implementations; the expr return type keeps mixed half/expr
// expressions in single-precision until finally assigned back to a half.
// template<typename T,typename U> typename enable<expr,T,U>::type fmod(T x, U y) { return functions::fmod(x, y); }
inline expr fmod(half x, half y) { return functions::fmod(x, y); }
inline expr fmod(half x, expr y) { return functions::fmod(x, y); }
inline expr fmod(expr x, half y) { return functions::fmod(x, y); }
inline expr fmod(expr x, expr y) { return functions::fmod(x, y); }

/// Remainder of division.
/// \param x first operand
/// \param y second operand
/// \return remainder of floating point division.
// template<typename T,typename U> typename enable<expr,T,U>::type remainder(T x, U y) { return
// functions::remainder(x, y); }
inline expr remainder(half x, half y) { return functions::remainder(x, y); }
inline expr remainder(half x, expr y) { return functions::remainder(x, y); }
inline expr remainder(expr x, half y) { return functions::remainder(x, y); }
inline expr remainder(expr x, expr y) { return functions::remainder(x, y); }

/// Remainder of division.
/// \param x first operand
/// \param y second operand
/// \param quo address to store some bits of quotient at
/// \return remainder of floating point division.
// template<typename T,typename U> typename enable<expr,T,U>::type remquo(T x, U y, int *quo) { return
// functions::remquo(x, y, quo); }
inline expr remquo(half x, half y, int* quo) { return functions::remquo(x, y, quo); }
inline expr remquo(half x, expr y, int* quo) { return functions::remquo(x, y, quo); }
inline expr remquo(expr x, half y, int* quo) { return functions::remquo(x, y, quo); }
inline expr remquo(expr x, expr y, int* quo) { return functions::remquo(x, y, quo); }

/// Fused multiply add.
/// \param x first operand
/// \param y second operand
/// \param z third operand
/// \return ( \a x * \a y ) + \a z rounded as one operation.
+// template typename enable::type fma(T x, U y, V z) { return +// functions::fma(x, y, z); } +inline expr fma(half x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(half x, expr y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, half y, expr z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, half z) +{ + return functions::fma(x, y, z); +} +inline expr fma(expr x, expr y, expr z) +{ + return functions::fma(x, y, z); +} + +/// Maximum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return maximum of operands +// template typename result::type fmax(T x, U y) { return +// binary_specialized::fmax(x, y); } +inline half fmax(half x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(half x, expr y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, half y) +{ + return binary_specialized::fmax(x, y); +} +inline expr fmax(expr x, expr y) +{ + return binary_specialized::fmax(x, y); +} + +/// Minimum of half expressions. +/// \param x first operand +/// \param y second operand +/// \return minimum of operands +// template typename result::type fmin(T x, U y) { return +// binary_specialized::fmin(x, y); } +inline half fmin(half x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(half x, expr y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, half y) +{ + return binary_specialized::fmin(x, y); +} +inline expr fmin(expr x, expr y) +{ + return binary_specialized::fmin(x, y); +} + +/// Positive difference. 
/// \param x first operand
/// \param y second operand
/// \return \a x - \a y or 0 if difference negative
// template<typename T,typename U> typename enable<expr,T,U>::type fdim(T x, U y) { return functions::fdim(x, y); }
inline expr fdim(half x, half y) { return functions::fdim(x, y); }
inline expr fdim(half x, expr y) { return functions::fdim(x, y); }
inline expr fdim(expr x, half y) { return functions::fdim(x, y); }
inline expr fdim(expr x, expr y) { return functions::fdim(x, y); }

/// Get NaN value.
/// \return quiet NaN (the tag string argument is ignored)
inline half nanh(const char*) { return functions::nanh(); }

/// \}
/// \name Exponential functions
/// \{

/// Exponential function.
/// \param arg function argument
/// \return e raised to \a arg
// template<typename T> typename enable<expr,T>::type exp(T arg) { return functions::exp(arg); }
inline expr exp(half arg) { return functions::exp(arg); }
inline expr exp(expr arg) { return functions::exp(arg); }

/// Exponential minus one.
/// \param arg function argument
/// \return e raised to \a arg subtracted by 1
// template<typename T> typename enable<expr,T>::type expm1(T arg) { return functions::expm1(arg); }
inline expr expm1(half arg) { return functions::expm1(arg); }
inline expr expm1(expr arg) { return functions::expm1(arg); }

/// Binary exponential.
/// \param arg function argument
/// \return 2 raised to \a arg
// template<typename T> typename enable<expr,T>::type exp2(T arg) { return functions::exp2(arg); }
inline expr exp2(half arg) { return functions::exp2(arg); }
inline expr exp2(expr arg) { return functions::exp2(arg); }

/// Natural logarithm.
/// \param arg function argument
/// \return logarithm of \a arg to base e
// template<typename T> typename enable<expr,T>::type log(T arg) { return functions::log(arg); }
inline expr log(half arg) { return functions::log(arg); }
inline expr log(expr arg) { return functions::log(arg); }

/// Common logarithm.
/// \param arg function argument
/// \return logarithm of \a arg to base 10
// template<typename T> typename enable<expr,T>::type log10(T arg) { return functions::log10(arg); }
inline expr log10(half arg) { return functions::log10(arg); }
inline expr log10(expr arg) { return functions::log10(arg); }

/// Natural logarithm.
/// \param arg function argument
/// \return logarithm of \a arg plus 1 to base e
// template<typename T> typename enable<expr,T>::type log1p(T arg) { return functions::log1p(arg); }
inline expr log1p(half arg) { return functions::log1p(arg); }
inline expr log1p(expr arg) { return functions::log1p(arg); }

/// Binary logarithm.
/// \param arg function argument
/// \return logarithm of \a arg to base 2
// template<typename T> typename enable<expr,T>::type log2(T arg) { return functions::log2(arg); }
inline expr log2(half arg) { return functions::log2(arg); }
inline expr log2(expr arg) { return functions::log2(arg); }

/// \}
/// \name Power functions
/// \{

/// Square root.
/// \param arg function argument
/// \return square root of \a arg
// template<typename T> typename enable<expr,T>::type sqrt(T arg) { return functions::sqrt(arg); }
inline expr sqrt(half arg) { return functions::sqrt(arg); }
inline expr sqrt(expr arg) { return functions::sqrt(arg); }

/// Cubic root.
/// \param arg function argument
/// \return cubic root of \a arg
// template<typename T> typename enable<expr,T>::type cbrt(T arg) { return functions::cbrt(arg); }
inline expr cbrt(half arg) { return functions::cbrt(arg); }
inline expr cbrt(expr arg) { return functions::cbrt(arg); }

/// Hypotenuse function.
/// \param x first argument
/// \param y second argument
/// \return square root of sum of squares without internal over- or underflows
// template<typename T,typename U> typename enable<expr,T,U>::type hypot(T x, U y) { return functions::hypot(x, y);
//}
inline expr hypot(half x, half y) { return functions::hypot(x, y); }
inline expr hypot(half x, expr y) { return functions::hypot(x, y); }
inline expr hypot(expr x, half y) { return functions::hypot(x, y); }
inline expr hypot(expr x, expr y) { return functions::hypot(x, y); }

/// Power function.
/// \param base first argument
/// \param exp second argument
/// \return \a base raised to \a exp
// template<typename T,typename U> typename enable<expr,T,U>::type pow(T base, U exp) { return functions::pow(base,
// exp); }
inline expr pow(half base, half exp) { return functions::pow(base, exp); }
inline expr pow(half base, expr exp) { return functions::pow(base, exp); }
inline expr pow(expr base, half exp) { return functions::pow(base, exp); }
inline expr pow(expr base, expr exp) { return functions::pow(base, exp); }

/// \}
/// \name Trigonometric functions
/// \{

/// Sine function.
/// \param arg function argument
/// \return sine value of \a arg
// template<typename T> typename enable<expr,T>::type sin(T arg) { return functions::sin(arg); }
inline expr sin(half arg) { return functions::sin(arg); }
inline expr sin(expr arg) { return functions::sin(arg); }

/// Cosine function.
/// \param arg function argument
/// \return cosine value of \a arg
// template<typename T> typename enable<expr,T>::type cos(T arg) { return functions::cos(arg); }
inline expr cos(half arg) { return functions::cos(arg); }
inline expr cos(expr arg) { return functions::cos(arg); }

/// Tangent function.
/// \param arg function argument
/// \return tangent value of \a arg
// template<typename T> typename enable<expr,T>::type tan(T arg) { return functions::tan(arg); }
inline expr tan(half arg) { return functions::tan(arg); }
inline expr tan(expr arg) { return functions::tan(arg); }

/// Arc sine.
/// \param arg function argument
/// \return arc sine value of \a arg
// template<typename T> typename enable<expr,T>::type asin(T arg) { return functions::asin(arg); }
inline expr asin(half arg) { return functions::asin(arg); }
inline expr asin(expr arg) { return functions::asin(arg); }

/// Arc cosine function.
/// \param arg function argument
/// \return arc cosine value of \a arg
// template<typename T> typename enable<expr,T>::type acos(T arg) { return functions::acos(arg); }
inline expr acos(half arg) { return functions::acos(arg); }
inline expr acos(expr arg) { return functions::acos(arg); }

/// Arc tangent function.
/// \param arg function argument
/// \return arc tangent value of \a arg
// template<typename T> typename enable<expr,T>::type atan(T arg) { return functions::atan(arg); }
inline expr atan(half arg) { return functions::atan(arg); }
inline expr atan(expr arg) { return functions::atan(arg); }

/// Arc tangent function.
/// \param x first argument
/// \param y second argument
/// \return arc tangent value
// template<typename T,typename U> typename enable<expr,T,U>::type atan2(T x, U y) { return functions::atan2(x, y);
//}
inline expr atan2(half x, half y) { return functions::atan2(x, y); }
inline expr atan2(half x, expr y) { return functions::atan2(x, y); }
inline expr atan2(expr x, half y) { return functions::atan2(x, y); }
inline expr atan2(expr x, expr y) { return functions::atan2(x, y); }

/// \}
/// \name Hyperbolic functions
/// \{

/// Hyperbolic sine.
/// \param arg function argument
/// \return hyperbolic sine value of \a arg
// template<typename T> typename enable<expr,T>::type sinh(T arg) { return functions::sinh(arg); }
inline expr sinh(half arg) { return functions::sinh(arg); }
inline expr sinh(expr arg) { return functions::sinh(arg); }

/// Hyperbolic cosine.
/// \param arg function argument
/// \return hyperbolic cosine value of \a arg
// template<typename T> typename enable<expr,T>::type cosh(T arg) { return functions::cosh(arg); }
inline expr cosh(half arg) { return functions::cosh(arg); }
inline expr cosh(expr arg) { return functions::cosh(arg); }

/// Hyperbolic tangent.
/// \param arg function argument
/// \return hyperbolic tangent value of \a arg
// template<typename T> typename enable<expr,T>::type tanh(T arg) { return functions::tanh(arg); }
inline expr tanh(half arg) { return functions::tanh(arg); }
inline expr tanh(expr arg) { return functions::tanh(arg); }

/// Hyperbolic area sine.
/// \param arg function argument
/// \return area sine value of \a arg
// template<typename T> typename enable<expr,T>::type asinh(T arg) { return functions::asinh(arg); }
inline expr asinh(half arg) { return functions::asinh(arg); }
inline expr asinh(expr arg) { return functions::asinh(arg); }

/// Hyperbolic area cosine.
/// \param arg function argument
/// \return area cosine value of \a arg
// template<typename T> typename enable<expr,T>::type acosh(T arg) { return functions::acosh(arg); }
inline expr acosh(half arg) { return functions::acosh(arg); }
inline expr acosh(expr arg) { return functions::acosh(arg); }

/// Hyperbolic area tangent.
/// \param arg function argument
/// \return area tangent value of \a arg
// template<typename T> typename enable<expr,T>::type atanh(T arg) { return functions::atanh(arg); }
inline expr atanh(half arg) { return functions::atanh(arg); }
inline expr atanh(expr arg) { return functions::atanh(arg); }

/// \}
/// \name Error and gamma functions
/// \{

/// Error function.
/// \param arg function argument
/// \return error function value of \a arg
// template<typename T> typename enable<expr,T>::type erf(T arg) { return functions::erf(arg); }
inline expr erf(half arg) { return functions::erf(arg); }
inline expr erf(expr arg) { return functions::erf(arg); }

/// Complementary error function.
/// \param arg function argument
/// \return 1 minus error function value of \a arg
// template<typename T> typename enable<expr,T>::type erfc(T arg) { return functions::erfc(arg); }
inline expr erfc(half arg) { return functions::erfc(arg); }
inline expr erfc(expr arg) { return functions::erfc(arg); }

/// Natural logarithm of gamma function.
/// \param arg function argument
/// \return natural logarithm of gamma function for \a arg
// template<typename T> typename enable<expr,T>::type lgamma(T arg) { return functions::lgamma(arg); }
inline expr lgamma(half arg) { return functions::lgamma(arg); }
inline expr lgamma(expr arg) { return functions::lgamma(arg); }

/// Gamma function.
/// \param arg function argument
/// \return gamma function value of \a arg
// template<typename T> typename enable<expr,T>::type tgamma(T arg) { return functions::tgamma(arg); }
inline expr tgamma(half arg) { return functions::tgamma(arg); }
inline expr tgamma(expr arg) { return functions::tgamma(arg); }

/// \}
/// \name Rounding
/// \{

/// Nearest integer not less than half value.
/// \param arg half to round
/// \return nearest integer not less than \a arg
// template<typename T> typename enable<half,T>::type ceil(T arg) { return functions::ceil(arg); }
inline half ceil(half arg) { return functions::ceil(arg); }
inline half ceil(expr arg) { return functions::ceil(arg); }

/// Nearest integer not greater than half value.
/// \param arg half to round
/// \return nearest integer not greater than \a arg
// template<typename T> typename enable<half,T>::type floor(T arg) { return functions::floor(arg); }
inline half floor(half arg) { return functions::floor(arg); }
inline half floor(expr arg) { return functions::floor(arg); }

/// Nearest integer not greater in magnitude than half value.
/// \param arg half to round
/// \return nearest integer not greater in magnitude than \a arg
// template<typename T> typename enable<half,T>::type trunc(T arg) { return functions::trunc(arg); }
inline half trunc(half arg) { return functions::trunc(arg); }
inline half trunc(expr arg) { return functions::trunc(arg); }

/// Nearest integer.
/// \param arg half to round
/// \return nearest integer, rounded away from zero in half-way cases
// template<typename T> typename enable<half,T>::type round(T arg) { return functions::round(arg); }
inline half round(half arg) { return functions::round(arg); }
inline half round(expr arg) { return functions::round(arg); }

/// Nearest integer.
/// \param arg half to round
/// \return nearest integer, rounded away from zero in half-way cases
// template<typename T> typename enable<long,T>::type lround(T arg) { return functions::lround(arg); }
inline long lround(half arg) { return functions::lround(arg); }
inline long lround(expr arg) { return functions::lround(arg); }

/// Nearest integer using half's internal rounding mode.
/// \param arg half expression to round
/// \return nearest integer using default rounding mode
// nearbyint and rint share the same implementation here (no floating point exceptions are raised either way)
// template<typename T> typename enable<half,T>::type nearbyint(T arg) { return functions::nearbyint(arg); }
inline half nearbyint(half arg) { return functions::rint(arg); }
inline half nearbyint(expr arg) { return functions::rint(arg); }

/// Nearest integer using half's internal rounding mode.
/// \param arg half expression to round
/// \return nearest integer using default rounding mode
// template<typename T> typename enable<half,T>::type rint(T arg) { return functions::rint(arg); }
inline half rint(half arg) { return functions::rint(arg); }
inline half rint(expr arg) { return functions::rint(arg); }

/// Nearest integer using half's internal rounding mode.
/// \param arg half expression to round
/// \return nearest integer using default rounding mode
// template<typename T> typename enable<long,T>::type lrint(T arg) { return functions::lrint(arg); }
inline long lrint(half arg) { return functions::lrint(arg); }
inline long lrint(expr arg) { return functions::lrint(arg); }
#if HALF_ENABLE_CPP11_LONG_LONG
/// Nearest integer.
/// \param arg half to round
/// \return nearest integer, rounded away from zero in half-way cases
// template<typename T> typename enable<long long,T>::type llround(T arg) { return functions::llround(arg); }
inline long long llround(half arg) { return functions::llround(arg); }
inline long long llround(expr arg) { return functions::llround(arg); }

/// Nearest integer using half's internal rounding mode.
/// \param arg half expression to round
/// \return nearest integer using default rounding mode
// template<typename T> typename enable<long long,T>::type llrint(T arg) { return functions::llrint(arg); }
inline long long llrint(half arg) { return functions::llrint(arg); }
inline long long llrint(expr arg) { return functions::llrint(arg); }
#endif

/// \}
/// \name Floating point manipulation
/// \{

/// Decompress floating point number.
/// \param arg number to decompress
/// \param exp address to store exponent at
/// \return significant in range [0.5, 1)
// template<typename T> typename enable<half,T>::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); }
inline half frexp(half arg, int* exp) { return functions::frexp(arg, exp); }
inline half frexp(expr arg, int* exp) { return functions::frexp(arg, exp); }

/// Multiply by power of two.
+/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half ldexp(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half ldexp(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract integer and fractional parts. +/// \param arg number to decompress +/// \param iptr address to store integer part at +/// \return fractional part +// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); +//} +inline half modf(half arg, half* iptr) +{ + return functions::modf(arg, iptr); +} +inline half modf(expr arg, half* iptr) +{ + return functions::modf(arg, iptr); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); +//} +inline half scalbn(half arg, int exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbn(expr arg, int exp) +{ + return functions::scalbln(arg, exp); +} + +/// Multiply by power of two. +/// \param arg number to modify +/// \param exp power of two to multiply with +/// \return \a arg multplied by 2 raised to \a exp +// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, +// exp); +//} +inline half scalbln(half arg, long exp) +{ + return functions::scalbln(arg, exp); +} +inline half scalbln(expr arg, long exp) +{ + return functions::scalbln(arg, exp); +} + +/// Extract exponent. 
+/// \param arg number to query +/// \return floating point exponent +/// \retval FP_ILOGB0 for zero +/// \retval FP_ILOGBNAN for NaN +/// \retval MAX_INT for infinity +// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } +inline int ilogb(half arg) +{ + return functions::ilogb(arg); +} +inline int ilogb(expr arg) +{ + return functions::ilogb(arg); +} + +/// Extract exponent. +/// \param arg number to query +/// \return floating point exponent +// template typename enable::type logb(T arg) { return functions::logb(arg); } +inline half logb(half arg) +{ + return functions::logb(arg); +} +inline half logb(expr arg) +{ + return functions::logb(arg); +} + +/// Next representable value. +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nextafter(T from, U to) { return +// functions::nextafter(from, to); } +inline half nextafter(half from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(half from, expr to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, half to) +{ + return functions::nextafter(from, to); +} +inline half nextafter(expr from, expr to) +{ + return functions::nextafter(from, to); +} + +/// Next representable value. +/// \param from value to compute next representable value for +/// \param to direction towards which to compute next value +/// \return next representable value after \a from in direction towards \a to +// template typename enable::type nexttoward(T from, long double to) { return +// functions::nexttoward(from, to); } +inline half nexttoward(half from, long double to) +{ + return functions::nexttoward(from, to); +} +inline half nexttoward(expr from, long double to) +{ + return functions::nexttoward(from, to); +} + +/// Take sign. 
+/// \param x value to change sign for +/// \param y value to take sign from +/// \return value equal to \a x in magnitude and to \a y in sign +// template typename enable::type copysign(T x, U y) { return +// functions::copysign(x, y); } +inline half copysign(half x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(half x, expr y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, half y) +{ + return functions::copysign(x, y); +} +inline half copysign(expr x, expr y) +{ + return functions::copysign(x, y); +} + +/// \} +/// \name Floating point classification +/// \{ + +/// Classify floating point value. +/// \param arg number to classify +/// \retval FP_ZERO for positive and negative zero +/// \retval FP_SUBNORMAL for subnormal numbers +/// \retval FP_INFINITY for positive and negative infinity +/// \retval FP_NAN for NaNs +/// \retval FP_NORMAL for all other (normal) values +// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } +inline int fpclassify(half arg) +{ + return functions::fpclassify(arg); +} +inline int fpclassify(expr arg) +{ + return functions::fpclassify(arg); +} + +/// Check if finite number. +/// \param arg number to check +/// \retval true if neither infinity nor NaN +/// \retval false else +// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } +inline bool isfinite(half arg) +{ + return functions::isfinite(arg); +} +inline bool isfinite(expr arg) +{ + return functions::isfinite(arg); +} + +/// Check for infinity. +/// \param arg number to check +/// \retval true for positive or negative infinity +/// \retval false else +// template typename enable::type isinf(T arg) { return functions::isinf(arg); } +inline bool isinf(half arg) +{ + return functions::isinf(arg); +} +inline bool isinf(expr arg) +{ + return functions::isinf(arg); +} + +/// Check for NaN. 
+/// \param arg number to check +/// \retval true for NaNs +/// \retval false else +// template typename enable::type isnan(T arg) { return functions::isnan(arg); } +inline bool isnan(half arg) +{ + return functions::isnan(arg); +} +inline bool isnan(expr arg) +{ + return functions::isnan(arg); +} + +/// Check if normal number. +/// \param arg number to check +/// \retval true if normal number +/// \retval false if either subnormal, zero, infinity or NaN +// template typename enable::type isnormal(T arg) { return functions::isnormal(arg); } +inline bool isnormal(half arg) +{ + return functions::isnormal(arg); +} +inline bool isnormal(expr arg) +{ + return functions::isnormal(arg); +} + +/// Check sign. +/// \param arg number to check +/// \retval true for negative number +/// \retval false for positive number +// template typename enable::type signbit(T arg) { return functions::signbit(arg); } +inline bool signbit(half arg) +{ + return functions::signbit(arg); +} +inline bool signbit(expr arg) +{ + return functions::signbit(arg); +} + +/// \} +/// \name Comparison +/// \{ + +/// Comparison for greater than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater than \a y +/// \retval false else +// template typename enable::type isgreater(T x, U y) { return +// functions::isgreater(x, y); } +inline bool isgreater(half x, half y) +{ + return functions::isgreater(x, y); +} +inline bool isgreater(half x, expr y) +{ + return functions::isgreater(x, y); +} +inline bool isgreater(expr x, half y) +{ + return functions::isgreater(x, y); +} +inline bool isgreater(expr x, expr y) +{ + return functions::isgreater(x, y); +} + +/// Comparison for greater equal. 
+/// \param x first operand +/// \param y second operand +/// \retval true if \a x greater equal \a y +/// \retval false else +// template typename enable::type isgreaterequal(T x, U y) { return +// functions::isgreaterequal(x, y); } +inline bool isgreaterequal(half x, half y) +{ + return functions::isgreaterequal(x, y); +} +inline bool isgreaterequal(half x, expr y) +{ + return functions::isgreaterequal(x, y); +} +inline bool isgreaterequal(expr x, half y) +{ + return functions::isgreaterequal(x, y); +} +inline bool isgreaterequal(expr x, expr y) +{ + return functions::isgreaterequal(x, y); +} + +/// Comparison for less than. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less than \a y +/// \retval false else +// template typename enable::type isless(T x, U y) { return functions::isless(x, +// y); +//} +inline bool isless(half x, half y) +{ + return functions::isless(x, y); +} +inline bool isless(half x, expr y) +{ + return functions::isless(x, y); +} +inline bool isless(expr x, half y) +{ + return functions::isless(x, y); +} +inline bool isless(expr x, expr y) +{ + return functions::isless(x, y); +} + +/// Comparison for less equal. +/// \param x first operand +/// \param y second operand +/// \retval true if \a x less equal \a y +/// \retval false else +// template typename enable::type islessequal(T x, U y) { return +// functions::islessequal(x, y); } +inline bool islessequal(half x, half y) +{ + return functions::islessequal(x, y); +} +inline bool islessequal(half x, expr y) +{ + return functions::islessequal(x, y); +} +inline bool islessequal(expr x, half y) +{ + return functions::islessequal(x, y); +} +inline bool islessequal(expr x, expr y) +{ + return functions::islessequal(x, y); +} + +/// Comarison for less or greater. 
+/// \param x first operand +/// \param y second operand +/// \retval true if either less or greater +/// \retval false else +// template typename enable::type islessgreater(T x, U y) { return +// functions::islessgreater(x, y); } +inline bool islessgreater(half x, half y) +{ + return functions::islessgreater(x, y); +} +inline bool islessgreater(half x, expr y) +{ + return functions::islessgreater(x, y); +} +inline bool islessgreater(expr x, half y) +{ + return functions::islessgreater(x, y); +} +inline bool islessgreater(expr x, expr y) +{ + return functions::islessgreater(x, y); +} + +/// Check if unordered. +/// \param x first operand +/// \param y second operand +/// \retval true if unordered (one or two NaN operands) +/// \retval false else +// template typename enable::type isunordered(T x, U y) { return +// functions::isunordered(x, y); } +inline bool isunordered(half x, half y) +{ + return functions::isunordered(x, y); +} +inline bool isunordered(half x, expr y) +{ + return functions::isunordered(x, y); +} +inline bool isunordered(expr x, half y) +{ + return functions::isunordered(x, y); +} +inline bool isunordered(expr x, expr y) +{ + return functions::isunordered(x, y); +} + +/// \name Casting +/// \{ + +/// Cast to or from half-precision floating point number. +/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted +/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. +/// It uses the default rounding mode. +/// +/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types +/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler +/// error and casting between [half](\ref half_float::half)s is just a no-op. 
+/// \tparam T destination type (half or built-in arithmetic type) +/// \tparam U source type (half or built-in arithmetic type) +/// \param arg value to cast +/// \return \a arg converted to destination type +template +T half_cast(U arg) +{ + return half_caster::cast(arg); +} + +/// Cast to or from half-precision floating point number. +/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted +/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. +/// +/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types +/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler +/// error and casting between [half](\ref half_float::half)s is just a no-op. +/// \tparam T destination type (half or built-in arithmetic type) +/// \tparam R rounding mode to use. +/// \tparam U source type (half or built-in arithmetic type) +/// \param arg value to cast +/// \return \a arg converted to destination type +template +T half_cast(U arg) +{ + return half_caster::cast(arg); +} +/// \} +} // namespace detail + +using detail::operator==; +using detail::operator!=; +using detail::operator<; +using detail::operator>; +using detail::operator<=; +using detail::operator>=; +using detail::operator+; +using detail::operator-; +using detail::operator*; +using detail::operator/; +using detail::operator<<; +using detail::operator>>; + +using detail::abs; +using detail::acos; +using detail::acosh; +using detail::asin; +using detail::asinh; +using detail::atan; +using detail::atan2; +using detail::atanh; +using detail::cbrt; +using detail::ceil; +using detail::cos; +using detail::cosh; +using detail::erf; +using detail::erfc; +using detail::exp; +using detail::exp2; +using detail::expm1; +using detail::fabs; +using detail::fdim; +using detail::floor; +using 
detail::fma; +using detail::fmax; +using detail::fmin; +using detail::fmod; +using detail::hypot; +using detail::lgamma; +using detail::log; +using detail::log10; +using detail::log1p; +using detail::log2; +using detail::lrint; +using detail::lround; +using detail::nanh; +using detail::nearbyint; +using detail::pow; +using detail::remainder; +using detail::remquo; +using detail::rint; +using detail::round; +using detail::sin; +using detail::sinh; +using detail::sqrt; +using detail::tan; +using detail::tanh; +using detail::tgamma; +using detail::trunc; +#if HALF_ENABLE_CPP11_LONG_LONG +using detail::llrint; +using detail::llround; +#endif +using detail::copysign; +using detail::fpclassify; +using detail::frexp; +using detail::ilogb; +using detail::isfinite; +using detail::isgreater; +using detail::isgreaterequal; +using detail::isinf; +using detail::isless; +using detail::islessequal; +using detail::islessgreater; +using detail::isnan; +using detail::isnormal; +using detail::isunordered; +using detail::ldexp; +using detail::logb; +using detail::modf; +using detail::nextafter; +using detail::nexttoward; +using detail::scalbln; +using detail::scalbn; +using detail::signbit; + +using detail::half_cast; +} // namespace half_float + +/// Extensions to the C++ standard library. +namespace std +{ +/// Numeric limits for half-precision floats. +/// Because of the underlying single-precision implementation of many operations, it inherits some properties from +/// `std::numeric_limits`. +template <> +class numeric_limits : public numeric_limits +{ +public: + /// Supports signed values. + static HALF_CONSTEXPR_CONST bool is_signed = true; + + /// Is not exact. + static HALF_CONSTEXPR_CONST bool is_exact = false; + + /// Doesn't provide modulo arithmetic. + static HALF_CONSTEXPR_CONST bool is_modulo = false; + + /// IEEE conformant. + static HALF_CONSTEXPR_CONST bool is_iec559 = true; + + /// Supports infinity. 
+ static HALF_CONSTEXPR_CONST bool has_infinity = true; + + /// Supports quiet NaNs. + static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; + + /// Supports subnormal values. + static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; + + /// Rounding mode. + /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying + /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding + /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the + /// single-precision rounding mode. + static HALF_CONSTEXPR_CONST float_round_style round_style + = (std::numeric_limits::round_style == half_float::half::round_style) ? half_float::half::round_style + : round_indeterminate; + + /// Significant digits. + static HALF_CONSTEXPR_CONST int digits = 11; + + /// Significant decimal digits. + static HALF_CONSTEXPR_CONST int digits10 = 3; + + /// Required decimal digits to represent all possible values. + static HALF_CONSTEXPR_CONST int max_digits10 = 5; + + /// Number base. + static HALF_CONSTEXPR_CONST int radix = 2; + + /// One more than smallest exponent. + static HALF_CONSTEXPR_CONST int min_exponent = -13; + + /// Smallest normalized representable power of 10. + static HALF_CONSTEXPR_CONST int min_exponent10 = -4; + + /// One more than largest exponent + static HALF_CONSTEXPR_CONST int max_exponent = 16; + + /// Largest finitely representable power of 10. + static HALF_CONSTEXPR_CONST int max_exponent10 = 4; + + /// Smallest positive normal value. + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x0400); + } + + /// Smallest finite value. + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0xFBFF); + } + + /// Largest finite value. 
+ static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7BFF); + } + + /// Difference between one and next representable value. + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x1400); + } + + /// Maximum rounding error. + static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, (round_style == std::round_to_nearest) ? 0x3800 : 0x3C00); + } + + /// Positive infinity. + static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7C00); + } + + /// Quiet NaN. + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7FFF); + } + + /// Signalling NaN. + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x7DFF); + } + + /// Smallest positive subnormal value. + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW + { + return half_float::half(half_float::detail::binary, 0x0001); + } +}; + +#if HALF_ENABLE_CPP11_HASH +/// Hash function for half-precision floats. +/// This is only defined if C++11 `std::hash` is supported and enabled. +template <> +struct hash //: unary_function +{ + /// Type of function argument. + typedef half_float::half argument_type; + + /// Function return type. + typedef size_t result_type; + + /// Compute hash function. 
+ /// \param arg half to hash + /// \return hash value + result_type operator()(argument_type arg) const + { + return hash()(static_cast(arg.data_) & -(arg.data_ != 0x8000)); + } +}; +#endif +} // namespace std + +#undef HALF_CONSTEXPR +#undef HALF_CONSTEXPR_CONST +#undef HALF_NOEXCEPT +#undef HALF_NOTHROW +#ifdef HALF_POP_WARNINGS +#pragma warning(pop) +#undef HALF_POP_WARNINGS +#endif + +#endif diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/logger.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/logger.cpp new file mode 100644 index 0000000..85c21e7 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/logger.cpp @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "logger.h" +#include "ErrorRecorder.h" +#include "logging.h" +using namespace nvinfer1; +SampleErrorRecorder gRecorder; +namespace sample +{ + Logger gLogger{ Logger::Severity::kINFO }; + LogStreamConsumer gLogVerbose{ LOG_VERBOSE(gLogger) }; + LogStreamConsumer gLogInfo{ LOG_INFO(gLogger) }; + LogStreamConsumer gLogWarning{ LOG_WARN(gLogger) }; + LogStreamConsumer gLogError{ LOG_ERROR(gLogger) }; + LogStreamConsumer gLogFatal{ LOG_FATAL(gLogger) }; + + void setReportableSeverity(Logger::Severity severity) + { + gLogger.setReportableSeverity(severity); + gLogVerbose.setReportableSeverity(severity); + gLogInfo.setReportableSeverity(severity); + gLogWarning.setReportableSeverity(severity); + gLogError.setReportableSeverity(severity); + gLogFatal.setReportableSeverity(severity); + } +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/logger.h b/Code/TestTRTInterDll/trtinfer_lib/common/logger.h new file mode 100644 index 0000000..ff59bfa --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/logger.h @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LOGGER_H +#define LOGGER_H + +#include "logging.h" + +class SampleErrorRecorder; +extern SampleErrorRecorder gRecorder; +namespace sample +{ +extern Logger gLogger; +extern LogStreamConsumer gLogVerbose; +extern LogStreamConsumer gLogInfo; +extern LogStreamConsumer gLogWarning; +extern LogStreamConsumer gLogError; +extern LogStreamConsumer gLogFatal; + +void setReportableSeverity(Logger::Severity severity); +} // namespace sample + +#endif // LOGGER_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/logging.h b/Code/TestTRTInterDll/trtinfer_lib/common/logging.h new file mode 100644 index 0000000..38cbbd0 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/logging.h @@ -0,0 +1,579 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TENSORRT_LOGGING_H +#define TENSORRT_LOGGING_H + +#include "NvInferRuntimeCommon.h" +#include "sampleOptions.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sample +{ + +using Severity = nvinfer1::ILogger::Severity; + +class LogStreamConsumerBuffer : public std::stringbuf +{ +public: + LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mOutput(stream) + , mPrefix(prefix) + , mShouldLog(shouldLog) + { + } + + LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) noexcept + : mOutput(other.mOutput) + , mPrefix(other.mPrefix) + , mShouldLog(other.mShouldLog) + { + } + LogStreamConsumerBuffer(const LogStreamConsumerBuffer& other) = delete; + LogStreamConsumerBuffer() = delete; + LogStreamConsumerBuffer& operator=(const LogStreamConsumerBuffer&) = delete; + LogStreamConsumerBuffer& operator=(LogStreamConsumerBuffer&&) = delete; + + ~LogStreamConsumerBuffer() override + { + // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence + // std::streambuf::pptr() gives a pointer to the current position of the output sequence + // if the pointer to the beginning is not equal to the pointer to the current position, + // call putOutput() to log the output to the stream + if (pbase() != pptr()) + { + putOutput(); + } + } + + //! + //! synchronizes the stream buffer and returns 0 on success + //! synchronizing the stream buffer consists of inserting the buffer contents into the stream, + //! resetting the buffer and flushing the stream + //! 
+ int32_t sync() override + { + putOutput(); + return 0; + } + + void putOutput() + { + if (mShouldLog) + { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(×tamp); + mOutput << "["; + mOutput << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; + mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + mOutput << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + mOutput << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; + // std::stringbuf::str() gets the string contents of the buffer + // insert the buffer contents pre-appended by the appropriate prefix into the stream + mOutput << mPrefix << str(); + } + // set the buffer to empty + str(""); + // flush the stream + mOutput.flush(); + } + + void setShouldLog(bool shouldLog) + { + mShouldLog = shouldLog; + } + +private: + std::ostream& mOutput; + std::string mPrefix; + bool mShouldLog{}; +}; // class LogStreamConsumerBuffer + +//! +//! \class LogStreamConsumerBase +//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer +//! +class LogStreamConsumerBase +{ +public: + LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mBuffer(stream, prefix, shouldLog) + { + } + +protected: + std::mutex mLogMutex; + LogStreamConsumerBuffer mBuffer; +}; // class LogStreamConsumerBase + +//! +//! \class LogStreamConsumer +//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. +//! Order of base classes is LogStreamConsumerBase and then std::ostream. +//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field +//! 
in LogStreamConsumer and then the address of the buffer is passed to std::ostream. +//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. +//! Please do not change the order of the parent classes. +//! +class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream +{ +public: + //! + //! \brief Creates a LogStreamConsumer which logs messages with level severity. + //! Reportable severity determines if the messages are severe enough to be logged. + //! + LogStreamConsumer(nvinfer1::ILogger::Severity reportableSeverity, nvinfer1::ILogger::Severity severity) + : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(severity <= reportableSeverity) + , mSeverity(severity) + { + } + + LogStreamConsumer(LogStreamConsumer&& other) noexcept + : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(other.mShouldLog) + , mSeverity(other.mSeverity) + { + } + LogStreamConsumer(const LogStreamConsumer& other) = delete; + LogStreamConsumer() = delete; + ~LogStreamConsumer() override = default; + LogStreamConsumer& operator=(const LogStreamConsumer&) = delete; + LogStreamConsumer& operator=(LogStreamConsumer&&) = delete; + + void setReportableSeverity(Severity reportableSeverity) + { + mShouldLog = mSeverity <= reportableSeverity; + mBuffer.setShouldLog(mShouldLog); + } + + std::mutex& getMutex() + { + return mLogMutex; + } + + bool getShouldLog() const + { + return mShouldLog; + } + +private: + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? 
std::cout : std::cerr; + } + + static std::string severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + bool mShouldLog; + Severity mSeverity; +}; // class LogStreamConsumer + +template +LogStreamConsumer& operator<<(LogStreamConsumer& logger, const T& obj) +{ + if (logger.getShouldLog()) + { + std::lock_guard guard(logger.getMutex()); + auto& os = static_cast(logger); + os << obj; + } + return logger; +} + +//! +//! Special handling std::endl +//! +inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, std::ostream& (*f)(std::ostream&) ) +{ + if (logger.getShouldLog()) + { + std::lock_guard guard(logger.getMutex()); + auto& os = static_cast(logger); + os << f; + } + return logger; +} + +inline LogStreamConsumer& operator<<(LogStreamConsumer& logger, const nvinfer1::Dims& dims) +{ + if (logger.getShouldLog()) + { + std::lock_guard guard(logger.getMutex()); + auto& os = static_cast(logger); + for (int32_t i = 0; i < dims.nbDims; ++i) + { + os << (i ? "x" : "") << dims.d[i]; + } + } + return logger; +} + +//! +//! \class Logger +//! +//! \brief Class which manages logging of TensorRT tools and samples +//! +//! \details This class provides a common interface for TensorRT tools and samples to log information to the console, +//! and supports logging two types of messages: +//! +//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) +//! - Test pass/fail messages +//! +//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is +//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. +//! +//! 
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
//!
class Logger : public nvinfer1::ILogger
{
public:
    //! Construct a Logger that only emits messages with severity at or above \p severity.
    explicit Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger() noexcept
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) noexcept override
    {
        // Messages routed through the TRT logger are tagged "[TRT]" so they can be told apart from sample output.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity) noexcept
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        //!< set by reportTestStart(); asserted against double-starting a test
        std::string mName;    //!< test name, e.g. "TensorRT.sample_googlenet"
        std::string mCmdline; //!< command line used to reproduce the test
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test. This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, int32_t argc, char const* const* argv)
    {
        // Append TensorRT version as info
        const std::string vname = name + " [TensorRT v" + std::to_string(NV_TENSORRT_VERSION) + "]";
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(vname, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // The RUNNING banner is emitted before the started flag is checked and set.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(TestAtom const& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! Report a passed test and return the conventional process exit code.
    static int32_t reportPass(TestAtom const& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! Report a failed test and return the conventional process exit code.
    static int32_t reportFail(TestAtom const& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! Report a waived (skipped) test; waiving counts as success for the exit code.
    static int32_t reportWaive(TestAtom const& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! Report pass or fail depending on \p pass and return the matching exit code.
    static int32_t reportTest(TestAtom const& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! \return the minimum severity this logger will emit.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(TestAtom const& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int32_t argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int32_t i = 0; i < argc; i++)
        {
            if (i > 0)
            {
                ss << " ";
            }
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; //!< minimum severity that will be emitted
}; // class Logger

namespace
{
//!
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}
} // anonymous namespace
} // namespace sample
#endif // TENSORRT_LOGGING_H
diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/parserOnnxConfig.h b/Code/TestTRTInterDll/trtinfer_lib/common/parserOnnxConfig.h
new file mode 100644
index 0000000..b1c4e43
--- /dev/null
+++ b/Code/TestTRTInterDll/trtinfer_lib/common/parserOnnxConfig.h
/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +#ifndef PARSER_ONNX_CONFIG_H +#define PARSER_ONNX_CONFIG_H + +#include +#include +#include + +#include "NvInfer.h" +#include "NvOnnxConfig.h" +#include "NvOnnxParser.h" + +#define ONNX_DEBUG 1 + +/** + * \class ParserOnnxConfig + * \brief Configuration Manager Class Concrete Implementation + * + * \note: + * + */ + +class ParserOnnxConfig : public nvonnxparser::IOnnxConfig +{ + +protected: + std::string mModelFilename{}; + std::string mTextFilename{}; + std::string mFullTextFilename{}; + nvinfer1::DataType mModelDtype; + nvonnxparser::IOnnxConfig::Verbosity mVerbosity; + bool mPrintLayercInfo; + +public: + ParserOnnxConfig() + : mModelDtype(nvinfer1::DataType::kFLOAT) + , mVerbosity(static_cast(nvinfer1::ILogger::Severity::kWARNING)) + , mPrintLayercInfo(false) + { +#ifdef ONNX_DEBUG + if (isDebug()) + { + std::cout << " ParserOnnxConfig::ctor(): " << this << "\t" << std::endl; + } +#endif + } + +protected: + ~ParserOnnxConfig() override + { +#ifdef ONNX_DEBUG + if (isDebug()) + { + std::cout << "ParserOnnxConfig::dtor(): " << this << std::endl; + } +#endif + } + +public: + void setModelDtype(const nvinfer1::DataType modelDtype) noexcept override + { + mModelDtype = modelDtype; + } + + nvinfer1::DataType getModelDtype() const noexcept override + { + return mModelDtype; + } + + const char* getModelFileName() const noexcept override + { + return mModelFilename.c_str(); + } + void setModelFileName(const char* onnxFilename) noexcept override + { + mModelFilename = std::string(onnxFilename); + } + nvonnxparser::IOnnxConfig::Verbosity getVerbosityLevel() const noexcept override + { + return mVerbosity; + } + void addVerbosity() noexcept override + { + ++mVerbosity; + } + void reduceVerbosity() noexcept override + { + --mVerbosity; + } + void setVerbosityLevel(nvonnxparser::IOnnxConfig::Verbosity verbosity) noexcept override + { + mVerbosity = verbosity; + } + + const char* getTextFileName() const noexcept override + { + return mTextFilename.c_str(); + } + void 
setTextFileName(const char* textFilename) noexcept override + { + mTextFilename = std::string(textFilename); + } + const char* getFullTextFileName() const noexcept override + { + return mFullTextFilename.c_str(); + } + void setFullTextFileName(const char* fullTextFilename) noexcept override + { + mFullTextFilename = std::string(fullTextFilename); + } + bool getPrintLayerInfo() const noexcept override + { + return mPrintLayercInfo; + } + void setPrintLayerInfo(bool src) noexcept override + { + mPrintLayercInfo = src; + } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() + + virtual bool isDebug() const noexcept + { +#if ONNX_DEBUG + return (std::getenv("ONNX_DEBUG") ? true : false); +#else + return false; +#endif + } + + void destroy() noexcept override + { + delete this; + } + +}; // class ParserOnnxConfig + +#endif diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/safeCommon.h b/Code/TestTRTInterDll/trtinfer_lib/common/safeCommon.h new file mode 100644 index 0000000..326257a --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/safeCommon.h @@ -0,0 +1,224 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TENSORRT_SAFE_COMMON_H +#define TENSORRT_SAFE_COMMON_H + +#include "cuda_runtime.h" +#include "NvInferRuntimeCommon.h" +#include +#include +#include +#include +#include +#include + +// For safeLoadLibrary +#ifdef _MSC_VER +// Needed so that the max/min definitions in windows.h do not conflict with std::max/min. +#define NOMINMAX +#include +#undef NOMINMAX +#else +#include +#endif + +#undef CHECK +#define CHECK(status) \ + do \ + { \ + auto ret = (status); \ + if (ret != 0) \ + { \ + std::cerr << "Cuda failure: " << ret << std::endl; \ + abort(); \ + } \ + } while (0) + +#undef SAFE_ASSERT +#define SAFE_ASSERT(condition) \ + do \ + { \ + if (!(condition)) \ + { \ + std::cerr << "Assertion failure: " << #condition << std::endl; \ + abort(); \ + } \ + } while (0) + +namespace samplesCommon +{ +template +inline std::shared_ptr infer_object(T* obj) +{ + if (!obj) + { + throw std::runtime_error("Failed to create object"); + } + return std::shared_ptr(obj); +} + +inline uint32_t elementSize(nvinfer1::DataType t) +{ + switch (t) + { + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4; + case nvinfer1::DataType::kHALF: return 2; + case nvinfer1::DataType::kINT8: return 1; + case nvinfer1::DataType::kUINT8: return 1; + case nvinfer1::DataType::kBOOL: return 1; + case nvinfer1::DataType::kFP8: return 1; + } + return 0; +} + +template +inline A divUp(A x, B n) +{ + return (x + n - 1) / n; +} + +inline int64_t volume(nvinfer1::Dims const& d) +{ + return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies{}); +} + +// Return m rounded up to nearest multiple of n +template +inline T roundUp(T m, T n) +{ + return ((m + n - 1) / n) * n; +} + +//! comps is the number of components in a vector. Ignored if vecDim < 0. 
+inline int64_t volume(nvinfer1::Dims dims, int32_t vecDim, int32_t comps, int32_t batch) +{ + if (vecDim >= 0) + { + dims.d[vecDim] = roundUp(dims.d[vecDim], comps); + } + return samplesCommon::volume(dims) * std::max(batch, 1); +} + +//! +//! \class TrtCudaGraphSafe +//! \brief Managed CUDA graph +//! +class TrtCudaGraphSafe +{ +public: + explicit TrtCudaGraphSafe() = default; + + TrtCudaGraphSafe(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe& operator=(const TrtCudaGraphSafe&) = delete; + + TrtCudaGraphSafe(TrtCudaGraphSafe&&) = delete; + + TrtCudaGraphSafe& operator=(TrtCudaGraphSafe&&) = delete; + + ~TrtCudaGraphSafe() + { + if (mGraphExec) + { + cudaGraphExecDestroy(mGraphExec); + } + } + + void beginCapture(cudaStream_t& stream) + { + // cudaStreamCaptureModeGlobal is the only allowed mode in SAFE CUDA + CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + } + + bool launch(cudaStream_t& stream) + { + return cudaGraphLaunch(mGraphExec, stream) == cudaSuccess; + } + + void endCapture(cudaStream_t& stream) + { + CHECK(cudaStreamEndCapture(stream, &mGraph)); + CHECK(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0)); + CHECK(cudaGraphDestroy(mGraph)); + } + + void endCaptureOnError(cudaStream_t& stream) + { + // There are two possibilities why stream capture would fail: + // (1) stream is in cudaErrorStreamCaptureInvalidated state. + // (2) TRT reports a failure. + // In case (1), the returning mGraph should be nullptr. + // In case (2), the returning mGraph is not nullptr, but it should not be used. + const auto ret = cudaStreamEndCapture(stream, &mGraph); + if (ret == cudaErrorStreamCaptureInvalidated) + { + SAFE_ASSERT(mGraph == nullptr); + } + else + { + SAFE_ASSERT(ret == cudaSuccess); + SAFE_ASSERT(mGraph != nullptr); + CHECK(cudaGraphDestroy(mGraph)); + mGraph = nullptr; + } + // Clean up any CUDA error. + cudaGetLastError(); + sample::gLogError << "The CUDA graph capture on the stream has failed." 
<< std::endl; + } + +private: + cudaGraph_t mGraph{}; + cudaGraphExec_t mGraphExec{}; +}; + +inline void safeLoadLibrary(const std::string& path) +{ +#ifdef _MSC_VER + void* handle = LoadLibrary(path.c_str()); +#else + int32_t flags{RTLD_LAZY}; + void* handle = dlopen(path.c_str(), flags); +#endif + if (handle == nullptr) + { +#ifdef _MSC_VER + sample::gLogError << "Could not load plugin library: " << path << std::endl; +#else + sample::gLogError << "Could not load plugin library: " << path << ", due to: " << dlerror() << std::endl; +#endif + } +} + +inline std::vector safeSplitString(std::string str, char delimiter = ',') +{ + std::vector splitVect; + std::stringstream ss(str); + std::string substr; + + while (ss.good()) + { + getline(ss, substr, delimiter); + splitVect.emplace_back(std::move(substr)); + } + return splitVect; +} + +} // namespace samplesCommon + +#endif // TENSORRT_SAFE_COMMON_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleConfig.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleConfig.h new file mode 100644 index 0000000..ec1948a --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleConfig.h @@ -0,0 +1,338 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SampleConfig_H +#define SampleConfig_H + +#include +#include +#include + +#include "NvInfer.h" +#include "NvOnnxConfig.h" +class SampleConfig : public nvonnxparser::IOnnxConfig +{ +public: + enum class InputDataFormat : int + { + kASCII = 0, + kPPM = 1 + }; + +private: + std::string mModelFilename; + std::string mEngineFilename; + std::string mTextFilename; + std::string mFullTextFilename; + std::string mImageFilename; + std::string mReferenceFilename; + std::string mOutputFilename; + std::string mCalibrationFilename; + std::string mTimingCacheFilename; + int64_t mLabel{-1}; + int64_t mMaxBatchSize{32}; + int64_t mCalibBatchSize{0}; + int64_t mMaxNCalibBatch{0}; + int64_t mFirstCalibBatch{0}; + int64_t mUseDLACore{-1}; + nvinfer1::DataType mModelDtype{nvinfer1::DataType::kFLOAT}; + bool mTF32{true}; + Verbosity mVerbosity{static_cast(nvinfer1::ILogger::Severity::kWARNING)}; + bool mPrintLayercInfo{false}; + bool mDebugBuilder{false}; + InputDataFormat mInputDataFormat{InputDataFormat::kASCII}; + uint64_t mTopK{0}; + float mFailurePercentage{-1.0f}; + float mTolerance{0.0f}; + float mAbsTolerance{1e-5f}; + +public: + SampleConfig() + { +#ifdef ONNX_DEBUG + if (isDebug()) + { + std::cout << " SampleConfig::ctor(): " << this << "\t" << std::endl; + } +#endif + } + +protected: + ~SampleConfig() override + { +#ifdef ONNX_DEBUG + if (isDebug()) + { + std::cout << "SampleConfig::dtor(): " << this << std::endl; + } +#endif + } + +public: + void setModelDtype(const nvinfer1::DataType mdt) noexcept override + { + mModelDtype = mdt; + } + + nvinfer1::DataType getModelDtype() const noexcept override + { + return mModelDtype; + } + + bool getTF32() const noexcept + { + return mTF32; + } + + void setTF32(bool enabled) noexcept + { + mTF32 = enabled; + } + + const char* getModelFileName() const noexcept override + { + return mModelFilename.c_str(); + } + + void setModelFileName(const char* onnxFilename) noexcept override + { + mModelFilename = 
std::string(onnxFilename); + } + Verbosity getVerbosityLevel() const noexcept override + { + return mVerbosity; + } + void addVerbosity() noexcept override + { + ++mVerbosity; + } + void reduceVerbosity() noexcept override + { + --mVerbosity; + } + void setVerbosityLevel(Verbosity v) noexcept override + { + mVerbosity = v; + } + const char* getEngineFileName() const noexcept + { + return mEngineFilename.c_str(); + } + void setEngineFileName(const char* engineFilename) noexcept + { + mEngineFilename = std::string(engineFilename); + } + const char* getTextFileName() const noexcept override + { + return mTextFilename.c_str(); + } + void setTextFileName(const char* textFilename) noexcept override + { + mTextFilename = std::string(textFilename); + } + const char* getFullTextFileName() const noexcept override + { + return mFullTextFilename.c_str(); + } + void setFullTextFileName(const char* fullTextFilename) noexcept override + { + mFullTextFilename = std::string(fullTextFilename); + } + void setLabel(int64_t label) noexcept + { + mLabel = label; + } //!< set the Label + + int64_t getLabel() const noexcept + { + return mLabel; + } //!< get the Label + + bool getPrintLayerInfo() const noexcept override + { + return mPrintLayercInfo; + } + + void setPrintLayerInfo(bool b) noexcept override + { + mPrintLayercInfo = b; + } //!< get the boolean variable corresponding to the Layer Info, see getPrintLayerInfo() + + void setMaxBatchSize(int64_t maxBatchSize) noexcept + { + mMaxBatchSize = maxBatchSize; + } //!< set the Max Batch Size + int64_t getMaxBatchSize() const noexcept + { + return mMaxBatchSize; + } //!< get the Max Batch Size + + void setCalibBatchSize(int64_t CalibBatchSize) noexcept + { + mCalibBatchSize = CalibBatchSize; + } //!< set the calibration batch size + int64_t getCalibBatchSize() const noexcept + { + return mCalibBatchSize; + } //!< get calibration batch size + + void setMaxNCalibBatch(int64_t MaxNCalibBatch) noexcept + { + mMaxNCalibBatch = MaxNCalibBatch; 
+ } //!< set Max Number of Calibration Batches + int64_t getMaxNCalibBatch() const noexcept + { + return mMaxNCalibBatch; + } //!< get the Max Number of Calibration Batches + + void setFirstCalibBatch(int64_t FirstCalibBatch) noexcept + { + mFirstCalibBatch = FirstCalibBatch; + } //!< set the first calibration batch + int64_t getFirstCalibBatch() const noexcept + { + return mFirstCalibBatch; + } //!< get the first calibration batch + + void setUseDLACore(int64_t UseDLACore) noexcept + { + mUseDLACore = UseDLACore; + } //!< set the DLA core to use + int64_t getUseDLACore() const noexcept + { + return mUseDLACore; + } //!< get the DLA core to use + + void setDebugBuilder() noexcept + { + mDebugBuilder = true; + } //!< enable the Debug info, while building the engine. + bool getDebugBuilder() const noexcept + { + return mDebugBuilder; + } //!< get the boolean variable, corresponding to the debug builder + + const char* getImageFileName() const noexcept //!< set Image file name (PPM or ASCII) + { + return mImageFilename.c_str(); + } + void setImageFileName(const char* imageFilename) noexcept //!< get the Image file name + { + mImageFilename = std::string(imageFilename); + } + const char* getReferenceFileName() const noexcept + { + return mReferenceFilename.c_str(); + } + void setReferenceFileName(const char* referenceFilename) noexcept //!< set reference file name + { + mReferenceFilename = std::string(referenceFilename); + } + + void setInputDataFormat(InputDataFormat idt) noexcept + { + mInputDataFormat = idt; + } //!< specifies expected data format of the image file (PPM or ASCII) + InputDataFormat getInputDataFormat() const noexcept + { + return mInputDataFormat; + } //!< returns the expected data format of the image file. 
+ + const char* getOutputFileName() const noexcept //!< specifies the file to save the results + { + return mOutputFilename.c_str(); + } + void setOutputFileName(const char* outputFilename) noexcept //!< get the output file name + { + mOutputFilename = std::string(outputFilename); + } + + const char* getCalibrationFileName() const noexcept + { + return mCalibrationFilename.c_str(); + } //!< specifies the file containing the list of image files for int8 calibration + void setCalibrationFileName(const char* calibrationFilename) noexcept //!< get the int 8 calibration list file name + { + mCalibrationFilename = std::string(calibrationFilename); + } + + uint64_t getTopK() const noexcept + { + return mTopK; + } + void setTopK(uint64_t topK) noexcept + { + mTopK = topK; + } //!< If this options is specified, return the K top probabilities. + + float getFailurePercentage() const noexcept + { + return mFailurePercentage; + } + + void setFailurePercentage(float f) noexcept + { + mFailurePercentage = f; + } + + float getAbsoluteTolerance() const noexcept + { + return mAbsTolerance; + } + + void setAbsoluteTolerance(float a) noexcept + { + mAbsTolerance = a; + } + + float getTolerance() const noexcept + { + return mTolerance; + } + + void setTolerance(float t) noexcept + { + mTolerance = t; + } + + const char* getTimingCacheFilename() const noexcept + { + return mTimingCacheFilename.c_str(); + } + + void setTimingCacheFileName(const char* timingCacheFilename) noexcept + { + mTimingCacheFilename = std::string(timingCacheFilename); + } + + bool isDebug() const noexcept + { +#if ONNX_DEBUG + return (std::getenv("ONNX_DEBUG") ? 
true : false); +#else + return false; +#endif + } + + void destroy() noexcept override + { + delete this; + } + +}; // class SampleConfig + +#endif diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleDevice.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleDevice.h new file mode 100644 index 0000000..83cd53c --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleDevice.h @@ -0,0 +1,554 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_DEVICE_H +#define TRT_SAMPLE_DEVICE_H + +#include +#include +#include +#include +#include + +#include "sampleUtils.h" + +namespace sample +{ + +inline void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) +{ + if (ret != cudaSuccess) + { + err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl; + abort(); + } +} + +class TrtCudaEvent; + +namespace +{ + +void cudaSleep(void* sleep) +{ + std::this_thread::sleep_for(std::chrono::duration(*static_cast(sleep))); +} + +} // namespace + +//! +//! \class TrtCudaStream +//! \brief Managed CUDA stream +//! 
class TrtCudaStream
{
public:
    //! Creates the stream; any CUDA failure aborts via cudaCheck.
    TrtCudaStream()
    {
        cudaCheck(cudaStreamCreate(&mStream));
    }

    // Non-copyable and non-movable: the handle is owned for the object's whole lifetime.
    TrtCudaStream(const TrtCudaStream&) = delete;

    TrtCudaStream& operator=(const TrtCudaStream&) = delete;

    TrtCudaStream(TrtCudaStream&&) = delete;

    TrtCudaStream& operator=(TrtCudaStream&&) = delete;

    ~TrtCudaStream()
    {
        cudaCheck(cudaStreamDestroy(mStream));
    }

    //! \return the raw CUDA stream handle (non-owning).
    cudaStream_t get() const
    {
        return mStream;
    }

    //! Block the host until all work queued on this stream has completed.
    void synchronize()
    {
        cudaCheck(cudaStreamSynchronize(mStream));
    }

    //! Make this stream wait for \p event (defined after TrtCudaEvent below).
    void wait(TrtCudaEvent& event);

    //! Enqueue a host-side sleep of *ms milliseconds onto this stream (via cudaSleep).
    void sleep(float* ms)
    {
        cudaCheck(cudaLaunchHostFunc(mStream, cudaSleep, ms));
    }

private:
    cudaStream_t mStream{}; //!< owned CUDA stream handle
};

//!
//! \class TrtCudaEvent
//! \brief Managed CUDA event
//!
class TrtCudaEvent
{
public:
    //! \param blocking when true, synchronize() blocks the calling thread instead of spinning.
    explicit TrtCudaEvent(bool blocking = true)
    {
        const uint32_t flags = blocking ? cudaEventBlockingSync : cudaEventDefault;
        cudaCheck(cudaEventCreateWithFlags(&mEvent, flags));
    }

    // Non-copyable and non-movable: the handle is owned for the object's whole lifetime.
    TrtCudaEvent(const TrtCudaEvent&) = delete;

    TrtCudaEvent& operator=(const TrtCudaEvent&) = delete;

    TrtCudaEvent(TrtCudaEvent&&) = delete;

    TrtCudaEvent& operator=(TrtCudaEvent&&) = delete;

    ~TrtCudaEvent()
    {
        cudaCheck(cudaEventDestroy(mEvent));
    }

    //! \return the raw CUDA event handle (non-owning).
    cudaEvent_t get() const
    {
        return mEvent;
    }

    //! Record this event at the current position in \p stream.
    void record(const TrtCudaStream& stream)
    {
        cudaCheck(cudaEventRecord(mEvent, stream.get()));
    }

    //! Block the host until this event has occurred.
    void synchronize()
    {
        cudaCheck(cudaEventSynchronize(mEvent));
    }

    // Returns time elapsed time in milliseconds
    float operator-(const TrtCudaEvent& e) const
    {
        float time{0};
        cudaCheck(cudaEventElapsedTime(&time, e.get(), get()));
        return time;
    }

private:
    cudaEvent_t mEvent{}; //!< owned CUDA event handle
};

//! Out-of-line: defined here because it needs the complete TrtCudaEvent type.
inline void TrtCudaStream::wait(TrtCudaEvent& event)
{
    cudaCheck(cudaStreamWaitEvent(mStream, event.get(), 0));
}

//!
//! \class TrtCudaGraph
//! \brief Managed CUDA graph
//!
class TrtCudaGraph
{
public:
    explicit TrtCudaGraph() = default;

    // Non-copyable and non-movable: owns the graph/exec handles.
    TrtCudaGraph(const TrtCudaGraph&) = delete;

    TrtCudaGraph& operator=(const TrtCudaGraph&) = delete;

    TrtCudaGraph(TrtCudaGraph&&) = delete;

    TrtCudaGraph& operator=(TrtCudaGraph&&) = delete;

    ~TrtCudaGraph()
    {
        if (mGraphExec)
        {
            cudaGraphExecDestroy(mGraphExec);
        }
    }

    //! Start capturing work submitted to \p stream into a graph.
    void beginCapture(TrtCudaStream& stream)
    {
        cudaCheck(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
    }

    //! Launch the instantiated graph; \return false if the launch failed.
    bool launch(TrtCudaStream& stream)
    {
        return cudaGraphLaunch(mGraphExec, stream.get()) == cudaSuccess;
    }

    //! Finish a successful capture: instantiate the executable graph and release the raw graph.
    void endCapture(TrtCudaStream& stream)
    {
        cudaCheck(cudaStreamEndCapture(stream.get(), &mGraph));
        cudaCheck(cudaGraphInstantiate(&mGraphExec, mGraph, nullptr, nullptr, 0));
        cudaCheck(cudaGraphDestroy(mGraph));
    }

    //! Finish a capture that failed; validates the error state and clears it.
    void endCaptureOnError(TrtCudaStream& stream)
    {
        // There are two possibilities why stream capture would fail:
        // (1) stream is in cudaErrorStreamCaptureInvalidated state.
        // (2) TRT reports a failure.
        // In case (1), the returning mGraph should be nullptr.
        // In case (2), the returning mGraph is not nullptr, but it should not be used.
        const auto ret = cudaStreamEndCapture(stream.get(), &mGraph);
        if (ret == cudaErrorStreamCaptureInvalidated)
        {
            assert(mGraph == nullptr);
        }
        else
        {
            assert(ret == cudaSuccess);
            assert(mGraph != nullptr);
            cudaCheck(cudaGraphDestroy(mGraph));
            mGraph = nullptr;
        }
        // Clean up any CUDA error.
        cudaGetLastError();
        sample::gLogWarning << "The CUDA graph capture on the stream has failed." << std::endl;
    }

private:
    cudaGraph_t mGraph{};         //!< captured graph; transient, destroyed once instantiated
    cudaGraphExec_t mGraphExec{}; //!< executable instantiation used by launch()
};

//!
//! \class TrtCudaBuffer
//! \brief Managed buffer for host and device
//!
+template +class TrtCudaBuffer +{ +public: + TrtCudaBuffer() = default; + + TrtCudaBuffer(const TrtCudaBuffer&) = delete; + + TrtCudaBuffer& operator=(const TrtCudaBuffer&) = delete; + + TrtCudaBuffer(TrtCudaBuffer&& rhs) + { + reset(rhs.mPtr); + rhs.mPtr = nullptr; + } + + TrtCudaBuffer& operator=(TrtCudaBuffer&& rhs) + { + if (this != &rhs) + { + reset(rhs.mPtr); + rhs.mPtr = nullptr; + } + return *this; + } + + ~TrtCudaBuffer() + { + reset(); + } + + TrtCudaBuffer(size_t size) + { + A()(&mPtr, size); + } + + void allocate(size_t size) + { + reset(); + A()(&mPtr, size); + } + + void reset(void* ptr = nullptr) + { + if (mPtr) + { + D()(mPtr); + } + mPtr = ptr; + } + + void* get() const + { + return mPtr; + } + +private: + void* mPtr{nullptr}; +}; + +struct DeviceAllocator +{ + void operator()(void** ptr, size_t size) + { + cudaCheck(cudaMalloc(ptr, size)); + } +}; + +struct DeviceDeallocator +{ + void operator()(void* ptr) + { + cudaCheck(cudaFree(ptr)); + } +}; + +struct ManagedAllocator +{ + void operator()(void** ptr, size_t size) + { + cudaCheck(cudaMallocManaged(ptr, size)); + } +}; + +struct HostAllocator +{ + void operator()(void** ptr, size_t size) + { + cudaCheck(cudaMallocHost(ptr, size)); + } +}; + +struct HostDeallocator +{ + void operator()(void* ptr) + { + cudaCheck(cudaFreeHost(ptr)); + } +}; + +using TrtDeviceBuffer = TrtCudaBuffer; +using TrtManagedBuffer = TrtCudaBuffer; + +using TrtHostBuffer = TrtCudaBuffer; + +//! +//! \class MirroredBuffer +//! \brief Coupled host and device buffers +//! +class IMirroredBuffer +{ +public: + //! + //! Allocate memory for the mirrored buffer give the size + //! of the allocation. + //! + virtual void allocate(size_t size) = 0; + + //! + //! Get the pointer to the device side buffer. + //! + //! \return pointer to device memory or nullptr if uninitialized. + //! + virtual void* getDeviceBuffer() const = 0; + + //! + //! Get the pointer to the host side buffer. + //! + //! 
\return pointer to host memory or nullptr if uninitialized. + //! + virtual void* getHostBuffer() const = 0; + + //! + //! Copy the memory from host to device. + //! + virtual void hostToDevice(TrtCudaStream& stream) = 0; + + //! + //! Copy the memory from device to host. + //! + virtual void deviceToHost(TrtCudaStream& stream) = 0; + + //! + //! Interface to get the size of the memory + //! + //! \return the size of memory allocated. + //! + virtual size_t getSize() const = 0; + + //! + //! Virtual destructor declaraion + //! + virtual ~IMirroredBuffer() = default; + +}; // class IMirroredBuffer + +//! +//! Class to have a separate memory buffer for discrete device and host allocations. +//! +class DiscreteMirroredBuffer : public IMirroredBuffer +{ +public: + void allocate(size_t size) override + { + mSize = size; + mHostBuffer.allocate(size); + mDeviceBuffer.allocate(size); + } + + void* getDeviceBuffer() const override + { + return mDeviceBuffer.get(); + } + + void* getHostBuffer() const override + { + return mHostBuffer.get(); + } + + void hostToDevice(TrtCudaStream& stream) override + { + cudaCheck(cudaMemcpyAsync(mDeviceBuffer.get(), mHostBuffer.get(), mSize, cudaMemcpyHostToDevice, stream.get())); + } + + void deviceToHost(TrtCudaStream& stream) override + { + cudaCheck(cudaMemcpyAsync(mHostBuffer.get(), mDeviceBuffer.get(), mSize, cudaMemcpyDeviceToHost, stream.get())); + } + + size_t getSize() const override + { + return mSize; + } + +private: + size_t mSize{0}; + TrtHostBuffer mHostBuffer; + TrtDeviceBuffer mDeviceBuffer; +}; // class DiscreteMirroredBuffer + +//! +//! Class to have a unified memory buffer for embedded devices. +//! 
+class UnifiedMirroredBuffer : public IMirroredBuffer +{ +public: + void allocate(size_t size) override + { + mSize = size; + mBuffer.allocate(size); + } + + void* getDeviceBuffer() const override + { + return mBuffer.get(); + } + + void* getHostBuffer() const override + { + return mBuffer.get(); + } + + void hostToDevice(TrtCudaStream& stream) override + { + // Does nothing since we are using unified memory. + } + + void deviceToHost(TrtCudaStream& stream) override + { + // Does nothing since we are using unified memory. + } + + size_t getSize() const override + { + return mSize; + } + +private: + size_t mSize{0}; + TrtManagedBuffer mBuffer; +}; // class UnifiedMirroredBuffer + +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is +//! not possible. +//! +class OutputAllocator : public nvinfer1::IOutputAllocator +{ +public: + OutputAllocator(IMirroredBuffer* buffer) + : mBuffer(buffer) + { + } + + void* reallocateOutput( + char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override + { + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. 
+ size = std::max(size, static_cast(1)); + if (size > mSize) + { + mBuffer->allocate(roundUp(size, alignment)); + mSize = size; + } + return mBuffer->getDeviceBuffer(); + } + + void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override {} + + IMirroredBuffer* getBuffer() + { + return mBuffer.get(); + } + + ~OutputAllocator() override {} + +private: + std::unique_ptr mBuffer; + uint64_t mSize{}; +}; + +inline void setCudaDevice(int device, std::ostream& os) +{ + cudaCheck(cudaSetDevice(device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + + // clang-format off + os << "=== Device Information ===" << std::endl; + os << "Selected Device: " << properties.name << std::endl; + os << "Compute Capability: " << properties.major << "." << properties.minor << std::endl; + os << "SMs: " << properties.multiProcessorCount << std::endl; + os << "Device Global Memory: " << (properties.totalGlobalMem >> 20) << " MiB" << std::endl; + os << "Shared Memory per SM: " << (properties.sharedMemPerMultiprocessor >> 10) << " KiB" << std::endl; + os << "Memory Bus Width: " << properties.memoryBusWidth << " bits" + << " (ECC " << (properties.ECCEnabled != 0 ? "enabled" : "disabled") << ")" << std::endl; + os << "Application Compute Clock Rate: " << properties.clockRate / 1000000.0F << " GHz" << std::endl; + os << "Application Memory Clock Rate: " << properties.memoryClockRate / 1000000.0F << " GHz" << std::endl; + os << std::endl; + os << "Note: The application clock rates do not reflect the actual clock rates that the GPU is " + << "currently running at." 
<< std::endl; + // clang-format on +} + +inline int32_t getCudaDriverVersion() +{ + int32_t version{-1}; + cudaCheck(cudaDriverGetVersion(&version)); + return version; +} + +inline int32_t getCudaRuntimeVersion() +{ + int32_t version{-1}; + cudaCheck(cudaRuntimeGetVersion(&version)); + return version; +} + +} // namespace sample + +#endif // TRT_SAMPLE_DEVICE_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.cpp new file mode 100644 index 0000000..eb8f918 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.cpp @@ -0,0 +1,1699 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "NvUffParser.h" + +#include "ErrorRecorder.h" +#include "common.h" +#include "half.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +struct CaffeBufferShutter +{ + ~CaffeBufferShutter() + { + shutdownCaffeParser(); + } +}; + +struct UffBufferShutter +{ + ~UffBufferShutter() + { + shutdownUffParser(); + } +}; + +std::map readScalesFromCalibrationCache(std::string const& calibrationFile) +{ + std::map tensorScales; + std::ifstream cache{calibrationFile}; + if (!cache.is_open()) + { + sample::gLogError << "[TRT] Can not open provided calibration cache file" << std::endl; + return tensorScales; + } + std::string line; + while (std::getline(cache, line)) + { + auto colonPos = line.find_last_of(':'); + if (colonPos != std::string::npos) + { + // Scales should be stored in calibration cache as 32-bit floating numbers encoded as 32-bit integers + int32_t scalesAsInt = std::stoi(line.substr(colonPos + 2, 8), nullptr, 16); + auto const tensorName = line.substr(0, colonPos); + tensorScales[tensorName] = *reinterpret_cast(&scalesAsInt); + } + } + cache.close(); + return tensorScales; +} +} // namespace + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::get() +{ + SMP_RETVAL_IF_FALSE( + !mIsSafe, "Safe mode is enabled, but trying to get standard engine!", nullptr, sample::gLogError); + + if (mEngine == nullptr) + { + SMP_RETVAL_IF_FALSE( + !mEngineBlob.empty(), "Engine blob is empty. 
Nothing to deserialize!", nullptr, sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + if (mLeanDLLPath.empty()) + { + mRuntime.reset(createRuntime()); + } + else + { + mParentRuntime.reset(createRuntime()); + ASSERT(mParentRuntime.get() != nullptr); + + mRuntime.reset(mParentRuntime->loadRuntime(mLeanDLLPath.c_str())); + } + ASSERT(mRuntime.get() != nullptr); + + if (mVersionCompatible) + { + // Application needs to opt into allowing deserialization of engines with embedded lean runtime. + mRuntime->setEngineHostCodeAllowed(true); + } + + if (!mTempdir.empty()) + { + mRuntime->setTemporaryDirectory(mTempdir.c_str()); + } + + mRuntime->setTempfileControlFlags(mTempfileControls); + + SMP_RETVAL_IF_FALSE(mRuntime != nullptr, "runtime creation failed", nullptr, sample::gLogError); + if (mDLACore != -1) + { + mRuntime->setDLACore(mDLACore); + } + mRuntime->setErrorRecorder(&gRecorder); + for (auto const& pluginPath : mDynamicPlugins) + { + mRuntime->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + mEngine.reset(mRuntime->deserializeCudaEngine(mEngineBlob.data(), mEngineBlob.size())); + SMP_RETVAL_IF_FALSE(mEngine != nullptr, "Engine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "Engine deserialized in " + << duration(deserializeEndTime - deserializeStartTime).count() << " sec." 
<< std::endl; + } + + return mEngine.get(); +} + +nvinfer1::ICudaEngine* LazilyDeserializedEngine::release() +{ + return mEngine.release(); +} + +nvinfer1::safe::ICudaEngine* LazilyDeserializedEngine::getSafe() +{ + SMP_RETVAL_IF_FALSE( + mIsSafe, "Safe mode is not enabled, but trying to get safe engine!", nullptr, sample::gLogError); + + ASSERT(sample::hasSafeRuntime()); + if (mSafeEngine == nullptr) + { + SMP_RETVAL_IF_FALSE( + !mEngineBlob.empty(), "Engine blob is empty. Nothing to deserialize!", nullptr, sample::gLogError); + + SMP_RETVAL_IF_FALSE( + mDLACore == -1, "Safe DLA engine built with kDLA_STANDALONE should not be deserialized in TRT!", nullptr, + sample::gLogError); + + using time_point = std::chrono::time_point; + using duration = std::chrono::duration; + time_point const deserializeStartTime{std::chrono::high_resolution_clock::now()}; + + std::unique_ptr safeRuntime{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; + SMP_RETVAL_IF_FALSE(safeRuntime != nullptr, "SafeRuntime creation failed", nullptr, sample::gLogError); + safeRuntime->setErrorRecorder(&gRecorder); + mSafeEngine.reset( + safeRuntime->deserializeCudaEngine(mEngineBlob.data(), mEngineBlob.size())); + SMP_RETVAL_IF_FALSE(mSafeEngine != nullptr, "SafeEngine deserialization failed", nullptr, sample::gLogError); + + time_point const deserializeEndTime{std::chrono::high_resolution_clock::now()}; + sample::gLogInfo << "SafeEngine deserialized in " + << duration(deserializeEndTime - deserializeStartTime).count() << " sec." 
<< std::endl; + } + + return mSafeEngine.get(); +} + +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, + std::vector const& outputFormats, std::string const& calibrationFile) +{ + auto const tensorScales = readScalesFromCalibrationCache(calibrationFile); + bool const broadcastInputFormats = broadcastIOFormats(inputFormats, network.getNbInputs()); + for (int32_t i = 0, n = network.getNbInputs(); i < n; ++i) + { + int32_t formatIdx = broadcastInputFormats ? 0 : i; + if (!inputFormats.empty() && inputFormats[formatIdx].first == DataType::kINT8) + { + auto* input = network.getInput(i); + auto const calibScale = tensorScales.at(input->getName()); + input->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } + bool const broadcastOutputFormats = broadcastIOFormats(outputFormats, network.getNbInputs()); + for (int32_t i = 0, n = network.getNbOutputs(); i < n; ++i) + { + int32_t formatIdx = broadcastOutputFormats ? 0 : i; + if (!outputFormats.empty() && outputFormats[formatIdx].first == DataType::kINT8) + { + auto* output = network.getOutput(i); + auto const calibScale = tensorScales.at(output->getName()); + output->setDynamicRange(-127 * calibScale, 127 * calibScale); + } + } +} + +//! +//! \brief Generate a network definition for a given model +//! +//! \param[in] model Model options for this network +//! \param[in,out] network Network storing the parsed results +//! \param[in,out] err Error stream +//! \param[out] vcPluginLibrariesUsed If not nullptr, will be populated with paths to VC plugin libraries required by +//! the parsed network. +//! +//! \return Parser The parser used to initialize the network and that holds the weights for the network, or an invalid +//! parser (the returned parser converts to false if tested) +//! +//! Constant input dimensions in the model must not be changed in the corresponding +//! network definition, because its correctness may rely on the constants. +//! +//! 
\see Parser::operator bool() +//! +Parser modelToNetwork(const ModelOptions& model, nvinfer1::INetworkDefinition& network, std::ostream& err, + std::vector* vcPluginLibrariesUsed) +{ + sample::gLogInfo << "Start parsing network model." << std::endl; + auto const tBegin = std::chrono::high_resolution_clock::now(); + + Parser parser; + std::string const& modelName = model.baseModel.model; + switch (model.baseModel.format) + { + case ModelFormat::kCAFFE: + { + using namespace nvcaffeparser1; + parser.caffeParser.reset(sampleCreateCaffeParser()); + CaffeBufferShutter bufferShutter; + auto const* const blobNameToTensor = parser.caffeParser->parse( + model.prototxt.c_str(), modelName.empty() ? nullptr : modelName.c_str(), network, DataType::kFLOAT); + if (!blobNameToTensor) + { + err << "Failed to parse caffe model or prototxt, tensors blob not found" << std::endl; + parser.caffeParser.reset(); + break; + } + + for (auto const& s : model.outputs) + { + if (blobNameToTensor->find(s.c_str()) == nullptr) + { + err << "Could not find output blob " << s << std::endl; + parser.caffeParser.reset(); + break; + } + network.markOutput(*blobNameToTensor->find(s.c_str())); + } + break; + } + case ModelFormat::kUFF: + { + using namespace nvuffparser; + parser.uffParser.reset(sampleCreateUffParser()); + UffBufferShutter bufferShutter; + for (auto const& s : model.uffInputs.inputs) + { + if (!parser.uffParser->registerInput( + s.first.c_str(), s.second, model.uffInputs.NHWC ? 
UffInputOrder::kNHWC : UffInputOrder::kNCHW)) + { + err << "Failed to register input " << s.first << std::endl; + parser.uffParser.reset(); + break; + } + } + + for (auto const& s : model.outputs) + { + if (!parser.uffParser->registerOutput(s.c_str())) + { + err << "Failed to register output " << s << std::endl; + parser.uffParser.reset(); + break; + } + } + + if (!parser.uffParser->parse(model.baseModel.model.c_str(), network)) + { + err << "Failed to parse uff file" << std::endl; + parser.uffParser.reset(); + break; + } + break; + } + case ModelFormat::kONNX: + { + using namespace nvonnxparser; + parser.onnxParser.reset(createONNXParser(network)); + if (!parser.onnxParser->parseFromFile( + model.baseModel.model.c_str(), static_cast(sample::gLogger.getReportableSeverity()))) + { + err << "Failed to parse onnx file" << std::endl; + parser.onnxParser.reset(); + } + if (vcPluginLibrariesUsed && parser.onnxParser.get()) + { + int64_t nbPluginLibs; + char const* const* pluginLibArray = parser.onnxParser->getUsedVCPluginLibraries(nbPluginLibs); + if (nbPluginLibs >= 0) + { + vcPluginLibrariesUsed->reserve(nbPluginLibs); + for (int64_t i = 0; i < nbPluginLibs; ++i) + { + sample::gLogInfo << "Using VC plugin library " << pluginLibArray[i] << std::endl; + vcPluginLibrariesUsed->emplace_back(std::string{pluginLibArray[i]}); + } + } + else + { + sample::gLogWarning << "Failure to query VC plugin libraries required by parsed ONNX network" + << std::endl; + } + } + break; + } + case ModelFormat::kANY: break; + } + + auto const tEnd = std::chrono::high_resolution_clock::now(); + float const parseTime = std::chrono::duration(tEnd - tBegin).count(); + + sample::gLogInfo << "Finished parsing network model. 
Parse time: " << parseTime << std::endl; + return parser; +} + +namespace +{ + +class RndInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 +{ +public: + RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + nvinfer1::INetworkDefinition const& network, std::ostream& err); + + ~RndInt8Calibrator() override + { + for (auto& elem : mInputDeviceBuffers) + { + cudaCheck(cudaFree(elem.second), mErr); + } + } + + bool getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept override; + + int32_t getBatchSize() const noexcept override + { + return 1; + } + + const void* readCalibrationCache(size_t& length) noexcept override; + + void writeCalibrationCache(void const*, size_t) noexcept override {} + +private: + int32_t mBatches{}; + int32_t mCurrentBatch{}; + std::string mCacheFile; + std::map mInputDeviceBuffers; + std::vector mCalibrationCache; + std::ostream& mErr; +}; + +RndInt8Calibrator::RndInt8Calibrator(int32_t batches, std::vector& elemCount, std::string const& cacheFile, + INetworkDefinition const& network, std::ostream& err) + : mBatches(batches) + , mCurrentBatch(0) + , mCacheFile(cacheFile) + , mErr(err) +{ + std::ifstream tryCache(cacheFile, std::ios::binary); + if (tryCache.good()) + { + return; + } + + std::default_random_engine generator; + std::uniform_real_distribution distribution(-1.0F, 1.0F); + auto gen = [&generator, &distribution]() { return distribution(generator); }; + + for (int32_t i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + std::vector rnd_data(elemCount[i]); + std::generate_n(rnd_data.begin(), elemCount[i], gen); + + void* data; + cudaCheck(cudaMalloc(&data, elemCount[i] * sizeof(float)), mErr); + cudaCheck(cudaMemcpy(data, rnd_data.data(), elemCount[i] * sizeof(float), cudaMemcpyHostToDevice), mErr); + + mInputDeviceBuffers.insert(std::make_pair(input->getName(), data)); + } +} + +bool RndInt8Calibrator::getBatch(void* bindings[], char const* 
names[], int32_t nbBindings) noexcept +{ + if (mCurrentBatch >= mBatches) + { + return false; + } + + for (int32_t i = 0; i < nbBindings; ++i) + { + bindings[i] = mInputDeviceBuffers[names[i]]; + } + + ++mCurrentBatch; + + return true; +} + +const void* RndInt8Calibrator::readCalibrationCache(size_t& length) noexcept +{ + mCalibrationCache.clear(); + std::ifstream input(mCacheFile, std::ios::binary); + input >> std::noskipws; + if (input.good()) + { + std::copy( + std::istream_iterator(input), std::istream_iterator(), std::back_inserter(mCalibrationCache)); + } + + length = mCalibrationCache.size(); + return !mCalibrationCache.empty() ? mCalibrationCache.data() : nullptr; +} + +bool setTensorDynamicRange(INetworkDefinition const& network, float inRange = 2.0F, float outRange = 4.0F) +{ + // Ensure that all layer inputs have a dynamic range. + for (int32_t l = 0; l < network.getNbLayers(); l++) + { + auto* layer = network.getLayer(l); + for (int32_t i = 0; i < layer->getNbInputs(); i++) + { + ITensor* input{layer->getInput(i)}; + // Optional inputs are nullptr here and are from RNN layers. + if (input && !input->dynamicRangeIsSet()) + { + // Concat should propagate dynamic range from outputs to inputs to avoid + // Re-quantization during the concatenation + auto dynRange = (layer->getType() == LayerType::kCONCATENATION) ? outRange : inRange; + if (!input->setDynamicRange(-dynRange, dynRange)) + { + return false; + } + } + } + for (int32_t o = 0; o < layer->getNbOutputs(); o++) + { + ITensor* output{layer->getOutput(o)}; + // Optional outputs are nullptr here and are from RNN layers. + if (output && !output->dynamicRangeIsSet()) + { + // Pooling must have the same input and output dynamic range. 
+ if (layer->getType() == LayerType::kPOOLING) + { + if (!output->setDynamicRange(-inRange, inRange)) + { + return false; + } + } + else + { + if (!output->setDynamicRange(-outRange, outRange)) + { + return false; + } + } + } + } + } + return true; +} + +void setLayerPrecisions(INetworkDefinition& network, LayerPrecisions const& layerPrecisions) +{ + bool const hasGlobalPrecision{layerPrecisions.find("*") != layerPrecisions.end()}; + auto const globalPrecision = hasGlobalPrecision ? layerPrecisions.at("*") : nvinfer1::DataType::kFLOAT; + bool hasLayerPrecisionSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + if (layerPrecisions.find(layer->getName()) != layerPrecisions.end()) + { + layer->setPrecision(layerPrecisions.at(layer->getName())); + } + else if (hasGlobalPrecision) + { + // We should not set the layer precision if its default precision is INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 + || layer->getPrecision() == nvinfer1::DataType::kBOOL) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because the " + << " default layer precision is INT32 or Bool." << std::endl; + continue; + } + // We should not set the constant layer precision if its weights are in INT32. + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + // We should not set the layer precision if the layer operates on a shape tensor. 
+ if (layer->getNbInputs() >= 1 && layer->getInput(0)->isShapeTensor()) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this layer " + << "operates on a shape tensor." << std::endl; + continue; + } + if (layer->getNbInputs() >= 1 && layer->getInput(0)->getType() == nvinfer1::DataType::kINT32 + && layer->getNbOutputs() >= 1 && layer->getOutput(0)->getType() == nvinfer1::DataType::kINT32) + { + hasLayerPrecisionSkipped = true; + sample::gLogVerbose << "Skipped setting precision for layer " << layerName << " because this " + << "layer has INT32 input and output." << std::endl; + continue; + } + // All heuristics passed. Set the layer precision. + layer->setPrecision(globalPrecision); + } + } + + if (hasLayerPrecisionSkipped) + { + sample::gLogInfo << "Skipped setting precisions for some layers. Check verbose logs for more details." + << std::endl; + } +} + +void setLayerOutputTypes(INetworkDefinition& network, LayerOutputTypes const& layerOutputTypes) +{ + bool const hasGlobalOutputType{layerOutputTypes.find("*") != layerOutputTypes.end()}; + auto const globalOutputType = hasGlobalOutputType ? layerOutputTypes.at("*").at(0) : nvinfer1::DataType::kFLOAT; + bool hasLayerOutputTypeSkipped{false}; + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + auto const nbOutputs = layer->getNbOutputs(); + if (layerOutputTypes.find(layer->getName()) != layerOutputTypes.end()) + { + auto const& outputTypes = layerOutputTypes.at(layer->getName()); + bool const isBroadcast = (outputTypes.size() == 1); + if (!isBroadcast && static_cast(outputTypes.size()) != nbOutputs) + { + sample::gLogError << "Layer " << layerName << " has " << nbOutputs << " outputs but " + << outputTypes.size() << " output types are given in --layerOutputTypes flag." 
+ << std::endl; + throw std::invalid_argument("Invalid --layerOutputTypes flag."); + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + layer->setOutputType(outputIdx, outputTypes.at(isBroadcast ? 0 : outputIdx)); + } + } + else if (hasGlobalOutputType) + { + // We should not set the layer output types if its default precision is INT32 or Bool. + if (layer->getPrecision() == nvinfer1::DataType::kINT32 + || layer->getPrecision() == nvinfer1::DataType::kBOOL) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because the " + << " default layer precision is INT32 or Bool." << std::endl; + continue; + } + // We should not set the constant layer output types if its weights are in INT32. + if (layer->getType() == nvinfer1::LayerType::kCONSTANT + && static_cast(layer)->getWeights().type == nvinfer1::DataType::kINT32) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output types for layer " << layerName << " because this " + << "constant layer has INT32 weights." << std::endl; + continue; + } + for (int32_t outputIdx = 0; outputIdx < nbOutputs; ++outputIdx) + { + // We should not set the output type if the output is a shape tensor. + if (layer->getOutput(0)->isShapeTensor()) + { + hasLayerOutputTypeSkipped = true; + sample::gLogVerbose << "Skipped setting output type for output " << outputIdx << " of layer " + << layerName << " because it is a shape tensor." << std::endl; + continue; + } + layer->setOutputType(outputIdx, globalOutputType); + } + } + } + + if (hasLayerOutputTypeSkipped) + { + sample::gLogInfo << "Skipped setting output types for some layers. Check verbose logs for more details." 
+ << std::endl; + } +} + +void setLayerDeviceTypes( + INetworkDefinition const& network, IBuilderConfig& config, LayerDeviceTypes const& layerDeviceTypes) +{ + for (int32_t layerIdx = 0; layerIdx < network.getNbLayers(); ++layerIdx) + { + auto* layer = network.getLayer(layerIdx); + auto const layerName = layer->getName(); + if (layerDeviceTypes.find(layerName) != layerDeviceTypes.end()) + { + DeviceType const deviceType = layerDeviceTypes.at(layerName); + config.setDeviceType(layer, deviceType); + } + } +} + +void setMemoryPoolLimits(IBuilderConfig& config, BuildOptions const& build) +{ + auto const roundToBytes = [](double const sizeInMB) { return static_cast(sizeInMB * (1 << 20)); }; + if (build.workspace >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, roundToBytes(build.workspace)); + } + if (build.dlaSRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_MANAGED_SRAM, roundToBytes(build.dlaSRAM)); + } + if (build.dlaLocalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_LOCAL_DRAM, roundToBytes(build.dlaLocalDRAM)); + } + if (build.dlaGlobalDRAM >= 0) + { + config.setMemoryPoolLimit(MemoryPoolType::kDLA_GLOBAL_DRAM, roundToBytes(build.dlaGlobalDRAM)); + } +} + +void setPreviewFeatures(IBuilderConfig& config, BuildOptions const& build) +{ + auto const setFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (build.previewFeatures.find(featVal) != build.previewFeatures.end()) + { + config.setPreviewFeature(feat, build.previewFeatures.at(featVal)); + } + }; + setFlag(PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805); + setFlag(PreviewFeature::kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805); + setFlag(PreviewFeature::kPROFILE_SHARING_0806); +} + +} // namespace + +bool setupNetworkAndConfig(BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, + INetworkDefinition& network, IBuilderConfig& config, std::unique_ptr& calibrator, + std::ostream& err, std::vector>& sparseWeights) +{ + 
IOptimizationProfile* profile{nullptr}; + if (build.maxBatch) + { + builder.setMaxBatchSize(build.maxBatch); + } + else + { + profile = builder.createOptimizationProfile(); + } + + bool hasDynamicShapes{false}; + + bool broadcastInputFormats = broadcastIOFormats(build.inputFormats, network.getNbInputs()); + + if (profile) + { + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& shape : build.shapes) + { + bool tensorNameFound{false}; + for (int32_t i = 0; i < network.getNbInputs(); ++i) + { + if (network.getInput(i)->getName() == shape.first) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << shape.first << "\" in the network " + << "inputs! Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + } + + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + // Set formats and data types of inputs + auto* input = network.getInput(i); + if (!build.inputFormats.empty()) + { + int inputFormatIndex = broadcastInputFormats ? 0 : i; + input->setType(build.inputFormats[inputFormatIndex].first); + input->setAllowedFormats(build.inputFormats[inputFormatIndex].second); + } + else + { + switch (input->getType()) + { + case DataType::kINT32: + case DataType::kBOOL: + case DataType::kHALF: + case DataType::kUINT8: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + // User did not specify a floating-point format. Default to kFLOAT. 
+ input->setType(DataType::kFLOAT); + break; + case DataType::kFP8: ASSERT(!"FP8 is not supported"); break; + } + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) + { + auto const dims = input->getDimensions(); + auto const isScalar = dims.nbDims == 0; + auto const isDynamicInput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || input->isShapeTensor(); + if (isDynamicInput) + { + hasDynamicShapes = true; + auto shape = build.shapes.find(input->getName()); + ShapeRange shapes{}; + + // If no shape is provided, set dynamic dimensions to 1. + if (shape == build.shapes.end()) + { + constexpr int DEFAULT_DIMENSION = 1; + std::vector staticDims; + if (input->isShapeTensor()) + { + if (isScalar) + { + staticDims.push_back(1); + } + else + { + staticDims.resize(dims.d[0]); + std::fill(staticDims.begin(), staticDims.end(), DEFAULT_DIMENSION); + } + } + else + { + staticDims.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, staticDims.begin(), + [&](int dimension) { return dimension > 0 ? dimension : DEFAULT_DIMENSION; }); + } + sample::gLogWarning << "Dynamic dimensions required for input: " << input->getName() + << ", but no shapes were provided. 
Automatically overriding shape to: " + << staticDims << std::endl; + std::fill(shapes.begin(), shapes.end(), staticDims); + } + else + { + shapes = shape->second; + } + + std::vector profileDims{}; + if (input->isShapeTensor()) + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), OptProfileSelector::kMIN, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), OptProfileSelector::kOPT, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE(profile->setShapeValues(input->getName(), OptProfileSelector::kMAX, + profileDims.data(), static_cast(profileDims.size())), + "Error in set shape values MAX", false, err); + } + else + { + profileDims = shapes[static_cast(OptProfileSelector::kMIN)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kMIN, toDims(profileDims)), + "Error in set dimensions to profile MIN", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kOPT)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kOPT, toDims(profileDims)), + "Error in set dimensions to profile OPT", false, err); + profileDims = shapes[static_cast(OptProfileSelector::kMAX)]; + SMP_RETVAL_IF_FALSE( + profile->setDimensions(input->getName(), OptProfileSelector::kMAX, toDims(profileDims)), + "Error in set dimensions to profile MAX", false, err); + } + } + } + } + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + auto* output = network.getOutput(i); + if (profile) + { + auto const dims = output->getDimensions(); + // A shape tensor output with known static dimensions may have dynamic shape values 
inside it. + auto const isDynamicOutput = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }) + || output->isShapeTensor(); + if (isDynamicOutput) + { + hasDynamicShapes = true; + } + } + } + + if (!hasDynamicShapes && !build.shapes.empty()) + { + sample::gLogError << "Static model does not take explicit shapes since the shape of inference tensors will be " + "determined by the model itself" + << std::endl; + return false; + } + + if (profile && hasDynamicShapes) + { + SMP_RETVAL_IF_FALSE(profile->isValid(), "Required optimization profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.addOptimizationProfile(profile) != -1, "Error in add optimization profile", false, err); + } + + bool broadcastOutputFormats = broadcastIOFormats(build.outputFormats, network.getNbOutputs(), false); + + for (uint32_t i = 0, n = network.getNbOutputs(); i < n; i++) + { + // Set formats and data types of outputs + auto* output = network.getOutput(i); + if (!build.outputFormats.empty()) + { + int outputFormatIndex = broadcastOutputFormats ? 
0 : i; + output->setType(build.outputFormats[outputFormatIndex].first); + output->setAllowedFormats(build.outputFormats[outputFormatIndex].second); + } + else + { + output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + } + + setMemoryPoolLimits(config, build); + + setPreviewFeatures(config, build); + + if (build.heuristic) + { + config.setFlag(BuilderFlag::kENABLE_TACTIC_HEURISTIC); + } + + config.setBuilderOptimizationLevel(build.builderOptimizationLevel); + + if (build.timingCacheMode == TimingCacheMode::kDISABLE) + { + config.setFlag(BuilderFlag::kDISABLE_TIMING_CACHE); + } + + if (!build.tf32) + { + config.clearFlag(BuilderFlag::kTF32); + } + + if (build.refittable) + { + config.setFlag(BuilderFlag::kREFIT); + } + + if (build.versionCompatible) + { + config.setFlag(BuilderFlag::kVERSION_COMPATIBLE); + } + + std::vector pluginPaths; + for (auto const& pluginPath : sys.setPluginsToSerialize) + { + sample::gLogVerbose << "Setting plugin to serialize: " << pluginPath << std::endl; + pluginPaths.push_back(pluginPath.c_str()); + } + if (!pluginPaths.empty()) + { + config.setPluginsToSerialize(pluginPaths.data(), pluginPaths.size()); + } + + if (build.excludeLeanRuntime) + { + config.setFlag(BuilderFlag::kEXCLUDE_LEAN_RUNTIME); + } + + if (build.sparsity != SparsityFlag::kDISABLE) + { + config.setFlag(BuilderFlag::kSPARSE_WEIGHTS); + if (build.sparsity == SparsityFlag::kFORCE) + { + sparsify(network, sparseWeights); + } + } + + config.setProfilingVerbosity(build.profilingVerbosity); + config.setMinTimingIterations(build.minTiming); + config.setAvgTimingIterations(build.avgTiming); + + if (build.fp16) + { + config.setFlag(BuilderFlag::kFP16); + } + + if (build.int8) + { + config.setFlag(BuilderFlag::kINT8); + } + + SMP_RETVAL_IF_FALSE(!(build.int8 && build.fp8), + "FP8 and INT8 precisions have been specified", false, err); + + if (build.fp8) + { + config.setFlag(BuilderFlag::kFP8); + } + + if (build.int8 && !build.fp16) + { + sample::gLogInfo + << 
"FP32 and INT8 precisions have been specified - more performance might be enabled by additionally " + "specifying --fp16 or --best" + << std::endl; + } + + auto isInt8 = [](const IOFormat& format) { return format.first == DataType::kINT8; }; + auto int8IO = std::count_if(build.inputFormats.begin(), build.inputFormats.end(), isInt8) + + std::count_if(build.outputFormats.begin(), build.outputFormats.end(), isInt8); + + auto hasQDQLayers = [](INetworkDefinition& network) { + // Determine if our network has QDQ layers. + auto const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; i++) + { + auto const& layer = network.getLayer(i); + if (layer->getType() == LayerType::kQUANTIZE || layer->getType() == LayerType::kDEQUANTIZE) + { + return true; + } + } + return false; + }; + + if (!hasQDQLayers(network) && (build.int8 || int8IO) && build.calibration.empty()) + { + // Explicitly set int8 scales if no calibrator is provided and if I/O tensors use int8, + // because auto calibration does not support this case. 
+ SMP_RETVAL_IF_FALSE(setTensorDynamicRange(network), "Error in set tensor dynamic range.", false, err); + } + else if (build.int8) + { + if (!hasQDQLayers(network) && int8IO) + { + try + { + // Set dynamic ranges of int8 inputs / outputs to match scales loaded from calibration cache + // TODO http://nvbugs/3262234 Change the network validation so that this workaround can be removed + setTensorScalesFromCalibration(network, build.inputFormats, build.outputFormats, build.calibration); + } + catch (std::exception&) + { + sample::gLogError + << "Int8IO was specified but impossible to read tensor scales from provided calibration cache file" + << std::endl; + return false; + } + } + IOptimizationProfile* profileCalib{nullptr}; + if (!build.shapesCalib.empty()) + { + profileCalib = builder.createOptimizationProfile(); + for (uint32_t i = 0, n = network.getNbInputs(); i < n; i++) + { + auto* input = network.getInput(i); + Dims profileDims{}; + auto shape = build.shapesCalib.find(input->getName()); + ShapeRange shapesCalib{}; + shapesCalib = shape->second; + + profileDims = toDims(shapesCalib[static_cast(OptProfileSelector::kOPT)]); + // Here we check only kMIN as all profileDims are the same. 
+ SMP_RETVAL_IF_FALSE( + profileCalib->setDimensions(input->getName(), OptProfileSelector::kMIN, profileDims), + "Error in set dimensions to calibration profile OPT", false, err); + profileCalib->setDimensions(input->getName(), OptProfileSelector::kOPT, profileDims); + profileCalib->setDimensions(input->getName(), OptProfileSelector::kMAX, profileDims); + } + SMP_RETVAL_IF_FALSE(profileCalib->isValid(), "Calibration profile is invalid", false, err); + SMP_RETVAL_IF_FALSE( + config.setCalibrationProfile(profileCalib), "Error in set calibration profile", false, err); + } + + std::vector elemCount{}; + for (int i = 0; i < network.getNbInputs(); i++) + { + auto* input = network.getInput(i); + auto const dims = input->getDimensions(); + auto const isDynamicInput + = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + + if (profileCalib) + { + elemCount.push_back(volume(profileCalib->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else if (profile && isDynamicInput) + { + elemCount.push_back(volume(profile->getDimensions(input->getName(), OptProfileSelector::kOPT))); + } + else + { + elemCount.push_back(volume(input->getDimensions())); + } + } + + calibrator.reset(new RndInt8Calibrator(1, elemCount, build.calibration, network, err)); + config.setInt8Calibrator(calibrator.get()); + } + + if (build.directIO) + { + config.setFlag(BuilderFlag::kDIRECT_IO); + } + + switch (build.precisionConstraints) + { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. 
+ break; + case PrecisionConstraints::kOBEY: + config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); + break; + case PrecisionConstraints::kPREFER: config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); break; + } + + if (!build.layerPrecisions.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { + setLayerPrecisions(network, build.layerPrecisions); + } + + if (!build.layerOutputTypes.empty() && build.precisionConstraints != PrecisionConstraints::kNONE) + { + setLayerOutputTypes(network, build.layerOutputTypes); + } + + if (!build.layerDeviceTypes.empty()) + { + setLayerDeviceTypes(network, config, build.layerDeviceTypes); + } + + if (build.safe) + { + config.setEngineCapability(sys.DLACore != -1 ? EngineCapability::kDLA_STANDALONE : EngineCapability::kSAFETY); + } + + if (build.restricted) + { + config.setFlag(BuilderFlag::kSAFETY_SCOPE); + } + + if (sys.DLACore != -1) + { + if (sys.DLACore < builder.getNbDLACores()) + { + config.setDefaultDeviceType(DeviceType::kDLA); + config.setDLACore(sys.DLACore); + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + + if (sys.fallback) + { + config.setFlag(BuilderFlag::kGPU_FALLBACK); + } + else + { + // Reformatting runs on GPU, so avoid I/O reformatting. + config.setFlag(BuilderFlag::kDIRECT_IO); + } + if (!build.int8) + { + config.setFlag(BuilderFlag::kFP16); + } + } + else + { + err << "Cannot create DLA engine, " << sys.DLACore << " not available" << std::endl; + return false; + } + } + + if (build.enabledTactics || build.disabledTactics) + { + TacticSources tacticSources = config.getTacticSources(); + tacticSources |= build.enabledTactics; + tacticSources &= ~build.disabledTactics; + config.setTacticSources(tacticSources); + } + + config.setHardwareCompatibilityLevel(build.hardwareCompatibilityLevel); + + if (build.maxAuxStreams != defaultMaxAuxStreams) + { + config.setMaxAuxStreams(build.maxAuxStreams); + } + + return true; +} + +//! +//! 
\brief Create a serialized engine for a network defintion +//! +//! \return Whether the engine creation succeeds or fails. +//! +bool networkToSerializedEngine( + BuildOptions const& build, SystemOptions const& sys, IBuilder& builder, BuildEnvironment& env, std::ostream& err) +{ + std::unique_ptr config{builder.createBuilderConfig()}; + std::unique_ptr calibrator; + std::vector> sparseWeights; + SMP_RETVAL_IF_FALSE(config != nullptr, "Config creation failed", false, err); + SMP_RETVAL_IF_FALSE( + setupNetworkAndConfig(build, sys, builder, *env.network, *config, calibrator, err, sparseWeights), + "Network And Config setup failed", false, err); + + std::unique_ptr timingCache{nullptr}; + // Try to load cache from file. Create a fresh cache if the file doesn't exist + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + std::vector loadedCache = samplesCommon::loadTimingCacheFile(build.timingCacheFile); + timingCache.reset(config->createTimingCache(static_cast(loadedCache.data()), loadedCache.size())); + SMP_RETVAL_IF_FALSE(timingCache != nullptr, "TimingCache creation failed", false, err); + config->setTimingCache(*timingCache, false); + } + + // CUDA stream used for profiling by the builder. 
+ auto profileStream = samplesCommon::makeCudaStream(); + SMP_RETVAL_IF_FALSE(profileStream != nullptr, "Cuda stream creation failed", false, err); + config->setProfileStream(*profileStream); + + std::unique_ptr serializedEngine{builder.buildSerializedNetwork(*env.network, *config)}; + SMP_RETVAL_IF_FALSE(serializedEngine != nullptr, "Engine could not be created from network", false, err); + + env.engine.setBlob(serializedEngine->data(), serializedEngine->size()); + + if (build.safe && build.consistency) + { + checkSafeEngine(serializedEngine->data(), serializedEngine->size()); + } + + if (build.timingCacheMode == TimingCacheMode::kGLOBAL) + { + auto timingCache = config->getTimingCache(); + samplesCommon::updateTimingCacheFile(build.timingCacheFile, timingCache); + } + + return true; +} + +//! +//! \brief Parse a given model, create a network and an engine. +//! +bool modelToBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) +{ + env.builder.reset(createBuilder()); + SMP_RETVAL_IF_FALSE(env.builder != nullptr, "Builder creation failed", false, err); + env.builder->setErrorRecorder(&gRecorder); + auto networkFlags + = (build.maxBatch) ? 0U : 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + + for (auto const& pluginPath : sys.dynamicPlugins) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + env.network.reset(env.builder->createNetworkV2(networkFlags)); + + std::vector vcPluginLibrariesUsed; + SMP_RETVAL_IF_FALSE(env.network != nullptr, "Network creation failed", false, err); + env.parser = modelToNetwork(model, *env.network, err, build.versionCompatible ? 
&vcPluginLibrariesUsed : nullptr); + SMP_RETVAL_IF_FALSE(env.parser.operator bool(), "Parsing model failed", false, err); + + if (build.versionCompatible && !sys.ignoreParsedPluginLibs && !vcPluginLibrariesUsed.empty()) + { + sample::gLogInfo << "The following plugin libraries were identified by the parser as required for a " + "version-compatible engine:" + << std::endl; + for (auto const& lib : vcPluginLibrariesUsed) + { + sample::gLogInfo << " " << lib << std::endl; + } + if (!build.excludeLeanRuntime) + { + sample::gLogInfo << "These libraries will be added to --setPluginsToSerialize since --excludeLeanRuntime " + "was not specified." + << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), + std::back_inserter(sys.setPluginsToSerialize)); + } + sample::gLogInfo << "These libraries will be added to --dynamicPlugins for use at inference time." << std::endl; + std::copy(vcPluginLibrariesUsed.begin(), vcPluginLibrariesUsed.end(), std::back_inserter(sys.dynamicPlugins)); + + // Implicitly-added plugins from ONNX parser should be loaded into plugin registry as well. + for (auto const& pluginPath : vcPluginLibrariesUsed) + { + env.builder->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + + sample::gLogInfo << "Use --ignoreParsedPluginLibs to disable this behavior." << std::endl; + } + + SMP_RETVAL_IF_FALSE( + networkToSerializedEngine(build, sys, *env.builder, env, err), "Building engine failed", false, err); + return true; +} + +namespace +{ +std::pair, std::vector> getLayerWeightsRolePair(IRefitter& refitter) +{ + // Get number of refittable items. + auto const nbAll = refitter.getAll(0, nullptr, nullptr); + std::vector layerNames(nbAll); + // Allocate buffers for the items and get them. 
+ std::vector weightsRoles(nbAll); + refitter.getAll(nbAll, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbAll); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { + if (name == nullptr) + { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} + +std::pair, std::vector> getMissingLayerWeightsRolePair(IRefitter& refitter) +{ + // Get number of refittable items. + auto const nbMissing = refitter.getMissing(0, nullptr, nullptr); + std::vector layerNames(nbMissing); + // Allocate buffers for the items and get them. + std::vector weightsRoles(nbMissing); + refitter.getMissing(nbMissing, layerNames.data(), weightsRoles.data()); + std::vector layerNameStrs(nbMissing); + std::transform(layerNames.begin(), layerNames.end(), layerNameStrs.begin(), [](char const* name) { + if (name == nullptr) + { + return std::string{}; + } + return std::string{name}; + }); + return {layerNameStrs, weightsRoles}; +} +} // namespace + +bool loadEngineToBuildEnv(std::string const& engine, bool enableConsistency, BuildEnvironment& env, std::ostream& err) +{ + std::ifstream engineFile(engine, std::ios::binary); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error opening engine file: " << engine); + engineFile.seekg(0, std::ifstream::end); + int64_t fsize = engineFile.tellg(); + engineFile.seekg(0, std::ifstream::beg); + + std::vector engineBlob(fsize); + engineFile.read(reinterpret_cast(engineBlob.data()), fsize); + SMP_RETVAL_IF_FALSE(engineFile.good(), "", false, err << "Error loading engine file: " << engine); + + if (enableConsistency) + { + checkSafeEngine(engineBlob.data(), fsize); + } + + env.engine.setBlob(engineBlob.data(), engineBlob.size()); + + return true; +} + +void dumpRefittable(nvinfer1::ICudaEngine& engine) +{ + std::unique_ptr refitter{createRefitter(engine)}; + if (refitter == nullptr) + { + sample::gLogError << "Failed to create a 
refitter." << std::endl; + return; + } + + auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); + auto const& layerNames = layerWeightsRolePair.first; + auto const& weightsRoles = layerWeightsRolePair.second; + auto const nbAll = layerWeightsRolePair.first.size(); + for (size_t i = 0; i < nbAll; ++i) + { + sample::gLogInfo << layerNames[i] << " " << weightsRoles[i] << std::endl; + } +} + +ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err) +{ + BuildEnvironment env(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults()); + return loadEngineToBuildEnv(engine, false, env, err) ? env.engine.release() : nullptr; +} + +bool saveEngine(const ICudaEngine& engine, std::string const& fileName, std::ostream& err) +{ + std::ofstream engineFile(fileName, std::ios::binary); + if (!engineFile) + { + err << "Cannot open engine file: " << fileName << std::endl; + return false; + } + + std::unique_ptr serializedEngine{engine.serialize()}; + if (serializedEngine == nullptr) + { + err << "Engine serialization failed" << std::endl; + return false; + } + + engineFile.write(static_cast(serializedEngine->data()), serializedEngine->size()); + return !engineFile.fail(); +} + +bool getEngineBuildEnv( + const ModelOptions& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err) +{ + bool createEngineSuccess{false}; + + if (build.load) + { + createEngineSuccess = loadEngineToBuildEnv(build.engine, build.safe && build.consistency, env, err); + } + else + { + createEngineSuccess = modelToBuildEnv(model, build, sys, env, err); + } + + SMP_RETVAL_IF_FALSE(createEngineSuccess, "Failed to create engine from model or file.", false, err); + + if (build.save) + { + std::ofstream engineFile(build.engine, std::ios::binary); + engineFile.write(reinterpret_cast(env.engine.getBlob().data()), env.engine.getBlob().size()); + SMP_RETVAL_IF_FALSE(!engineFile.fail(), "Saving engine 
to file failed.", false, err); + } + + return true; +} + +// There is not a getWeightsName API, so we need to use WeightsRole. +std::vector> getAllRefitWeightsForLayer(const ILayer& l) +{ + switch (l.getType()) + { + case LayerType::kCONSTANT: + { + auto const& layer = static_cast(l); + auto const weights = layer.getWeights(); + switch (weights.type) + { + case DataType::kFLOAT: + case DataType::kHALF: + case DataType::kINT8: + case DataType::kINT32: return {std::make_pair(WeightsRole::kCONSTANT, weights)}; + case DataType::kBOOL: + case DataType::kUINT8: + case DataType::kFP8: + // Refit not supported for these types. + break; + } + break; + } + case LayerType::kCONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kDECONVOLUTION: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kFULLY_CONNECTED: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kKERNEL, layer.getKernelWeights()), + std::make_pair(WeightsRole::kBIAS, layer.getBiasWeights())}; + } + case LayerType::kSCALE: + { + auto const& layer = static_cast(l); + return {std::make_pair(WeightsRole::kSCALE, layer.getScale()), + std::make_pair(WeightsRole::kSHIFT, layer.getShift())}; + } + case LayerType::kACTIVATION: + case LayerType::kASSERTION: + case LayerType::kCAST: + case LayerType::kCONCATENATION: + case LayerType::kCONDITION: + case LayerType::kCONDITIONAL_INPUT: + case LayerType::kCONDITIONAL_OUTPUT: + case LayerType::kDEQUANTIZE: + case LayerType::kEINSUM: + case LayerType::kELEMENTWISE: + case LayerType::kFILL: + case LayerType::kGATHER: + case LayerType::kGRID_SAMPLE: + case LayerType::kIDENTITY: + case LayerType::kITERATOR: + case LayerType::kLOOP_OUTPUT: + case 
LayerType::kLRN: + case LayerType::kMATRIX_MULTIPLY: + case LayerType::kNMS: + case LayerType::kNON_ZERO: + case LayerType::kNORMALIZATION: + case LayerType::kONE_HOT: + case LayerType::kPADDING: + case LayerType::kPARAMETRIC_RELU: + case LayerType::kPLUGIN: + case LayerType::kPLUGIN_V2: + case LayerType::kPOOLING: + case LayerType::kQUANTIZE: + case LayerType::kRAGGED_SOFTMAX: + case LayerType::kRECURRENCE: + case LayerType::kREDUCE: + case LayerType::kRESIZE: + case LayerType::kREVERSE_SEQUENCE: + case LayerType::kRNN_V2: + case LayerType::kSCATTER: + case LayerType::kSELECT: + case LayerType::kSHAPE: + case LayerType::kSHUFFLE: + case LayerType::kSLICE: + case LayerType::kSOFTMAX: + case LayerType::kTOPK: + case LayerType::kTRIP_LIMIT: + case LayerType::kUNARY: return {}; + } + return {}; +} + +bool timeRefit(INetworkDefinition const& network, nvinfer1::ICudaEngine& engine, bool multiThreading) +{ + using time_point = std::chrono::time_point; + using durationMs = std::chrono::duration; + + auto const nbLayers = network.getNbLayers(); + std::unique_ptr refitter{createRefitter(engine)}; + // Set max threads that can be used by refitter. + if (multiThreading && !refitter->setMaxThreads(10)) + { + sample::gLogError << "Failed to set max threads to refitter." << std::endl; + return false; + } + auto const& layerWeightsRolePair = getLayerWeightsRolePair(*refitter); + // We use std::string instead of char const* since we can have copies of layer names. 
+ std::set> layerRoleSet; + + auto const& layerNames = layerWeightsRolePair.first; + auto const& weightsRoles = layerWeightsRolePair.second; + + std::transform(layerNames.begin(), layerNames.end(), weightsRoles.begin(), + std::inserter(layerRoleSet, layerRoleSet.begin()), + [](std::string const& layerName, WeightsRole const role) { return std::make_pair(layerName, role); }); + + auto const isRefittable = [&layerRoleSet](char const* layerName, WeightsRole const role) { + return layerRoleSet.find(std::make_pair(layerName, role)) != layerRoleSet.end(); + }; + + auto const setWeights = [&] { + for (int32_t i = 0; i < nbLayers; i++) + { + auto const layer = network.getLayer(i); + auto const roleWeightsVec = getAllRefitWeightsForLayer(*layer); + for (auto const& roleWeights : roleWeightsVec) + { + if (isRefittable(layer->getName(), roleWeights.first)) + { + bool const success = refitter->setWeights(layer->getName(), roleWeights.first, roleWeights.second); + if (!success) + { + return false; + } + } + } + } + return true; + }; + + auto const reportMissingWeights = [&] { + auto const& missingPair = getMissingLayerWeightsRolePair(*refitter); + auto const& layerNames = missingPair.first; + auto const& weightsRoles = missingPair.second; + for (size_t i = 0; i < layerNames.size(); ++i) + { + sample::gLogError << "Missing (" << layerNames[i] << ", " << weightsRoles[i] << ") for refitting." 
+ << std::endl; + } + return layerNames.empty(); + }; + + // Warm up and report missing weights + bool const success = setWeights() && reportMissingWeights() && refitter->refitCudaEngine(); + if (!success) + { + return false; + } + + constexpr int32_t loop = 5; + time_point const refitStartTime{std::chrono::steady_clock::now()}; + { + for (int32_t l = 0; l < loop; l++) + { + bool const success = setWeights() && refitter->refitCudaEngine(); + if (!success) + { + return false; + } + } + } + time_point const refitEndTime{std::chrono::steady_clock::now()}; + + sample::gLogInfo << "Engine refitted" + << " in " << durationMs(refitEndTime - refitStartTime).count() / loop << " ms." << std::endl; + return true; +} + +namespace +{ +void* initSafeRuntime() +{ + void* handle{nullptr}; +#if !defined(_WIN32) + std::string const dllName{samplesCommon::isDebug() ? "libnvinfer_safe_debug.so.8" : "libnvinfer_safe.so.8"}; +#if SANITIZER_BUILD + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); +#else + handle = dlopen(dllName.c_str(), RTLD_LAZY); +#endif +#endif + return handle; +} + +void* initConsistencyCheckerLibrary() +{ + void* handle{nullptr}; +#if !defined(_WIN32) + std::string const dllName{samplesCommon::isDebug() ? 
"libnvinfer_checker_debug.so.8" : "libnvinfer_checker.so.8"}; +#if SANITIZER_BUILD + handle = dlopen(dllName.c_str(), RTLD_LAZY | RTLD_NODELETE); +#else + handle = dlopen(dllName.c_str(), RTLD_LAZY); +#endif +#endif + return handle; +} + +#if !defined(_WIN32) +struct DllDeleter +{ + void operator()(void* handle) + { + if (handle != nullptr) + { + dlclose(handle); + } + } +}; +const std::unique_ptr safeRuntimeLibrary{initSafeRuntime()}; +const std::unique_ptr consistencyCheckerLibrary{initConsistencyCheckerLibrary()}; +#endif +} // namespace + +bool hasSafeRuntime() +{ + bool ret{false}; +#if !defined(_WIN32) + ret = (safeRuntimeLibrary != nullptr); +#endif + return ret; +} + +nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept +{ + nvinfer1::safe::IRuntime* runtime{nullptr}; +#if !defined(_WIN32) + constexpr char symbolName[] = "_ZN8nvinfer14safe18createInferRuntimeERNS_7ILoggerE"; + typedef nvinfer1::safe::IRuntime* (*CreateInferRuntimeFn)(nvinfer1::ILogger & logger); + if (hasSafeRuntime()) + { + auto createFn = reinterpret_cast(dlsym(safeRuntimeLibrary.get(), symbolName)); + if (createFn != nullptr) + { + runtime = createFn(logger); + } + } +#endif + return runtime; +} + +bool hasConsistencyChecker() +{ + bool ret{false}; +#if !defined(_WIN32) + ret = (consistencyCheckerLibrary != nullptr); +#endif + return ret; +} + +nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( + nvinfer1::ILogger& logger, void const* serializedEngine, int32_t const engineSize) noexcept +{ + nvinfer1::consistency::IConsistencyChecker* checker{nullptr}; + + if (serializedEngine == nullptr || engineSize == 0) + { + return checker; + } + +#if !defined(_WIN32) + constexpr char symbolName[] = "createConsistencyChecker_INTERNAL"; + typedef nvinfer1::consistency::IConsistencyChecker* (*CreateCheckerFn)( + nvinfer1::ILogger * logger, void const* data, size_t size, uint32_t version); + if (hasSafeRuntime()) + { + auto createFn = 
reinterpret_cast(dlsym(consistencyCheckerLibrary.get(), symbolName)); + if (createFn != nullptr) + { + checker = createFn(&logger, serializedEngine, engineSize, NV_TENSORRT_VERSION); + } + } +#endif + return checker; +} + +bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize) +{ + if (!hasConsistencyChecker()) + { + sample::gLogError << "Cannot perform consistency check because the checker is not loaded.." << std::endl; + return false; + } + auto checker = std::unique_ptr( + createConsistencyChecker(sample::gLogger.getTRTLogger(), serializedEngine, engineSize)); + if (checker.get() == nullptr) + { + sample::gLogError << "Failed to create consistency checker." << std::endl; + return false; + } + sample::gLogInfo << "Start consistency checking." << std::endl; + if (!checker->validate()) + { + sample::gLogError << "Consistency validation failed." << std::endl; + return false; + } + sample::gLogInfo << "Consistency validation passed." << std::endl; + return true; +} +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.h new file mode 100644 index 0000000..6c0a88b --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEngines.h @@ -0,0 +1,314 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENGINES_H +#define TRT_SAMPLE_ENGINES_H + +#include +#include + +#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvInferConsistency.h" +#include "NvInferSafeRuntime.h" +#include "NvOnnxParser.h" +#include "NvUffParser.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +namespace sample +{ + +struct Parser +{ + std::unique_ptr caffeParser; + std::unique_ptr uffParser; + std::unique_ptr onnxParser; + + operator bool() const + { + return caffeParser || uffParser || onnxParser; + } +}; + +//! +//! \brief A helper class to hold a serialized engine (std or safe) and only deserialize it when being accessed. +//! +class LazilyDeserializedEngine +{ +public: + //! + //! \brief Delete default constructor to make sure isSafe and DLACore are always set. + //! + LazilyDeserializedEngine() = delete; + + //! + //! \brief Constructor of LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath) + : mIsSafe(isSafe) + , mVersionCompatible(versionCompatible) + , mDLACore(DLACore) + , mTempdir(tempdir) + , mTempfileControls(tempfileControls) + , mLeanDLLPath(leanDLLPath) + { + } + + //! + //! \brief Move from another LazilyDeserializedEngine. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine&& other) + { + mIsSafe = other.mIsSafe; + mVersionCompatible = other.mVersionCompatible; + mDLACore = other.mDLACore; + mEngineBlob = std::move(other.mEngineBlob); + mEngine = std::move(other.mEngine); + mSafeEngine = std::move(other.mSafeEngine); + mTempdir = std::move(other.mTempdir); + mTempfileControls = other.mTempfileControls; + mLeanDLLPath = std::move(other.mLeanDLLPath); + mDynamicPlugins = std::move(other.mDynamicPlugins); + } + + //! + //! 
\brief Delete copy constructor. + //! + LazilyDeserializedEngine(LazilyDeserializedEngine const& other) = delete; + + //! + //! \brief Get the pointer to the ICudaEngine. Triggers deserialization if not already done so. + //! + nvinfer1::ICudaEngine* get(); + + //! + //! \brief Get the pointer to the ICudaEngine and release the ownership. + //! + nvinfer1::ICudaEngine* release(); + + //! + //! \brief Get the pointer to the safe::ICudaEngine. Triggers deserialization if not already done so. + //! + nvinfer1::safe::ICudaEngine* getSafe(); + + //! + //! \brief Get the underlying blob storing serialized engine. + //! + std::vector const& getBlob() const + { + return mEngineBlob; + } + + //! + //! \brief Set the underlying blob storing serialized engine. + //! + void setBlob(void* data, size_t size) + { + mEngineBlob.resize(size); + std::memcpy(mEngineBlob.data(), data, size); + mEngine.reset(); + mSafeEngine.reset(); + } + + //! + //! \brief Release the underlying blob without deleting the deserialized engine. + //! + void releaseBlob() + { + mEngineBlob.clear(); + } + + //! + //! \brief Get if safe mode is enabled. + //! + bool isSafe() + { + return mIsSafe; + } + + void setDynamicPlugins(std::vector const& dynamicPlugins) + { + mDynamicPlugins = dynamicPlugins; + } + +private: + bool mIsSafe{false}; + bool mVersionCompatible{false}; + int32_t mDLACore{-1}; + std::vector mEngineBlob; + + std::string mTempdir{}; + nvinfer1::TempfileControlFlags mTempfileControls{getTempfileControlDefaults()}; + std::string mLeanDLLPath{}; + std::vector mDynamicPlugins; + + //! \name Owned TensorRT objects + //! Per TensorRT object lifetime requirements as outlined in the developer guide, + //! the runtime must remain live while any engines created by the runtime are live. + //! DO NOT ADJUST the declaration order here: runtime -> (engine|safeEngine). + //! Destruction occurs in reverse declaration order: (engine|safeEngine) -> runtime. + //!@{ + + //! 
The runtime used to track parent of mRuntime if one exists. + //! Needed to load mRuntime if lean.so is supplied through file system path. + std::unique_ptr mParentRuntime{}; + + //! The runtime that is used to deserialize the engine. + std::unique_ptr mRuntime{}; + + //! If mIsSafe is false, this points to the deserialized std engine + std::unique_ptr mEngine{}; + + //! If mIsSafe is true, this points to the deserialized safe engine + std::unique_ptr mSafeEngine{}; + + //!@} +}; + +struct BuildEnvironment +{ + BuildEnvironment() = delete; + BuildEnvironment(BuildEnvironment const& other) = delete; + BuildEnvironment(BuildEnvironment&& other) = delete; + BuildEnvironment(bool isSafe, bool versionCompatible, int32_t DLACore, std::string const& tempdir, + nvinfer1::TempfileControlFlags tempfileControls, std::string const& leanDLLPath = "") + : engine(isSafe, versionCompatible, DLACore, tempdir, tempfileControls, leanDLLPath) + { + } + + //! \name Owned TensorRT objects + //! Per TensorRT object lifetime requirements as outlined in the developer guide, + //! factory objects must remain live while the objects created by those factories + //! are live (with the exception of builder -> engine). + //! DO NOT ADJUST the declaration order here: builder -> network -> parser. + //! Destruction occurs in reverse declaration order: parser -> network -> builder. + //!@{ + + //! The builder used to build the engine. + std::unique_ptr builder; + + //! The network used by the builder. + std::unique_ptr network; + + //! The parser used to specify the network. + Parser parser; + + //! The engine. + LazilyDeserializedEngine engine; + //!@} +}; + +//! +//! \brief Set up network and config +//! +//! \return boolean Return true if network and config were successfully set +//! 
+bool setupNetworkAndConfig(const BuildOptions& build, const SystemOptions& sys, nvinfer1::IBuilder& builder, + nvinfer1::INetworkDefinition& network, nvinfer1::IBuilderConfig& config, std::ostream& err, + std::vector>& sparseWeights); + +//! +//! \brief Log refittable layers and weights of a refittable engine +//! +void dumpRefittable(nvinfer1::ICudaEngine& engine); + +//! +//! \brief Load a serialized engine +//! +//! \return Pointer to the engine loaded or nullptr if the operation failed +//! +nvinfer1::ICudaEngine* loadEngine(std::string const& engine, int32_t DLACore, std::ostream& err); + +//! +//! \brief Save an engine into a file +//! +//! \return boolean Return true if the engine was successfully saved +//! +bool saveEngine(nvinfer1::ICudaEngine const& engine, std::string const& fileName, std::ostream& err); + +//! +//! \brief Create an engine from model or serialized file, and optionally save engine +//! +//! \return Pointer to the engine created or nullptr if the creation failed +//! +bool getEngineBuildEnv( + ModelOptions const& model, BuildOptions const& build, SystemOptions& sys, BuildEnvironment& env, std::ostream& err); + +//! +//! \brief Create a serialized network +//! +//! \return Pointer to a host memory for a serialized network +//! +nvinfer1::IHostMemory* networkToSerialized(const BuildOptions& build, const SystemOptions& sys, + nvinfer1::IBuilder& builder, nvinfer1::INetworkDefinition& network, std::ostream& err); + +//! +//! \brief Tranfer model to a serialized network +//! +//! \return Pointer to a host memory for a serialized network +//! +nvinfer1::IHostMemory* modelToSerialized( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); + +//! +//! \brief Serialize network and save it into a file +//! +//! \return boolean Return true if the network was successfully serialized and saved +//! 
+bool serializeAndSave( + const ModelOptions& model, const BuildOptions& build, const SystemOptions& sys, std::ostream& err); + +bool timeRefit(const nvinfer1::INetworkDefinition& network, nvinfer1::ICudaEngine& engine, bool multiThreading); + +//! +//! \brief Set tensor scales from a calibration table +//! +void setTensorScalesFromCalibration(nvinfer1::INetworkDefinition& network, std::vector const& inputFormats, + std::vector const& outputFormats, std::string const& calibrationFile); + +//! +//! \brief Check if safe runtime is loaded. +//! +bool hasSafeRuntime(); + +//! +//! \brief Create a safe runtime object if the dynamic library is loaded. +//! +nvinfer1::safe::IRuntime* createSafeInferRuntime(nvinfer1::ILogger& logger) noexcept; + +//! +//! \brief Check if consistency checker is loaded. +//! +bool hasConsistencyChecker(); + +//! +//! \brief Create a consistency checker object if the dynamic library is loaded. +//! +nvinfer1::consistency::IConsistencyChecker* createConsistencyChecker( + nvinfer1::ILogger& logger, nvinfer1::IHostMemory const* engine) noexcept; + +//! +//! \brief Run consistency check on serialized engine. +//! +bool checkSafeEngine(void const* serializedEngine, int32_t const engineSize); + +bool loadEngineToBuildEnv(std::string const& engine, bool enableConsistency, BuildEnvironment& env, std::ostream& err); +} // namespace sample + +#endif // TRT_SAMPLE_ENGINES_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleEntrypoints.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEntrypoints.h new file mode 100644 index 0000000..9480697 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleEntrypoints.h @@ -0,0 +1,141 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_ENTRYPOINTS_H +#define TRT_SAMPLE_ENTRYPOINTS_H + +//! \file sampleEntrypoints.h +//! +//! Declares and conditionally defines entrypoints needed to create base TensorRT objects, depending +//! on whether the given sample uses TRT at link time or dynamically. Since common code is built once +//! and shared across all samples (both link-time and dynamic TRT), it does not define these entrypoints, +//! so each sample must define them individually. +//! +//! Samples that use TRT at link time can define DEFINE_TRT_ENTRYPOINTS before including this header to +//! pick up the definitions here. 
+ +#include "NvCaffeParser.h" +#include "NvInfer.h" +#include "NvOnnxParser.h" +#include "NvUffParser.h" +#include "logger.h" + +extern nvinfer1::IBuilder* createBuilder(); +extern nvinfer1::IRuntime* createRuntime(); +extern nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine); + +extern nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network); + +extern nvcaffeparser1::ICaffeParser* sampleCreateCaffeParser(); +extern void shutdownCaffeParser(); + +extern nvuffparser::IUffParser* sampleCreateUffParser(); +extern void shutdownUffParser(); + +#if !defined(DEFINE_TRT_ENTRYPOINTS) +#define DEFINE_TRT_ENTRYPOINTS 0 +#endif + +// Allow opting out of individual entrypoints that are unused by the sample +#if !defined(DEFINE_TRT_BUILDER_ENTRYPOINT) +#define DEFINE_TRT_BUILDER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_RUNTIME_ENTRYPOINT) +#define DEFINE_TRT_RUNTIME_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_REFITTER_ENTRYPOINT) +#define DEFINE_TRT_REFITTER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_ONNX_PARSER_ENTRYPOINT) +#define DEFINE_TRT_ONNX_PARSER_ENTRYPOINT 1 +#endif +#if !defined(DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT) +#define DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT 1 +#endif + +#if DEFINE_TRT_ENTRYPOINTS +nvinfer1::IBuilder* createBuilder() +{ +#if DEFINE_TRT_BUILDER_ENTRYPOINT + return nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRuntime* createRuntime() +{ +#if DEFINE_TRT_RUNTIME_ENTRYPOINT + return nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvinfer1::IRefitter* createRefitter(nvinfer1::ICudaEngine& engine) +{ +#if DEFINE_TRT_REFITTER_ENTRYPOINT + return nvinfer1::createInferRefitter(engine, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvonnxparser::IParser* createONNXParser(nvinfer1::INetworkDefinition& network) +{ +#if DEFINE_TRT_ONNX_PARSER_ENTRYPOINT + return 
nvonnxparser::createParser(network, sample::gLogger.getTRTLogger()); +#else + return {}; +#endif +} + +nvcaffeparser1::ICaffeParser* sampleCreateCaffeParser() +{ +#if DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT + return nvcaffeparser1::createCaffeParser(); +#else + return {}; +#endif +} + +void shutdownCaffeParser() +{ +#if DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT + nvcaffeparser1::shutdownProtobufLibrary(); +#endif +} + +nvuffparser::IUffParser* sampleCreateUffParser() +{ +#if DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT + return nvuffparser::createUffParser(); +#else + return {}; +#endif +} + +void shutdownUffParser() +{ +#if DEFINE_TRT_LEGACY_PARSER_ENTRYPOINT + nvuffparser::shutdownProtobufLibrary(); +#endif +} + +#endif // DEFINE_TRT_ENTRYPOINTS + +#endif // TRT_SAMPLE_ENTRYPOINTS_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.cpp new file mode 100644 index 0000000..d2364b1 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.cpp @@ -0,0 +1,1639 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__QNX__) +#include +#include +#endif + +#include "NvInfer.h" + +#include "ErrorRecorder.h" +#include "logger.h" +#include "sampleDevice.h" +#include "sampleEngines.h" +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" +#include "sampleUtils.h" +using namespace nvinfer1; +namespace sample +{ + +template +bool validateTensorNames( + MapType const& map, EngineType const* engine, int32_t const endBindingIndex) +{ + // Check if the provided input tensor names match the input tensors of the engine. + // Throw an error if the provided input tensor names cannot be found because it implies a potential typo. + for (auto const& item : map) + { + bool tensorNameFound{false}; + for (int32_t b = 0; b < endBindingIndex; ++b) + { + if (engine->bindingIsInput(b) && engine->getBindingName(b) == item.first) + { + tensorNameFound = true; + break; + } + } + if (!tensorNameFound) + { + sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! " + << "Please make sure the input tensor names are correct." << std::endl; + return false; + } + } + return true; +} + +template +class FillBindingClosure +{ +private: + using InputsMap = std::unordered_map; + using BindingsVector = std::vector>; + + EngineType const* engine; + ContextType const* context; + InputsMap const& inputs; + BindingsVector& bindings; + int32_t batch; + int32_t endBindingIndex; + + void fillOneBinding(TensorInfo const& tensorInfo) + { + auto const name = tensorInfo.name; + auto const* bindingInOutStr = tensorInfo.isInput ? 
"input" : "output"; + for (auto& binding : bindings) + { + auto const input = inputs.find(name); + if (tensorInfo.isInput && input != inputs.end()) + { + sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl; + binding->addBinding(tensorInfo, input->second); + } + else + { + sample::gLogInfo << "Using random values for " << bindingInOutStr << " " << name << std::endl; + binding->addBinding(tensorInfo); + } + sample::gLogInfo << "Created " << bindingInOutStr << " binding for " << name << " with dimensions " + << tensorInfo.dims << std::endl; + } + } + + bool fillAllBindings(int32_t batch, int32_t endBindingIndex) + { + if (!validateTensorNames(inputs, engine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --loadInputs flag." << std::endl; + return false; + } + for (int32_t b = 0; b < endBindingIndex; b++) + { + TensorInfo tensorInfo; + tensorInfo.bindingIndex = b; + getTensorInfo(tensorInfo); + tensorInfo.updateVolume(batch); + fillOneBinding(tensorInfo); + } + return true; + } + + void getTensorInfo(TensorInfo& tensorInfo); + +public: + FillBindingClosure(EngineType const* _engine, ContextType const* _context, InputsMap const& _inputs, + BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex) + : engine(_engine) + , context(_context) + , inputs(_inputs) + , bindings(_bindings) + , batch(_batch) + , endBindingIndex(_endBindingIndex) + { + } + + bool operator()() + { + return fillAllBindings(batch, endBindingIndex); + } +}; + +template <> +void FillBindingClosure::getTensorInfo(TensorInfo& tensorInfo) +{ + auto const b = tensorInfo.bindingIndex; + auto const name = engine->getBindingName(b); + tensorInfo.name = name; + if (engine->hasImplicitBatchDimension()) + { + tensorInfo.dims = context->getBindingDimensions(b); + tensorInfo.comps = engine->getBindingComponentsPerElement(b); + tensorInfo.strides = context->getStrides(b); + tensorInfo.vectorDimIndex = 
engine->getBindingVectorizedDim(b); + tensorInfo.isInput = engine->bindingIsInput(b); + tensorInfo.dataType = engine->getBindingDataType(b); + } + else + { + // Use enqueueV3. + tensorInfo.dims = context->getTensorShape(name); + tensorInfo.isDynamic = std::any_of( + tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; }); + tensorInfo.comps = engine->getTensorComponentsPerElement(name); + tensorInfo.strides = context->getTensorStrides(name); + tensorInfo.vectorDimIndex = engine->getTensorVectorizedDim(name); + tensorInfo.isInput = engine->getTensorIOMode(name) == TensorIOMode::kINPUT; + tensorInfo.dataType = engine->getTensorDataType(name); + } +} + +template <> +void FillBindingClosure::getTensorInfo( + TensorInfo& tensorInfo) +{ + // Use enqueueV3 for safe engine/context + auto const b = tensorInfo.bindingIndex; + auto const name = engine->getIOTensorName(b); + tensorInfo.name = name; + tensorInfo.dims = engine->getTensorShape(name); + tensorInfo.isDynamic = false; + tensorInfo.comps = engine->getTensorComponentsPerElement(name); + tensorInfo.strides = context->getTensorStrides(name); + tensorInfo.vectorDimIndex = engine->getTensorVectorizedDim(name); + tensorInfo.isInput = engine->getTensorIOMode(name) == TensorIOMode::kINPUT; + tensorInfo.dataType = engine->getTensorDataType(name); +} + +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system) +{ + int32_t device{}; + cudaCheck(cudaGetDevice(&device)); + + cudaDeviceProp properties; + cudaCheck(cudaGetDeviceProperties(&properties, device)); + // Use managed memory on integrated devices when transfers are skipped + // and when it is explicitly requested on the commandline. 
+ bool useManagedMemory{(inference.skipTransfers && properties.integrated) || inference.useManaged}; + using FillSafeBindings = FillBindingClosure; + if (iEnv.safe) + { + ASSERT(sample::hasSafeRuntime()); + + auto* safeEngine = iEnv.engine.getSafe(); + SMP_RETVAL_IF_FALSE(safeEngine != nullptr, "Got invalid safeEngine!", false, sample::gLogError); + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + for (int32_t s = 0; s < inference.infStreams; ++s) + { + auto ec = safeEngine->createExecutionContext(); + if (ec == nullptr) + { + sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl; + return false; + } + iEnv.safeContexts.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + int32_t const nbBindings = safeEngine->getNbBindings(); + auto const* safeContext = iEnv.safeContexts.front().get(); + // batch is set to 1 because safety only support explicit batch. + return FillSafeBindings(safeEngine, safeContext, inference.inputs, iEnv.bindings, 1, nbBindings)(); + } + + using FillStdBindings = FillBindingClosure; + + auto* engine = iEnv.engine.get(); + SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError); + + bool const hasDLA = system.DLACore >= 0; + if (engine->hasImplicitBatchDimension() && hasDLA && inference.batch != engine->getMaxBatchSize()) + { + sample::gLogError << "When using DLA with an implicit batch engine, the inference batch size must be the same " + "as the engine's maximum batch size. Please specify the batch size by adding: '--batch=" + << engine->getMaxBatchSize() << "' to your command." << std::endl; + return false; + } + + // Release serialized blob to save memory space. + iEnv.engine.releaseBlob(); + + for (int32_t s = 0; s < inference.infStreams; ++s) + { + auto ec = engine->createExecutionContext(); + if (ec == nullptr) + { + sample::gLogError << "Unable to create execution context for stream " << s << "." 
<< std::endl; + return false; + } + ec->setNvtxVerbosity(inference.nvtxVerbosity); + + int32_t const persistentCacheLimit + = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio; + sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl; + ec->setPersistentCacheLimit(persistentCacheLimit); + + iEnv.contexts.emplace_back(ec); + iEnv.bindings.emplace_back(new Bindings(useManagedMemory)); + } + if (iEnv.profiler) + { + iEnv.contexts.front()->setProfiler(iEnv.profiler.get()); + // Always run reportToProfiler() after enqueue launch + iEnv.contexts.front()->setEnqueueEmitsProfile(false); + } + + int32_t const nbOptProfiles = engine->getNbOptimizationProfiles(); + int32_t const endBindingIndex = engine->getNbIOTensors(); + + if (nbOptProfiles > 1) + { + sample::gLogWarning << "Multiple profiles are currently not supported. Running with one profile." << std::endl; + } + + // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings + // to avoid silent typos. + if (!validateTensorNames(inference.shapes, engine, endBindingIndex)) + { + sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl; + return false; + } + + // Set all input dimensions before all bindings can be allocated + bool const useEnqueueV3 = !engine->hasImplicitBatchDimension(); + if (useEnqueueV3) + { + sample::gLogVerbose << "Using enqueueV3." 
<< std::endl; + } + for (int32_t b = 0; b < endBindingIndex; ++b) + { + auto const& name = engine->getIOTensorName(b); + auto const& mode = engine->getTensorIOMode(name); + if (mode == TensorIOMode::kINPUT) + { + Dims const dims = iEnv.contexts.front()->getTensorShape(name); + bool isShapeInferenceIO{false}; + if (useEnqueueV3) + { + isShapeInferenceIO = engine->isShapeInferenceIO(name); + } + else + { + isShapeInferenceIO = engine->isShapeBinding(b); + } + bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; }); + auto const shape = inference.shapes.find(name); + if (hasRuntimeDim || isShapeInferenceIO) + { + // Set shapeData to either dimensions of the input (if it has a dynamic shape) + // or set to values of the input (if it is an input shape tensor). + std::vector shapeData; + + if (shape == inference.shapes.end()) + { + // No information provided. Use default value for missing data. + constexpr int32_t kDEFAULT_VALUE = 1; + if (isShapeInferenceIO) + { + // Set shape tensor to all ones. + shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE); + sample::gLogWarning << "Values missing for input shape tensor: " << engine->getBindingName(b) + << "Automatically setting values to: " << shapeData << std::endl; + } + else + { + // Use default value for unspecified runtime dimensions. + shapeData.resize(dims.nbDims); + std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(), + [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; }); + sample::gLogWarning + << "Shape missing for input with dynamic shape: " << engine->getBindingName(b) + << "Automatically setting shape to: " << shapeData << std::endl; + } + } + else if (inference.inputs.count(shape->first) && isShapeInferenceIO) + { + // Load shape tensor from file. 
+ int64_t const size = volume(dims, 0, dims.nbDims); + shapeData.resize(size); + auto const& filename = inference.inputs.at(shape->first); + auto dst = reinterpret_cast(shapeData.data()); + loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type)); + } + else + { + shapeData = shape->second; + } + + int32_t* shapeTensorData{nullptr}; + if (isShapeInferenceIO) + { + // Save the data in iEnv, in a way that it's address does not change + // before enqueueV2 or enqueueV3 is called. + iEnv.inputShapeTensorValues.emplace_back(shapeData); + shapeTensorData = iEnv.inputShapeTensorValues.back().data(); + } + + for (auto& c : iEnv.contexts) + { + if (useEnqueueV3) + { + if (isShapeInferenceIO) + { + if (!c->setTensorAddress(name, shapeTensorData)) + { + return false; + } + } + else + { + if (!c->setInputShape(name, toDims(shapeData))) + { + return false; + } + } + } + else + { + if (isShapeInferenceIO) + { + if (!c->setInputShapeBinding(b, shapeTensorData)) + { + return false; + } + } + else + { + if (!c->setBindingDimensions(b, toDims(shapeData))) + { + return false; + } + } + } + } + } + else if (nbOptProfiles && shape != inference.shapes.end()) + { + // Check if the provided shape matches the static dimensions in the engine. + for (auto& c : iEnv.contexts) + { + if (!c->setInputShape(name, toDims(shape->second))) + { + return false; + } + } + } + } + } + + auto const* context = iEnv.contexts.front().get(); + int32_t const batch = engine->hasImplicitBatchDimension() ? 
inference.batch : 1; + return FillStdBindings(engine, context, inference.inputs, iEnv.bindings, batch, endBindingIndex)(); +} + +TaskInferenceEnvironment::TaskInferenceEnvironment( + std::string engineFile, InferenceOptions inference, int32_t deviceId, int32_t DLACore, int32_t bs) + : iOptions(inference) + , device(deviceId) + , batch(bs) +{ + BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults()); + loadEngineToBuildEnv(engineFile, false, bEnv, sample::gLogError); + std::unique_ptr tmp(new InferenceEnvironment(bEnv)); + iEnv = std::move(tmp); + + cudaCheck(cudaSetDevice(device)); + SystemOptions system{}; + system.device = device; + system.DLACore = DLACore; + if (!setUpInference(*iEnv, iOptions, system)) + { + sample::gLogError << "Inference set up failed" << std::endl; + } +} +namespace +{ + +#if defined(__QNX__) +using TimePoint = double; +#else +using TimePoint = std::chrono::time_point; +#endif + +TimePoint getCurrentTime() +{ +#if defined(__QNX__) + uint64_t const currentCycles = ClockCycles(); + uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec; + // Return current timestamp in ms. + return static_cast(currentCycles) * 1000. / cyclesPerSecond; +#else + return std::chrono::high_resolution_clock::now(); +#endif +} + +//! +//! \struct SyncStruct +//! \brief Threads synchronization structure +//! +struct SyncStruct +{ + std::mutex mutex; + TrtCudaStream mainStream; + TrtCudaEvent gpuStart{cudaEventBlockingSync}; + TimePoint cpuStart{}; + float sleep{}; +}; + +struct Enqueue +{ + explicit Enqueue(nvinfer1::IExecutionContext& context) + : mContext(context) + { + } + + nvinfer1::IExecutionContext& mContext; +}; + +//! +//! \class EnqueueImplicit +//! \brief Functor to enqueue inference with implicit batch +//! 
+class EnqueueImplicit : private Enqueue +{ + +public: + explicit EnqueueImplicit(nvinfer1::IExecutionContext& context, void** buffers, int32_t batch) + : Enqueue(context) + , mBuffers(buffers) + , mBatch(batch) + { + } + + bool operator()(TrtCudaStream& stream) const + { + if (mContext.enqueue(mBatch, mBuffers, stream.get(), nullptr)) + { + // Collecting layer timing info from current profile index of execution context + if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) + { + gLogWarning << "Failed to collect layer timing info from previous enqueue()" << std::endl; + } + return true; + } + return false; + } + +private: + void** mBuffers{}; + int32_t mBatch{}; +}; + +//! +//! \class EnqueueExplicit +//! \brief Functor to enqueue inference with explict batch +//! +class EnqueueExplicit : private Enqueue +{ + +public: + explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, Bindings const& bindings) + : Enqueue(context) + , mBindings(bindings) + { + ASSERT(mBindings.setTensorAddresses(mContext)); + } + + bool operator()(TrtCudaStream& stream) const + { + if (mContext.enqueueV3(stream.get())) + { + // Collecting layer timing info from current profile index of execution context + if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) + { + gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl; + } + return true; + } + return false; + } + +private: + Bindings const& mBindings; +}; + +//! +//! \class EnqueueGraph +//! \brief Functor to enqueue inference from CUDA Graph +//! 
+class EnqueueGraph +{ + +public: + explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph) + : mGraph(graph) + , mContext(context) + { + } + + bool operator()(TrtCudaStream& stream) const + { + if (mGraph.launch(stream)) + { + // Collecting layer timing info from current profile index of execution context + if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler()) + { + gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl; + } + return true; + } + return false; + } + + TrtCudaGraph& mGraph; + nvinfer1::IExecutionContext& mContext; +}; + +//! +//! \class EnqueueGraphSafe +//! \brief Functor to enqueue inference from CUDA Graph +//! +class EnqueueGraphSafe +{ + +public: + explicit EnqueueGraphSafe(TrtCudaGraph& graph) + : mGraph(graph) + { + } + + bool operator()(TrtCudaStream& stream) const + { + return mGraph.launch(stream); + } + + TrtCudaGraph& mGraph; +}; + +//! +//! \class EnqueueSafe +//! \brief Functor to enqueue safe execution context +//! +class EnqueueSafe +{ +public: + explicit EnqueueSafe(nvinfer1::safe::IExecutionContext& context, Bindings const& bindings) + : mContext(context) + , mBindings(bindings) + { + ASSERT(mBindings.setSafeTensorAddresses(mContext)); + } + + bool operator()(TrtCudaStream& stream) const + { + if (mContext.enqueueV3(stream.get())) + { + return true; + } + return false; + } + + nvinfer1::safe::IExecutionContext& mContext; +private: + Bindings const& mBindings; +}; + +using EnqueueFunction = std::function; + +enum class StreamType : int32_t +{ + kINPUT = 0, + kCOMPUTE = 1, + kOUTPUT = 2, + kNUM = 3 +}; + +enum class EventType : int32_t +{ + kINPUT_S = 0, + kINPUT_E = 1, + kCOMPUTE_S = 2, + kCOMPUTE_E = 3, + kOUTPUT_S = 4, + kOUTPUT_E = 5, + kNUM = 6 +}; + +using MultiStream = std::array(StreamType::kNUM)>; + +using MultiEvent = std::array, static_cast(EventType::kNUM)>; + +using EnqueueTimes = std::array; + +//! +//! 
\class Iteration +//! \brief Inference iteration and streams management +//! +template +class Iteration +{ + +public: + Iteration(int32_t id, InferenceOptions const& inference, ContextType& context, Bindings& bindings) + : mBindings(bindings) + , mStreamId(id) + , mDepth(1 + inference.overlap) + , mActive(mDepth) + , mEvents(mDepth) + , mEnqueueTimes(mDepth) + , mContext(&context) + { + for (int32_t d = 0; d < mDepth; ++d) + { + for (int32_t e = 0; e < static_cast(EventType::kNUM); ++e) + { + mEvents[d][e].reset(new TrtCudaEvent(!inference.spin)); + } + } + createEnqueueFunction(inference, context, bindings); + } + + bool query(bool skipTransfers) + { + if (mActive[mNext]) + { + return true; + } + + if (!skipTransfers) + { + record(EventType::kINPUT_S, StreamType::kINPUT); + setInputData(false); + record(EventType::kINPUT_E, StreamType::kINPUT); + wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute + } + + record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE); + recordEnqueueTime(); + if (!mEnqueue(getStream(StreamType::kCOMPUTE))) + { + return false; + } + recordEnqueueTime(); + record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE); + + if (!skipTransfers) + { + wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA + record(EventType::kOUTPUT_S, StreamType::kOUTPUT); + fetchOutputData(false); + record(EventType::kOUTPUT_E, StreamType::kOUTPUT); + } + + mActive[mNext] = true; + moveNext(); + return true; + } + + float sync( + TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) + { + if (mActive[mNext]) + { + if (skipTransfers) + { + getEvent(EventType::kCOMPUTE_E).synchronize(); + } + else + { + getEvent(EventType::kOUTPUT_E).synchronize(); + } + trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers)); + mActive[mNext] = false; + return getEvent(EventType::kCOMPUTE_S) - gpuStart; + } + return 0; + } + + void syncAll( + TimePoint const& cpuStart, 
TrtCudaEvent const& gpuStart, std::vector& trace, bool skipTransfers) + { + for (int32_t d = 0; d < mDepth; ++d) + { + sync(cpuStart, gpuStart, trace, skipTransfers); + moveNext(); + } + } + + void wait(TrtCudaEvent& gpuStart) + { + getStream(StreamType::kINPUT).wait(gpuStart); + } + + void setInputData(bool sync) + { + mBindings.transferInputToDevice(getStream(StreamType::kINPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kINPUT).synchronize(); + } + } + + void fetchOutputData(bool sync) + { + mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT)); + // additional sync to avoid overlapping with inference execution. + if (sync) + { + getStream(StreamType::kOUTPUT).synchronize(); + } + } + +private: + void moveNext() + { + mNext = mDepth - 1 - mNext; + } + + TrtCudaStream& getStream(StreamType t) + { + return mStream[static_cast(t)]; + } + + TrtCudaEvent& getEvent(EventType t) + { + return *mEvents[mNext][static_cast(t)]; + } + + void record(EventType e, StreamType s) + { + getEvent(e).record(getStream(s)); + } + + void recordEnqueueTime() + { + mEnqueueTimes[mNext][enqueueStart] = getCurrentTime(); + enqueueStart = 1 - enqueueStart; + } + + TimePoint getEnqueueTime(bool start) + { + return mEnqueueTimes[mNext][start ? 0 : 1]; + } + + void wait(EventType e, StreamType s) + { + getStream(s).wait(getEvent(e)); + } + + InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers) + { + float is + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart; + float ie + = skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart; + float os + = skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart; + float oe + = skipTransfers ? 
getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart; + + return InferenceTrace(mStreamId, + std::chrono::duration(getEnqueueTime(true) - cpuStart).count(), + std::chrono::duration(getEnqueueTime(false) - cpuStart).count(), is, ie, + getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe); + } + + void createEnqueueFunction( + InferenceOptions const& inference, nvinfer1::IExecutionContext& context, Bindings& bindings) + { + if (context.getEngine().hasImplicitBatchDimension()) + { + mEnqueue = EnqueueFunction(EnqueueImplicit(context, mBindings.getDeviceBuffers(), inference.batch)); + } + else + { + mEnqueue = EnqueueFunction(EnqueueExplicit(context, mBindings)); + } + if (inference.graph) + { + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + // Avoid capturing initialization calls by executing the enqueue function at least + // once before starting CUDA graph capture. + auto const ret = mEnqueue(stream); + assert(ret); + stream.synchronize(); + + mGraph.beginCapture(stream); + // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode. + // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails. + if (mEnqueue(stream)) + { + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph)); + } + else + { + mGraph.endCaptureOnError(stream); + // Ensure any CUDA error has been cleaned up. + cudaCheck(cudaGetLastError()); + sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under " + "CUDA graph capture mode." + << std::endl; + sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be " + "launched without using CUDA graph launch." 
+ << std::endl; + } + } + } + + void createEnqueueFunction(InferenceOptions const& inference, nvinfer1::safe::IExecutionContext& context, Bindings&) + { + mEnqueue = EnqueueFunction(EnqueueSafe(context, mBindings)); + if (inference.graph) + { + TrtCudaStream& stream = getStream(StreamType::kCOMPUTE); + ASSERT(mEnqueue(stream)); + stream.synchronize(); + mGraph.beginCapture(stream); + ASSERT(mEnqueue(stream)); + mGraph.endCapture(stream); + mEnqueue = EnqueueFunction(EnqueueGraphSafe(mGraph)); + } + + } + + Bindings& mBindings; + + TrtCudaGraph mGraph; + EnqueueFunction mEnqueue; + + int32_t mStreamId{0}; + int32_t mNext{0}; + int32_t mDepth{2}; // default to double buffer to hide DMA transfers + + std::vector mActive; + MultiStream mStream; + std::vector mEvents; + + int32_t enqueueStart{0}; + std::vector mEnqueueTimes; + ContextType* mContext{nullptr}; +}; + +template +bool inferenceLoop(std::vector>>& iStreams, TimePoint const& cpuStart, + TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs, + std::vector& trace, bool skipTransfers, float idleMs) +{ + float durationMs = 0; + int32_t skip = 0; + + for (int32_t i = 0; i < iterations + skip || durationMs < maxDurationMs; ++i) + { + for (auto& s : iStreams) + { + if (!s->query(skipTransfers)) + { + return false; + } + } + for (auto& s : iStreams) + { + durationMs = std::max(durationMs, s->sync(cpuStart, gpuStart, trace, skipTransfers)); + } + if (durationMs < warmupMs) // Warming up + { + if (durationMs) // Skip complete iterations + { + ++skip; + } + continue; + } + if (idleMs != 0.F) + { + std::this_thread::sleep_for(std::chrono::duration(idleMs)); + } + } + for (auto& s : iStreams) + { + s->syncAll(cpuStart, gpuStart, trace, skipTransfers); + } + return true; +} + +template +void inferenceExecution(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, std::vector& trace) +{ + float 
warmupMs = inference.warmup; + float durationMs = inference.duration * 1000.F + warmupMs; + + cudaCheck(cudaSetDevice(device)); + + std::vector>> iStreams; + + for (int32_t s = 0; s < streamsPerThread; ++s) + { + int32_t const streamId{threadIdx * streamsPerThread + s}; + auto* iteration = new Iteration( + streamId, inference, *iEnv.template getContext(streamId), *iEnv.bindings[streamId]); + if (inference.skipTransfers) + { + iteration->setInputData(true); + } + iStreams.emplace_back(iteration); + } + + for (auto& s : iStreams) + { + s->wait(sync.gpuStart); + } + + std::vector localTrace; + if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs, localTrace, + inference.skipTransfers, inference.idle)) + { + iEnv.error = true; + } + + if (inference.skipTransfers) + { + for (auto& s : iStreams) + { + s->fetchOutputData(true); + } + } + + sync.mutex.lock(); + trace.insert(trace.end(), localTrace.begin(), localTrace.end()); + sync.mutex.unlock(); +} + +inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironment& iEnv, SyncStruct& sync, + int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector& trace) +{ + + if (iEnv.safe) + { + ASSERT(sample::hasSafeRuntime()); + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), + std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); + } + + return std::thread(inferenceExecution, std::cref(inference), std::ref(iEnv), + std::ref(sync), threadIdx, streamsPerThread, device, std::ref(trace)); +} + +} // namespace + +bool runInference( + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace) +{ + cudaCheck(cudaProfilerStart()); + + trace.resize(0); + + SyncStruct sync; + sync.sleep = inference.sleep; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + // When multiple streams are used, trtexec can run 
inference in two modes: + // (1) if inference.threads is true, then run each stream on each thread. + // (2) if inference.threads is false, then run all streams on the same thread. + int32_t const numThreads = inference.threads ? inference.infStreams : 1; + int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams; + + std::vector threads; + for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx) + { + threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + std::sort(trace.begin(), trace.end(), cmpTrace); + + return !iEnv.error; +} + +bool runMultiTasksInference(std::vector>& tEnvList) +{ + cudaCheck(cudaProfilerStart()); + cudaSetDeviceFlags(cudaDeviceScheduleSpin); + + SyncStruct sync; + sync.sleep = 0; + sync.mainStream.sleep(&sync.sleep); + sync.cpuStart = getCurrentTime(); + sync.gpuStart.record(sync.mainStream); + + std::vector threads; + for (size_t i = 0; i < tEnvList.size(); ++i) + { + auto& tEnv = tEnvList[i]; + threads.emplace_back(makeThread( + tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace)); + } + for (auto& th : threads) + { + th.join(); + } + + cudaCheck(cudaProfilerStop()); + + auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; }; + for (auto& tEnv : tEnvList) + { + std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace); + } + + return std::none_of(tEnvList.begin(), tEnvList.end(), + [](std::unique_ptr& tEnv) { return tEnv->iEnv->error; }); +} + +namespace +{ +size_t reportGpuMemory() +{ + static size_t prevFree{0}; + size_t free{0}; + size_t total{0}; + size_t newlyAllocated{0}; + cudaCheck(cudaMemGetInfo(&free, &total)); + sample::gLogInfo << "Free GPU memory = " << 
free / 1024.0_MiB << " GiB"; + if (prevFree != 0) + { + newlyAllocated = (prevFree - free); + sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB"; + } + sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl; + prevFree = free; + return newlyAllocated; +} +} // namespace + +//! Returns true if deserialization is slower than expected or fails. +bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys) +{ + constexpr int32_t kNB_ITERS{20}; + std::unique_ptr rt{createRuntime()}; + std::unique_ptr engine; + + std::unique_ptr safeRT{sample::createSafeInferRuntime(sample::gLogger.getTRTLogger())}; + std::unique_ptr safeEngine; + + if (iEnv.safe) + { + ASSERT(sample::hasSafeRuntime() && safeRT != nullptr); + safeRT->setErrorRecorder(&gRecorder); + } + + auto timeDeserializeFn = [&]() -> float { + bool deserializeOK{false}; + engine.reset(nullptr); + safeEngine.reset(nullptr); + auto startClock = std::chrono::high_resolution_clock::now(); + if (iEnv.safe) + { + safeEngine.reset(safeRT->deserializeCudaEngine(iEnv.engine.getBlob().data(), iEnv.engine.getBlob().size())); + deserializeOK = (safeEngine != nullptr); + } + else + { + for (auto const& pluginPath : sys.dynamicPlugins) + { + rt->getPluginRegistry().loadLibrary(pluginPath.c_str()); + } + engine.reset( + rt->deserializeCudaEngine(iEnv.engine.getBlob().data(), iEnv.engine.getBlob().size(), nullptr)); + deserializeOK = (engine != nullptr); + } + auto endClock = std::chrono::high_resolution_clock::now(); + // return NAN if deserialization failed. + return deserializeOK ? std::chrono::duration(endClock - startClock).count() : NAN; + }; + + // Warmup the caches to make sure that cache thrashing isn't throwing off the results + { + sample::gLogInfo << "Begin deserialization warmup..." 
<< std::endl; + for (int32_t i = 0, e = 2; i < e; ++i) + { + timeDeserializeFn(); + } + } + sample::gLogInfo << "Begin deserialization engine timing..." << std::endl; + float const first = timeDeserializeFn(); + + // Check if first deserialization succeeded. + if (std::isnan(first)) + { + sample::gLogError << "Engine deserialization failed." << std::endl; + return true; + } + + sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl; + + // Record initial gpu memory state. + reportGpuMemory(); + + float totalTime{0.F}; + for (int32_t i = 0; i < kNB_ITERS; ++i) + { + totalTime += timeDeserializeFn(); + } + auto const averageTime = totalTime / kNB_ITERS; + // reportGpuMemory sometimes reports zero after a single deserialization of a small engine, + // so use the size of memory for all the iterations. + auto const totalEngineSizeGpu = reportGpuMemory(); + sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS + << " iterations, average time = " << averageTime << " milliseconds, first time = " << first + << " milliseconds." << std::endl; + sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl; + + // If the first deserialization is more than tolerance slower than + // the average deserialization, return true, which means an error occurred. + // The tolerance is set to 2x since the deserialization time is quick and susceptible + // to caching issues causing problems in the first timing. + auto const tolerance = 2.0F; + bool const isSlowerThanExpected = first > averageTime * tolerance; + if (isSlowerThanExpected) + { + sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime) + << ". Exceeds tolerance of " << tolerance << "x." 
<< std::endl; + } + return isSlowerThanExpected; +} + +std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format) +{ + auto runtime = std::unique_ptr{createRuntime()}; + auto inspector = std::unique_ptr(engine->createEngineInspector()); + if (context != nullptr) + { + inspector->setExecutionContext(context); + } + std::string result = inspector->getEngineInformation(format); + return result; +} + +void Binding::fill(std::string const& fileName) +{ + loadFromFile(fileName, static_cast(buffer->getHostBuffer()), buffer->getSize()); +} + +void Binding::fill() +{ + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 1); + break; + } + case nvinfer1::DataType::kINT32: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, -128, 127); + break; + } + case nvinfer1::DataType::kFLOAT: + { + fillBuffer(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kHALF: + { + fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F); + break; + } + case nvinfer1::DataType::kUINT8: + { + fillBuffer(buffer->getHostBuffer(), volume, 0, 255); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(!"FP8 is not supported"); + } +} + +void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator /*= " "*/) const +{ + void* outputBuffer{}; + if (outputAllocator != nullptr) + { + outputBuffer = outputAllocator->getBuffer()->getHostBuffer(); + } + else + { + outputBuffer = buffer->getHostBuffer(); + } + switch (dataType) + { + case nvinfer1::DataType::kBOOL: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kINT32: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, 
spv); + break; + } + case nvinfer1::DataType::kINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFLOAT: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kHALF: + { + dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kUINT8: + { + dumpBuffer(outputBuffer, separator, os, dims, strides, vectorDim, spv); + break; + } + case nvinfer1::DataType::kFP8: ASSERT(!"FP8 is not supported"); + } +} + +void Bindings::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/) +{ + auto const b = tensorInfo.bindingIndex; + while (mBindings.size() <= static_cast(b)) + { + mBindings.emplace_back(); + mDevicePointers.emplace_back(); + } + mNames[tensorInfo.name] = b; + mBindings[b].isInput = tensorInfo.isInput; + mBindings[b].volume = tensorInfo.vol; + mBindings[b].dataType = tensorInfo.dataType; + if (tensorInfo.isDynamic) + { + ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS. + if (mBindings[b].outputAllocator == nullptr) + { + if (mUseManaged) + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new UnifiedMirroredBuffer)); + } + else + { + mBindings[b].outputAllocator.reset(new OutputAllocator(new DiscreteMirroredBuffer)); + } + } + } + else + { + if (mBindings[b].buffer == nullptr) + { + if (mUseManaged) + { + mBindings[b].buffer.reset(new UnifiedMirroredBuffer); + } + else + { + mBindings[b].buffer.reset(new DiscreteMirroredBuffer); + } + } + // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr + // even for empty tensors, so allocate a dummy byte. 
+ if (tensorInfo.vol == 0) + { + mBindings[b].buffer->allocate(1); + } + else + { + mBindings[b].buffer->allocate( + static_cast(tensorInfo.vol) * static_cast(dataTypeSize(tensorInfo.dataType))); + } + mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer(); + } + if (tensorInfo.isInput) + { + if (fileName.empty()) + { + fill(b); + } + else + { + fill(b, fileName); + } + } +} + +void** Bindings::getDeviceBuffers() +{ + return mDevicePointers.data(); +} + +void Bindings::transferInputToDevice(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (mBindings[b.second].isInput) + { + mBindings[b.second].buffer->hostToDevice(stream); + } + } +} + +void Bindings::transferOutputToHost(TrtCudaStream& stream) +{ + for (auto& b : mNames) + { + if (!mBindings[b.second].isInput) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + mBindings[b.second].outputAllocator->getBuffer()->deviceToHost(stream); + } + else + { + mBindings[b.second].buffer->deviceToHost(stream); + } + } + } +} + +template <> +void Bindings::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const +{ + Dims dims = context.getBindingDimensions(binding); + Dims strides = context.getStrides(binding); + int32_t vectorDim = context.getEngine().getBindingVectorizedDim(binding); + int32_t const spv = context.getEngine().getBindingComponentsPerElement(binding); + + if (context.getEngine().hasImplicitBatchDimension()) + { + auto const insertN = [](Dims& d, int32_t bs) { + int32_t const nbDims = d.nbDims; + ASSERT(nbDims < Dims::MAX_DIMS); + std::copy_backward(&d.d[0], &d.d[nbDims], &d.d[nbDims + 1]); + d.d[0] = bs; + d.nbDims = nbDims + 1; + }; + int32_t batchStride = 0; + for (int32_t i = 0; i < strides.nbDims; ++i) + { + if (strides.d[i] * dims.d[i] > batchStride) + { + batchStride = strides.d[i] * dims.d[i]; + } + } + insertN(dims, batch); + insertN(strides, batchStride); + 
vectorDim = (vectorDim == -1) ? -1 : vectorDim + 1; + } + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); +} + +namespace { + +std::string genFilenameSafeString(std::string const& s) +{ + std::string res = s; + static std::string const allowedSpecialChars{"._-,"}; + for (auto& c : res) + { + if (!isalnum(c) && allowedSpecialChars.find(c) == std::string::npos) + { + c = '_'; + } + } + return res; +} + +template +Dims getBindingDimensions(ContextType const& /*context*/, int32_t /*binding*/) +{ + ASSERT(0 && "Unimplemented"); +} + +template <> +Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, int32_t binding) +{ + return context.getBindingDimensions(binding); +} + +template <> +Dims getBindingDimensions(nvinfer1::safe::IExecutionContext const& context, int32_t binding) +{ + return context.getEngine().getBindingDimensions(binding); +} + +inline std::ostream& operator<<(std::ostream& o, nvinfer1::DataType dt) +{ + switch (dt) + { + case DataType::kINT32: o << "Int32"; break; + case DataType::kFLOAT: o << "Float"; break; + case DataType::kHALF: o << "Half"; break; + case DataType::kINT8: o << "Int8"; break; + case DataType::kUINT8: o << "UInt8"; break; + case DataType::kBOOL: o << "Bool"; break; + case DataType::kFP8: o << "Float8"; break; + } + return o; +} + +} // namespace + +template +void Bindings::dumpRawBindingToFiles(ContextType const& context, std::ostream& os) const +{ + os << "Dumping I/O Bindings to RAW Files:" << std::endl; + for (auto const& n : mNames) + { + auto name = n.first; + auto bIndex = n.second; + auto const& binding = mBindings[bIndex]; + void* outputBuffer{}; + if (binding.outputAllocator != nullptr) + { + outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer(); + } + else + { + outputBuffer = binding.buffer->getHostBuffer(); + } + + Dims dims = getBindingDimensions(context, bIndex); + std::string dimsStr; + std::string dotStr; + + for (int32_t i = 0; i < dims.nbDims; i++) + { + dimsStr 
+= dotStr + std::to_string(dims.d[i]); + dotStr = "."; + } + + std::string const bindingTypeStr = (binding.isInput ? "input" : "output"); + + std::stringstream fileName; + fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType << ".raw"; + + os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType + << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl; + + std::ofstream f(fileName.str(), std::ios::out | std::ios::binary); + ASSERT(f && "Cannot open file for write"); + f.write(static_cast(outputBuffer), binding.volume * samplesCommon::elementSize(binding.dataType)); + f.close(); + } +} + +template +void Bindings::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const; + +template <> +void Bindings::dumpBindingDimensions(int binding, nvinfer1::IExecutionContext const& context, std::ostream& os) const +{ + auto const dims = context.getBindingDimensions(binding); + // Do not add a newline terminator, because the caller may be outputting a JSON string. + os << dims; +} + +template <> +void Bindings::dumpBindingDimensions(int binding, nvinfer1::safe::IExecutionContext const& context, std::ostream& os) const +{ + auto const dims = context.getEngine().getBindingDimensions(binding); + // Do not add a newline terminator, because the caller may be outputting a JSON string. 
+ os << dims; +} + +template <> +void Bindings::dumpBindingValues(nvinfer1::safe::IExecutionContext const& context, int32_t binding, std::ostream& os, + std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const +{ + Dims const dims = context.getEngine().getBindingDimensions(binding); + Dims const strides = context.getStrides(binding); + int32_t const vectorDim = context.getEngine().getBindingVectorizedDim(binding); + int32_t const spv = context.getEngine().getBindingComponentsPerElement(binding); + + mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator); +} + +template +void Bindings::dumpRawBindingToFiles(nvinfer1::safe::IExecutionContext const& context, std::ostream& os) const; + +std::unordered_map Bindings::getBindings(std::function predicate) const +{ + std::unordered_map bindings; + for (auto const& n : mNames) + { + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + bindings.insert(n); + } + } + return bindings; +} + +bool Bindings::setTensorAddresses(nvinfer1::IExecutionContext& context) const +{ + for (auto const& b : mNames) + { + auto const name = b.first.c_str(); + auto const location = context.getEngine().getTensorLocation(name); + if (location == TensorLocation::kDEVICE) + { + if (mBindings[b.second].outputAllocator != nullptr) + { + if (!context.setOutputAllocator(name, mBindings[b.second].outputAllocator.get())) + { + return false; + } + } + else + { + if (!context.setTensorAddress(name, mDevicePointers[b.second])) + { + return false; + } + } + } + } + return true; +} + +bool Bindings::setSafeTensorAddresses(nvinfer1::safe::IExecutionContext& context) const +{ + for (auto const& b : mNames) + { + auto const name = b.first.c_str(); + if (context.getEngine().getTensorIOMode(name) == nvinfer1::TensorIOMode::kINPUT) + { + if (!context.setInputTensorAddress(name, static_cast(mDevicePointers[b.second]))) + { + return false; + } + } + else + { + if (!context.setOutputTensorAddress(name, 
mDevicePointers[b.second])) + { + return false; + } + } + } + return true; +} + +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.h new file mode 100644 index 0000000..909a71b --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleInference.h @@ -0,0 +1,264 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_INFERENCE_H +#define TRT_SAMPLE_INFERENCE_H + +#include "sampleEngines.h" +#include "sampleReporting.h" +#include "sampleUtils.h" + +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferSafeRuntime.h" + +namespace sample +{ + +struct InferenceEnvironment +{ + InferenceEnvironment() = delete; + InferenceEnvironment(InferenceEnvironment const& other) = delete; + InferenceEnvironment(InferenceEnvironment&& other) = delete; + InferenceEnvironment(BuildEnvironment& bEnv) : engine(std::move(bEnv.engine)), safe(bEnv.engine.isSafe()) + { + } + + LazilyDeserializedEngine engine; + std::unique_ptr profiler; + std::vector> contexts; + std::vector> bindings; + bool error{false}; + + bool safe{false}; + std::vector> safeContexts; + + template + inline ContextType* getContext(int32_t streamIdx); + + //! 
Storage for input shape tensors. + //! + //! It's important that the addresses of the data do not change between the calls to + //! setTensorAddress/setInputShape (which tells TensorRT where the input shape tensor is) + //! and enqueueV2/enqueueV3 (when TensorRT might use the input shape tensor). + //! + //! The input shape tensors could alternatively be handled via member bindings, + //! but it simplifies control-flow to store the data here since it's shared across + //! the bindings. + std::list> inputShapeTensorValues; +}; + +template <> +inline nvinfer1::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) +{ + return contexts[streamIdx].get(); +} + +template <> +inline nvinfer1::safe::IExecutionContext* InferenceEnvironment::getContext(int32_t streamIdx) +{ + return safeContexts[streamIdx].get(); +} + +//! +//! \brief Set up contexts and bindings for inference +//! +bool setUpInference(InferenceEnvironment& iEnv, InferenceOptions const& inference, SystemOptions const& system); + +//! +//! \brief Deserialize the engine and time how long it takes. +//! +bool timeDeserialize(InferenceEnvironment& iEnv, SystemOptions const& sys); + +//! +//! \brief Run inference and collect timing, return false if any error hit during inference +//! +bool runInference( + InferenceOptions const& inference, InferenceEnvironment& iEnv, int32_t device, std::vector& trace); + +//! +//! \brief Get layer information of the engine. +//! 
+std::string getLayerInformation( + nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format); + +struct Binding +{ + bool isInput{false}; + std::unique_ptr buffer; + std::unique_ptr outputAllocator; + int64_t volume{0}; + nvinfer1::DataType dataType{nvinfer1::DataType::kFLOAT}; + + void fill(std::string const& fileName); + + void fill(); + + void dump(std::ostream& os, nvinfer1::Dims dims, nvinfer1::Dims strides, int32_t vectorDim, int32_t spv, + std::string const separator = " ") const; +}; + +struct TensorInfo +{ + int32_t bindingIndex{-1}; + char const* name{nullptr}; + nvinfer1::Dims dims{}; + bool isDynamic{}; + int32_t comps{-1}; + nvinfer1::Dims strides{}; + int32_t vectorDimIndex{-1}; + bool isInput{}; + nvinfer1::DataType dataType{}; + int64_t vol{-1}; + + void updateVolume(int32_t batch) + { + vol = volume(dims, strides, vectorDimIndex, comps, batch); + } +}; + +class Bindings +{ +public: + Bindings() = delete; + explicit Bindings(bool useManaged) + : mUseManaged(useManaged) + { + } + + void addBinding(TensorInfo const& tensorInfo, std::string const& fileName = ""); + + void** getDeviceBuffers(); + + void transferInputToDevice(TrtCudaStream& stream); + + void transferOutputToHost(TrtCudaStream& stream); + + void fill(int binding, std::string const& fileName) + { + mBindings[binding].fill(fileName); + } + + void fill(int binding) + { + mBindings[binding].fill(); + } + + template + void dumpBindingDimensions(int32_t binding, ContextType const& context, std::ostream& os) const; + + template + void dumpBindingValues(ContextType const& context, int32_t binding, std::ostream& os, + std::string const& separator = " ", int32_t batch = 1) const; + + template + void dumpRawBindingToFiles(ContextType const& context, std::ostream& os) const; + + template + void dumpInputs(ContextType const& context, std::ostream& os) const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + dumpBindings(context, 
isInput, os); + } + + template + void dumpOutputs(ContextType const& context, std::ostream& os) const + { + auto isOutput = [](Binding const& b) { return !b.isInput; }; + dumpBindings(context, isOutput, os); + } + + template + void dumpBindings(ContextType const& context, std::ostream& os) const + { + auto all = [](Binding const& b) { return true; }; + dumpBindings(context, all, os); + } + + template + void dumpBindings( + ContextType const& context, std::function predicate, std::ostream& os) const + { + for (auto const& n : mNames) + { + auto const binding = n.second; + if (predicate(mBindings[binding])) + { + os << n.first << ": ("; + dumpBindingDimensions(binding, context, os); + os << ")" << std::endl; + + dumpBindingValues(context, binding, os); + os << std::endl; + } + } + } + + + std::unordered_map getInputBindings() const + { + auto isInput = [](Binding const& b) { return b.isInput; }; + return getBindings(isInput); + } + + std::unordered_map getOutputBindings() const + { + auto isOutput = [](Binding const& b) { return !b.isInput; }; + return getBindings(isOutput); + } + + std::unordered_map getBindings() const + { + auto all = [](Binding const& b) { return true; }; + return getBindings(all); + } + + std::unordered_map getBindings(std::function predicate) const; + + bool setTensorAddresses(nvinfer1::IExecutionContext& context) const; + + bool setSafeTensorAddresses(nvinfer1::safe::IExecutionContext& context) const; + +private: + std::unordered_map mNames; + std::vector mBindings; + std::vector mDevicePointers; + bool mUseManaged{false}; +}; + +struct TaskInferenceEnvironment +{ + TaskInferenceEnvironment(std::string engineFile, InferenceOptions inference, int32_t deviceId = 0, + int32_t DLACore = -1, int32_t bs = batchNotProvided); + InferenceOptions iOptions{}; + int32_t device{defaultDevice}; + int32_t batch{batchNotProvided}; + std::unique_ptr iEnv; + std::vector trace; +}; + +bool runMultiTasksInference(std::vector>& tEnvList); + +} // namespace sample 
+ +#endif // TRT_SAMPLE_INFERENCE_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.cpp new file mode 100644 index 0000000..55c5c9e --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.cpp @@ -0,0 +1,2358 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +#include "logger.h" +#include "sampleOptions.h" +#include "sampleUtils.h" +using namespace nvinfer1; +namespace sample +{ + +namespace +{ + +template +T stringToValue(const std::string& option) +{ + return T{option}; +} + +template <> +int32_t stringToValue(const std::string& option) +{ + return std::stoi(option); +} + +template <> +float stringToValue(const std::string& option) +{ + return std::stof(option); +} + +template <> +double stringToValue(const std::string& option) +{ + return std::stod(option); +} + +template <> +bool stringToValue(const std::string& option) +{ + return true; +} + +template <> +std::vector stringToValue>(const std::string& option) +{ + std::vector shape; + std::vector dimsStrings = splitToStringVec(option, 'x'); + for (const auto& d : dimsStrings) + { + shape.push_back(stringToValue(d)); + } + return shape; +} + +template <> +nvinfer1::DataType stringToValue(const std::string& option) +{ + const std::unordered_map strToDT{{"fp32", nvinfer1::DataType::kFLOAT}, + {"fp16", nvinfer1::DataType::kHALF}, {"int8", nvinfer1::DataType::kINT8}, {"fp8", nvinfer1::DataType::kFP8}, + {"int32", nvinfer1::DataType::kINT32}}; + const auto& dt = strToDT.find(option); + if (dt == strToDT.end()) + { + throw std::invalid_argument("Invalid DataType " + option); + } + return dt->second; +} + +template <> +nvinfer1::DeviceType stringToValue(std::string const& option) +{ + std::unordered_map const strToDevice = { + {"GPU", nvinfer1::DeviceType::kGPU}, + {"DLA", nvinfer1::DeviceType::kDLA}, + }; + auto const& device = strToDevice.find(option); + if (device == strToDevice.end()) + { + throw std::invalid_argument("Invalid Device Type " + option); + } + return device->second; +} + +template <> +nvinfer1::TensorFormats stringToValue(const std::string& option) +{ + std::vector optionStrings = splitToStringVec(option, '+'); + const std::unordered_map 
strToFmt{{"chw", nvinfer1::TensorFormat::kLINEAR}, + {"chw2", nvinfer1::TensorFormat::kCHW2}, {"chw4", nvinfer1::TensorFormat::kCHW4}, + {"hwc8", nvinfer1::TensorFormat::kHWC8}, {"chw16", nvinfer1::TensorFormat::kCHW16}, + {"chw32", nvinfer1::TensorFormat::kCHW32}, {"dhwc8", nvinfer1::TensorFormat::kDHWC8}, + {"cdhw32", nvinfer1::TensorFormat::kCDHW32}, {"hwc", nvinfer1::TensorFormat::kHWC}, + {"dhwc", nvinfer1::TensorFormat::kDHWC}, {"dla_linear", nvinfer1::TensorFormat::kDLA_LINEAR}, + {"dla_hwc4", nvinfer1::TensorFormat::kDLA_HWC4}}; + nvinfer1::TensorFormats formats{}; + for (auto f : optionStrings) + { + const auto& tf = strToFmt.find(f); + if (tf == strToFmt.end()) + { + throw std::invalid_argument(std::string("Invalid TensorFormat ") + f); + } + formats |= 1U << static_cast(tf->second); + } + + return formats; +} + +template <> +IOFormat stringToValue(const std::string& option) +{ + IOFormat ioFormat{}; + const size_t colon = option.find(':'); + + if (colon == std::string::npos) + { + throw std::invalid_argument(std::string("Invalid IOFormat ") + option); + } + + ioFormat.first = stringToValue(option.substr(0, colon)); + ioFormat.second = stringToValue(option.substr(colon + 1)); + + return ioFormat; +} + +template <> +SparsityFlag stringToValue(std::string const& option) +{ + std::unordered_map const table{ + {"disable", SparsityFlag::kDISABLE}, {"enable", SparsityFlag::kENABLE}, {"force", SparsityFlag::kFORCE}}; + auto search = table.find(option); + if (search == table.end()) + { + throw std::invalid_argument(std::string("Unknown sparsity mode: ") + option); + } + return search->second; +} + +template +std::pair splitNameAndValue(const std::string& s) +{ + std::string tensorName; + std::string valueString; + + // Support 'inputName':Path format for --loadInputs flag when dealing with Windows paths. + // i.e. 
'inputName':c:\inputData + std::vector quoteNameRange{ splitToStringVec(s, '\'') }; + // splitToStringVec returns the entire string when delimiter is not found, so it's size is always at least 1 + if (quoteNameRange.size() != 1) + { + if (quoteNameRange.size() != 3) + { + throw std::invalid_argument(std::string("Found invalid number of \'s when parsing ") + s + + std::string(". Expected: 2, received: ") + std::to_string(quoteNameRange.size() -1)); + } + // Everything before the second "'" is the name. + tensorName = quoteNameRange[0] + quoteNameRange[1]; + // Path is the last string - ignoring leading ":" so slice it with [1:] + valueString = quoteNameRange[2].substr(1); + return std::pair(tensorName, stringToValue(valueString)); + } + + // Split on the last : + std::vector nameRange{splitToStringVec(s, ':')}; + // Everything before the last : is the name + tensorName = nameRange[0]; + for (size_t i = 1; i < nameRange.size() - 1; i++) + { + tensorName += ":" + nameRange[i]; + } + // Value is the string element after the last : + valueString = nameRange[nameRange.size() - 1]; + return std::pair(tensorName, stringToValue(valueString)); +} + +template +void splitInsertKeyValue(const std::vector& kvList, T& map) +{ + for (const auto& kv : kvList) + { + map.insert(splitNameAndValue(kv)); + } +} + +const char* boolToEnabled(bool enable) +{ + return enable ? "Enabled" : "Disabled"; +} + +//! A helper function similar to sep.join(list) in Python. +template +std::string joinValuesToString(std::vector const& list, std::string const& sep) +{ + std::ostringstream os; + for (int32_t i = 0, n = list.size(); i < n; ++i) + { + os << list[i]; + if (i != n - 1) + { + os << sep; + } + } + return os.str(); +} + +template +std::string joinValuesToString(std::array const& list, std::string const& sep) +{ + return joinValuesToString(std::vector(list.begin(), list.end()), sep); +} + +//! Check if input option exists in input arguments. +//! 
If it does: return its value, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelOption(Arguments& arguments, const std::string& option, T& value) +{ + const auto match = arguments.find(option); + if (match != arguments.end()) + { + value = stringToValue(match->second); + arguments.erase(match); + return true; + } + + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: return false in value, erase the argument and return true. +//! If it does not: return false. +bool getAndDelNegOption(Arguments& arguments, const std::string& option, bool& value) +{ + bool dummy; + if (getAndDelOption(arguments, option, dummy)) + { + value = false; + return true; + } + return false; +} + +//! Check if input option exists in input arguments. +//! If it does: add all the matched arg values to values vector, erase the argument and return true. +//! If it does not: return false. +template +bool getAndDelRepeatedOption(Arguments& arguments, const std::string& option, std::vector& values) +{ + const auto match = arguments.equal_range(option); + if (match.first == match.second) + { + return false; + } + + auto addToValues + = [&values](Arguments::value_type& argValue) { values.emplace_back(stringToValue(argValue.second)); }; + std::for_each(match.first, match.second, addToValues); + arguments.erase(match.first, match.second); + + return true; +} + +void insertShapesBuild(BuildOptions::ShapeProfile& shapes, nvinfer1::OptProfileSelector selector, + const std::string& name, const std::vector& dims) +{ + shapes[name][static_cast(selector)] = dims; +} + +void insertShapesInference( + InferenceOptions::ShapeProfile& shapes, std::string const& name, std::vector const& dims) +{ + shapes[name] = dims; +} + +std::string removeSingleQuotationMarks(std::string& str) +{ + std::vector strList{splitToStringVec(str, '\'')}; + // Remove all the escaped single quotation marks + std::string retVal; + // Do not really care 
about unterminated sequences + for (size_t i = 0; i < strList.size(); i++) + { + retVal += strList[i]; + } + return retVal; +} + +void getLayerPrecisions(Arguments& arguments, char const* argument, LayerPrecisions& layerPrecisions) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerPrecisions flag contains comma-separated layerName:precision pairs. + std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + layerPrecisions[layerName] = namePrecisionPair.second; + } +} + +void getLayerOutputTypes(Arguments& arguments, char const* argument, LayerOutputTypes& layerOutputTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerOutputTypes flag contains comma-separated layerName:types pairs. + std::vector precisionList{splitToStringVec(list, ',')}; + for (auto const& s : precisionList) + { + auto namePrecisionPair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(namePrecisionPair.first); + auto const typeStrings = splitToStringVec(namePrecisionPair.second, '+'); + std::vector typeVec(typeStrings.size(), nvinfer1::DataType::kFLOAT); + std::transform(typeStrings.begin(), typeStrings.end(), typeVec.begin(), stringToValue); + layerOutputTypes[layerName] = typeVec; + } +} + +void getLayerDeviceTypes(Arguments& arguments, char const* argument, LayerDeviceTypes& layerDeviceTypes) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + // The layerDeviceTypes flag contains comma-separated layerName:deviceType pairs. 
+ std::vector deviceList{splitToStringVec(list, ',')}; + for (auto const& s : deviceList) + { + auto nameDevicePair = splitNameAndValue(s); + auto const layerName = removeSingleQuotationMarks(nameDevicePair.first); + layerDeviceTypes[layerName] = stringToValue(nameDevicePair.second); + } +} + +bool getShapesBuild(Arguments& arguments, BuildOptions::ShapeProfile& shapes, char const* argument, + nvinfer1::OptProfileSelector selector) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesBuild(shapes, selector, tensorName, dims); + } + return retVal; +} + +bool getShapesInference(Arguments& arguments, InferenceOptions::ShapeProfile& shapes, const char* argument) +{ + std::string list; + bool retVal = getAndDelOption(arguments, argument, list); + std::vector shapeList{splitToStringVec(list, ',')}; + for (const auto& s : shapeList) + { + auto nameDimsPair = splitNameAndValue>(s); + auto tensorName = removeSingleQuotationMarks(nameDimsPair.first); + auto dims = nameDimsPair.second; + insertShapesInference(shapes, tensorName, dims); + } + return retVal; +} + +void fillShapes(BuildOptions::ShapeProfile& shapes, std::string const& name, ShapeRange const& sourceShapeRange, + nvinfer1::OptProfileSelector minDimsSource, nvinfer1::OptProfileSelector optDimsSource, + nvinfer1::OptProfileSelector maxDimsSource) +{ + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMIN, name, sourceShapeRange[static_cast(minDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kOPT, name, sourceShapeRange[static_cast(optDimsSource)]); + insertShapesBuild( + shapes, nvinfer1::OptProfileSelector::kMAX, name, sourceShapeRange[static_cast(maxDimsSource)]); +} + +void 
processShapes(BuildOptions::ShapeProfile& shapes, bool minShapes, bool optShapes, bool maxShapes, bool calib) +{ + // Only accept optShapes only or all three of minShapes, optShapes, maxShapes when calib is set + if (((minShapes || maxShapes) && !optShapes) // minShapes only, maxShapes only, both minShapes and maxShapes + || (minShapes && !maxShapes && optShapes) // both minShapes and optShapes + || (!minShapes && maxShapes && optShapes)) // both maxShapes and optShapes + { + if (calib) + { + throw std::invalid_argument( + "Must specify only --optShapesCalib or all of --minShapesCalib, --optShapesCalib, --maxShapesCalib"); + } + } + + if (!minShapes && !optShapes && !maxShapes) + { + return; + } + + BuildOptions::ShapeProfile newShapes; + for (auto& s : shapes) + { + nvinfer1::OptProfileSelector minDimsSource, optDimsSource, maxDimsSource; + minDimsSource = nvinfer1::OptProfileSelector::kMIN; + optDimsSource = nvinfer1::OptProfileSelector::kOPT; + maxDimsSource = nvinfer1::OptProfileSelector::kMAX; + + // Populate missing minShapes + if (!minShapes) + { + if (optShapes) + { + minDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + else + { + minDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to minShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing optShapes + if (!optShapes) + { + if (maxShapes) + { + optDimsSource = maxDimsSource; + sample::gLogWarning << "maxShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + else + { + optDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to optShapes for tensor " << s.first + << std::endl; + } + } + + // Populate missing maxShapes + if (!maxShapes) + { + if (optShapes) + { + maxDimsSource = optDimsSource; + sample::gLogWarning << "optShapes is being broadcasted to maxShapes for tensor " << s.first + << 
std::endl; + } + else + { + maxDimsSource = minDimsSource; + sample::gLogWarning << "minShapes is being broadcasted to maxShapes for tensor " << s.first + << std::endl; + } + } + + fillShapes(newShapes, s.first, s.second, minDimsSource, optDimsSource, maxDimsSource); + } + shapes = newShapes; +} + +template +void printShapes(std::ostream& os, const char* phase, const T& shapes) +{ + if (shapes.empty()) + { + os << "Input " << phase << " shapes: model" << std::endl; + } + else + { + for (const auto& s : shapes) + { + os << "Input " << phase << " shape: " << s.first << "=" << s.second << std::endl; + } + } +} + +std::ostream& printBatch(std::ostream& os, int32_t maxBatch) +{ + if (maxBatch != maxBatchNotProvided) + { + os << maxBatch; + } + else + { + os << "explicit batch"; + } + return os; +} + +std::ostream& printTacticSources( + std::ostream& os, nvinfer1::TacticSources enabledSources, nvinfer1::TacticSources disabledSources) +{ + if (!enabledSources && !disabledSources) + { + os << "Using default tactic sources"; + } + else + { + auto const addSource = [&](uint32_t source, std::string const& name) { + if (enabledSources & source) + { + os << name << " [ON], "; + } + else if (disabledSources & source) + { + os << name << " [OFF], "; + } + }; + + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS), "cublas"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUBLAS_LT), "cublasLt"); + addSource(1U << static_cast(nvinfer1::TacticSource::kCUDNN), "cudnn"); + addSource(1U << static_cast(nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS), "edge mask convolutions"); + addSource(1U << static_cast(nvinfer1::TacticSource::kJIT_CONVOLUTIONS), "JIT convolutions"); + } + return os; +} + +std::ostream& printPrecision(std::ostream& os, BuildOptions const& options) +{ + os << "FP32"; + if (options.fp16) + { + os << "+FP16"; + } + if (options.int8) + { + os << "+INT8"; + } + if (options.fp8) + { + os << "+FP8"; + } + if (options.precisionConstraints == 
PrecisionConstraints::kOBEY) + { + os << " (obey precision constraints)"; + } + if (options.precisionConstraints == PrecisionConstraints::kPREFER) + { + os << " (prefer precision constraints)"; + } + return os; +} + +std::ostream& printTempfileControls(std::ostream& os, TempfileControlFlags const tempfileControls) +{ + auto getFlag = [&](TempfileControlFlag f) -> char const* { + bool allowed = !!(tempfileControls & (1U << static_cast(f))); + return allowed ? "allow" : "deny"; + }; + auto const inMemory = getFlag(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + auto const temporary = getFlag(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + + os << "{ in_memory: " << inMemory << ", temporary: " << temporary << " }"; + + return os; +} + +std::ostream& printTimingCache(std::ostream& os, TimingCacheMode const& timingCacheMode) +{ + switch (timingCacheMode) + { + case TimingCacheMode::kGLOBAL: os << "global"; break; + case TimingCacheMode::kLOCAL: os << "local"; break; + case TimingCacheMode::kDISABLE: os << "disable"; break; + } + return os; +} + +std::ostream& printSparsity(std::ostream& os, BuildOptions const& options) +{ + switch (options.sparsity) + { + case SparsityFlag::kDISABLE: os << "Disabled"; break; + case SparsityFlag::kENABLE: os << "Enabled"; break; + case SparsityFlag::kFORCE: os << "Forced"; break; + } + + return os; +} + +std::ostream& printMemoryPools(std::ostream& os, BuildOptions const& options) +{ + auto const printValueOrDefault = [&os](double const val) { + if (val >= 0) + { + os << val << " MiB"; + } + else + { + os << "default"; + } + }; + os << "workspace: "; + printValueOrDefault(options.workspace); + os << ", "; + os << "dlaSRAM: "; + printValueOrDefault(options.dlaSRAM); + os << ", "; + os << "dlaLocalDRAM: "; + printValueOrDefault(options.dlaLocalDRAM); + os << ", "; + os << "dlaGlobalDRAM: "; + printValueOrDefault(options.dlaGlobalDRAM); + return os; +} + +std::string previewFeatureToString(PreviewFeature feature) +{ + // clang-format off + 
switch (feature) + { + case PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805: return "kFASTER_DYNAMIC_SHAPES_0805"; + case PreviewFeature::kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805: return "kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805"; + case PreviewFeature::kPROFILE_SHARING_0806: return "kPROFILE_SHARING_0806"; + } + return "Invalid Preview Feature"; + // clang-format on +} + +std::ostream& printPreviewFlags(std::ostream& os, BuildOptions const& options) +{ + if (options.previewFeatures.empty()) + { + os << "Use default preview flags."; + return os; + } + + auto const addFlag = [&](PreviewFeature feat) { + int32_t featVal = static_cast(feat); + if (options.previewFeatures.find(featVal) != options.previewFeatures.end()) + { + os << previewFeatureToString(feat) << (options.previewFeatures.at(featVal) ? " [ON], " : " [OFF], "); + } + }; + + addFlag(PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805); + addFlag(PreviewFeature::kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805); + addFlag(PreviewFeature::kPROFILE_SHARING_0806); + + return os; +} + +} // namespace + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]) +{ + Arguments arguments; + for (int32_t i = 1; i < argc; ++i) + { + auto valuePtr = strchr(argv[i], '='); + if (valuePtr) + { + std::string value{valuePtr + 1}; + arguments.emplace(std::string(argv[i], valuePtr - argv[i]), value); + } + else + { + arguments.emplace(argv[i], ""); + } + } + return arguments; +} + +void BaseModelOptions::parse(Arguments& arguments) +{ + if (getAndDelOption(arguments, "--onnx", model)) + { + format = ModelFormat::kONNX; + } + else if (getAndDelOption(arguments, "--uff", model)) + { + format = ModelFormat::kUFF; + } + else if (getAndDelOption(arguments, "--model", model)) + { + format = ModelFormat::kCAFFE; + } +} + +void UffInput::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--uffNHWC", NHWC); + std::vector args; + if (getAndDelRepeatedOption(arguments, "--uffInput", args)) + { + for (const auto& i : args) + { + 
std::vector values{splitToStringVec(i, ',')}; + if (values.size() == 4) + { + nvinfer1::Dims3 dims{std::stoi(values[1]), std::stoi(values[2]), std::stoi(values[3])}; + inputs.emplace_back(values[0], dims); + } + else + { + throw std::invalid_argument(std::string("Invalid uffInput ") + i); + } + } + } +} + +void ModelOptions::parse(Arguments& arguments) +{ + baseModel.parse(arguments); + + switch (baseModel.format) + { + case ModelFormat::kCAFFE: + { + getAndDelOption(arguments, "--deploy", prototxt); + break; + } + case ModelFormat::kUFF: + { + uffInputs.parse(arguments); + if (uffInputs.inputs.empty()) + { + throw std::invalid_argument("Uff models require at least one input"); + } + break; + } + case ModelFormat::kONNX: break; + case ModelFormat::kANY: + { + if (getAndDelOption(arguments, "--deploy", prototxt)) + { + baseModel.format = ModelFormat::kCAFFE; + } + break; + } + } + + // The --output flag should only be used with Caffe and UFF. It has no effect on ONNX. + std::vector outArgs; + if (getAndDelRepeatedOption(arguments, "--output", outArgs)) + { + for (const auto& o : outArgs) + { + for (auto& v : splitToStringVec(o, ',')) + { + outputs.emplace_back(std::move(v)); + } + } + } + if (baseModel.format == ModelFormat::kCAFFE || baseModel.format == ModelFormat::kUFF) + { + if (outputs.empty()) + { + throw std::invalid_argument("Caffe and Uff models require at least one output"); + } + } + else if (baseModel.format == ModelFormat::kONNX) + { + if (!outputs.empty()) + { + throw std::invalid_argument("The --output flag should not be used with ONNX models."); + } + } +} + +void getTempfileControls(Arguments& arguments, char const* argument, TempfileControlFlags& tempfileControls) +{ + std::string list; + if (!getAndDelOption(arguments, argument, list)) + { + return; + } + + std::vector controlList{splitToStringVec(list, ',')}; + for (auto const& s : controlList) + { + auto controlAllowPair = splitNameAndValue(s); + bool allowed{false}; + int32_t offset{-1}; + + if 
(controlAllowPair.second.compare("allow") == 0) + { + allowed = true; + } + else if (controlAllowPair.second.compare("deny") != 0) + { + throw std::invalid_argument("--tempfileControls value should be `deny` or `allow`"); + } + + if (controlAllowPair.first.compare("in_memory") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_IN_MEMORY_FILES); + } + else if (controlAllowPair.first.compare("temporary") == 0) + { + offset = static_cast(TempfileControlFlag::kALLOW_TEMPORARY_FILES); + } + else + { + throw std::invalid_argument(std::string{"Unknown --tempfileControls key "} + controlAllowPair.first); + } + + if (allowed) + { + tempfileControls |= (1U << offset); + } + else + { + tempfileControls &= ~(1U << offset); + } + } +} + +void BuildOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + + bool addedExplicitBatchFlag{false}; + getAndDelOption(arguments, "--explicitBatch", addedExplicitBatchFlag); + if (addedExplicitBatchFlag) + { + sample::gLogWarning << "--explicitBatch flag has been deprecated and has no effect!" << std::endl; + sample::gLogWarning << "Explicit batch dim is automatically enabled if input model is ONNX or if dynamic " + << "shapes are provided when the engine is built." 
<< std::endl; + } + + bool minShapes = getShapesBuild(arguments, shapes, "--minShapes", nvinfer1::OptProfileSelector::kMIN); + bool optShapes = getShapesBuild(arguments, shapes, "--optShapes", nvinfer1::OptProfileSelector::kOPT); + bool maxShapes = getShapesBuild(arguments, shapes, "--maxShapes", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapes, minShapes, optShapes, maxShapes, false); + bool minShapesCalib + = getShapesBuild(arguments, shapesCalib, "--minShapesCalib", nvinfer1::OptProfileSelector::kMIN); + bool optShapesCalib + = getShapesBuild(arguments, shapesCalib, "--optShapesCalib", nvinfer1::OptProfileSelector::kOPT); + bool maxShapesCalib + = getShapesBuild(arguments, shapesCalib, "--maxShapesCalib", nvinfer1::OptProfileSelector::kMAX); + processShapes(shapesCalib, minShapesCalib, optShapesCalib, maxShapesCalib, true); + + bool addedExplicitPrecisionFlag{false}; + getAndDelOption(arguments, "--explicitPrecision", addedExplicitPrecisionFlag); + if (addedExplicitPrecisionFlag) + { + sample::gLogWarning << "--explicitPrecision flag has been deprecated and has no effect!" << std::endl; + } + + if (getAndDelOption(arguments, "--workspace", workspace)) + { + sample::gLogWarning << "--workspace flag has been deprecated by --memPoolSize flag." 
<< std::endl; + } + + std::string memPoolSizes; + getAndDelOption(arguments, "--memPoolSize", memPoolSizes); + std::vector memPoolSpecs{splitToStringVec(memPoolSizes, ',')}; + for (auto const& memPoolSpec : memPoolSpecs) + { + std::string memPoolName; + double memPoolSize; + std::tie(memPoolName, memPoolSize) = splitNameAndValue(memPoolSpec); + if (memPoolSize < 0) + { + throw std::invalid_argument(std::string("Negative memory pool size: ") + std::to_string(memPoolSize)); + } + if (memPoolName == "workspace") + { + workspace = memPoolSize; + } + else if (memPoolName == "dlaSRAM") + { + dlaSRAM = memPoolSize; + } + else if (memPoolName == "dlaLocalDRAM") + { + dlaLocalDRAM = memPoolSize; + } + else if (memPoolName == "dlaGlobalDRAM") + { + dlaGlobalDRAM = memPoolSize; + } + else if (!memPoolName.empty()) + { + throw std::invalid_argument(std::string("Unknown memory pool: ") + memPoolName); + } + } + + getAndDelOption(arguments, "--maxBatch", maxBatch); + getAndDelOption(arguments, "--minTiming", minTiming); + getAndDelOption(arguments, "--avgTiming", avgTiming); + + bool best{false}; + getAndDelOption(arguments, "--best", best); + if (best) + { + int8 = true; + fp16 = true; + } + + getAndDelOption(arguments, "--refit", refittable); + + // --vc and --versionCompatible are synonyms + getAndDelOption(arguments, "--vc", versionCompatible); + if (!versionCompatible) + { + getAndDelOption(arguments, "--versionCompatible", versionCompatible); + } + + getAndDelOption(arguments, "--excludeLeanRuntime", excludeLeanRuntime); + + getAndDelNegOption(arguments, "--noTF32", tf32); + getAndDelOption(arguments, "--fp16", fp16); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--fp8", fp8); + if (fp8 && int8) + { + throw std::invalid_argument("Invalid usage, fp8 and int8 aren't allowed to be enabled together."); + } + getAndDelOption(arguments, "--safe", safe); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, 
"--restricted", restricted); + if (getAndDelOption(arguments, "--buildOnly", skipInference)) + { + sample::gLogWarning << "--buildOnly flag has been deprecated by --skipInference flag." << std::endl; + } + getAndDelOption(arguments, "--skipInference", skipInference); + getAndDelOption(arguments, "--directIO", directIO); + + std::string precisionConstraintsString; + getAndDelOption(arguments, "--precisionConstraints", precisionConstraintsString); + if (!precisionConstraintsString.empty()) + { + const std::unordered_map precisionConstraintsMap + = {{"obey", PrecisionConstraints::kOBEY}, {"prefer", PrecisionConstraints::kPREFER}, + {"none", PrecisionConstraints::kNONE}}; + auto it = precisionConstraintsMap.find(precisionConstraintsString); + if (it == precisionConstraintsMap.end()) + { + throw std::invalid_argument(std::string("Unknown precision constraints: ") + precisionConstraintsString); + } + precisionConstraints = it->second; + } + else + { + precisionConstraints = PrecisionConstraints::kNONE; + } + + getLayerPrecisions(arguments, "--layerPrecisions", layerPrecisions); + getLayerOutputTypes(arguments, "--layerOutputTypes", layerOutputTypes); + getLayerDeviceTypes(arguments, "--layerDeviceTypes", layerDeviceTypes); + + if (layerPrecisions.empty() && layerOutputTypes.empty() && precisionConstraints != PrecisionConstraints::kNONE) + { + sample::gLogWarning << R"(When --precisionConstraints flag is set to "obey" or "prefer", please add )" + << "--layerPrecision/--layerOutputTypes flags to set layer-wise precisions and output " + << "types." 
<< std::endl; + } + else if ((!layerPrecisions.empty() || !layerOutputTypes.empty()) + && precisionConstraints == PrecisionConstraints::kNONE) + { + sample::gLogWarning << "--layerPrecision/--layerOutputTypes flags have no effect when --precisionConstraints " + << R"(flag is set to "none".)" << std::endl; + } + + getAndDelOption(arguments, "--sparsity", sparsity); + + bool calibCheck = getAndDelOption(arguments, "--calib", calibration); + if (int8 && calibCheck && !shapes.empty() && shapesCalib.empty()) + { + shapesCalib = shapes; + } + + std::string profilingVerbosityString; + if (getAndDelOption(arguments, "--nvtxMode", profilingVerbosityString)) + { + sample::gLogWarning << "--nvtxMode flag has been deprecated by --profilingVerbosity flag." << std::endl; + } + + getAndDelOption(arguments, "--profilingVerbosity", profilingVerbosityString); + if (profilingVerbosityString == "layer_names_only") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; + } + else if (profilingVerbosityString == "none") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kNONE; + } + else if (profilingVerbosityString == "detailed") + { + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } + else if (profilingVerbosityString == "default") + { + sample::gLogWarning << "--profilingVerbosity=default has been deprecated by " + "--profilingVerbosity=layer_names_only." + << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY; + } + else if (profilingVerbosityString == "verbose") + { + sample::gLogWarning << "--profilingVerbosity=verbose has been deprecated by --profilingVerbosity=detailed." 
+ << std::endl; + profilingVerbosity = nvinfer1::ProfilingVerbosity::kDETAILED; + } + else if (!profilingVerbosityString.empty()) + { + throw std::invalid_argument(std::string("Unknown profilingVerbosity: ") + profilingVerbosityString); + } + + if (getAndDelOption(arguments, "--loadEngine", engine)) + { + load = true; + } + if (getAndDelOption(arguments, "--saveEngine", engine)) + { + save = true; + } + if (load && save) + { + throw std::invalid_argument("Incompatible load and save engine options selected"); + } + + std::string tacticSourceArgs; + if (getAndDelOption(arguments, "--tacticSources", tacticSourceArgs)) + { + std::vector tacticList = splitToStringVec(tacticSourceArgs, ','); + for (auto& t : tacticList) + { + bool enable{false}; + if (t.front() == '+') + { + enable = true; + } + else if (t.front() != '-') + { + throw std::invalid_argument( + "Tactic source must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + t.erase(0, 1); + + const auto toUpper = [](std::string& sourceName) { + std::transform( + sourceName.begin(), sourceName.end(), sourceName.begin(), [](char c) { return std::toupper(c); }); + return sourceName; + }; + + nvinfer1::TacticSource source{}; + t = toUpper(t); + if (t == "CUBLAS") + { + source = nvinfer1::TacticSource::kCUBLAS; + } + else if (t == "CUBLASLT" || t == "CUBLAS_LT") + { + source = nvinfer1::TacticSource::kCUBLAS_LT; + } + else if (t == "CUDNN") + { + source = nvinfer1::TacticSource::kCUDNN; + } + else if (t == "EDGE_MASK_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kEDGE_MASK_CONVOLUTIONS; + } + else if (t == "JIT_CONVOLUTIONS") + { + source = nvinfer1::TacticSource::kJIT_CONVOLUTIONS; + } + else + { + throw std::invalid_argument(std::string("Unknown tactic source: ") + t); + } + + uint32_t sourceBit = 1U << static_cast(source); + + if (enable) + { + enabledTactics |= sourceBit; + } + else + { + disabledTactics |= sourceBit; + } + + if (enabledTactics & 
disabledTactics) + { + throw std::invalid_argument(std::string("Cannot enable and disable ") + t); + } + } + } + + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--heuristic", heuristic); + getAndDelOption(arguments, "--builderOptimizationLevel", builderOptimizationLevel); + + std::string hardwareCompatibleArgs; + getAndDelOption(arguments, "--hardwareCompatibilityLevel", hardwareCompatibleArgs); + if (hardwareCompatibleArgs == "none" || hardwareCompatibleArgs.empty()) + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kNONE; + } + else if (samplesCommon::toLower(hardwareCompatibleArgs) == "ampere+") + { + hardwareCompatibilityLevel = HardwareCompatibilityLevel::kAMPERE_PLUS; + } + else + { + throw std::invalid_argument(std::string("Unknown hardwareCompatibilityLevel: ") + hardwareCompatibleArgs + + ". 
Valid options: none, ampere+."); + } + + getAndDelOption(arguments, "--maxAuxStreams", maxAuxStreams); + + std::string previewFeaturesBuf; + getAndDelOption(arguments, "--preview", previewFeaturesBuf); + std::vector previewFeaturesVec{splitToStringVec(previewFeaturesBuf, ',')}; + for (auto featureName : previewFeaturesVec) + { + bool enable{false}; + if (featureName.front() == '+') + { + enable = true; + } + else if (featureName.front() != '-') + { + throw std::invalid_argument( + "Preview features must be prefixed with + or -, indicating whether it should be enabled or disabled " + "respectively."); + } + featureName.erase(0, 1); + + PreviewFeature feat{}; + if (featureName == "profileSharing0806") + { + feat = PreviewFeature::kPROFILE_SHARING_0806; + } + else if (featureName == "fasterDynamicShapes0805") + { + feat = PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805; + } + else if (featureName == "disableExternalTacticSourcesForCore0805") + { + feat = PreviewFeature::kDISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805; + } + else + { + throw std::invalid_argument(std::string("Unknown preview feature: ") + featureName); + } + previewFeatures[static_cast(feat)] = enable; + } + + getAndDelOption(arguments, "--tempdir", tempdir); + getTempfileControls(arguments, "--tempfileControls", tempfileControls); + + std::string runtimeMode; + getAndDelOption(arguments, "--useRuntime", runtimeMode); + if (runtimeMode == "full") + { + useRuntime = RuntimeMode::kFULL; + } + else if (runtimeMode == "dispatch") + { + useRuntime = RuntimeMode::kDISPATCH; + } + else if (runtimeMode == "lean") + { + useRuntime = RuntimeMode::kLEAN; + } + else if (!runtimeMode.empty()) + { + throw std::invalid_argument(std::string("Unknown useRuntime: ") + runtimeMode); + } + + if ((useRuntime == RuntimeMode::kDISPATCH || useRuntime == RuntimeMode::kLEAN) && !versionCompatible) + { + versionCompatible = true; + sample::gLogWarning << "Implicitly enabling --versionCompatible since --useRuntime=" << runtimeMode + 
<< " is set." << std::endl; + } + + if (useRuntime != RuntimeMode::kFULL && !load) + { + throw std::invalid_argument(std::string("Building a TensorRT engine requires --useRuntime=full.")); + } + + getAndDelOption(arguments, "--leanDLLPath", leanDLLPath); +} + +void SystemOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--device", device); + getAndDelOption(arguments, "--useDLACore", DLACore); + getAndDelOption(arguments, "--allowGPUFallback", fallback); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." << std::endl; + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--setPluginsToSerialize", pluginName)) + { + setPluginsToSerialize.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--dynamicPlugins", pluginName)) + { + dynamicPlugins.emplace_back(pluginName); + } + getAndDelOption(arguments, "--ignoreParsedPluginLibs", ignoreParsedPluginLibs); +} + +void InferenceOptions::parse(Arguments& arguments) +{ + + if (getAndDelOption(arguments, "--streams", infStreams)) + { + sample::gLogWarning << "--streams flag has been deprecated, use --infStreams flag instead." 
<< std::endl; + } + getAndDelOption(arguments, "--infStreams", infStreams); + + getAndDelOption(arguments, "--iterations", iterations); + getAndDelOption(arguments, "--duration", duration); + getAndDelOption(arguments, "--warmUp", warmup); + getAndDelOption(arguments, "--sleepTime", sleep); + getAndDelOption(arguments, "--idleTime", idle); + bool exposeDMA{false}; + if (getAndDelOption(arguments, "--exposeDMA", exposeDMA)) + { + overlap = !exposeDMA; + } + getAndDelOption(arguments, "--noDataTransfers", skipTransfers); + getAndDelOption(arguments, "--useManagedMemory", useManaged); + getAndDelOption(arguments, "--useSpinWait", spin); + getAndDelOption(arguments, "--threads", threads); + getAndDelOption(arguments, "--useCudaGraph", graph); + getAndDelOption(arguments, "--separateProfileRun", rerun); + getAndDelOption(arguments, "--timeDeserialize", timeDeserialize); + getAndDelOption(arguments, "--timeRefit", timeRefit); + getAndDelOption(arguments, "--persistentCacheRatio", persistentCacheRatio); + + std::string list; + getAndDelOption(arguments, "--loadInputs", list); + std::vector inputsList{splitToStringVec(list, ',')}; + splitInsertKeyValue(inputsList, inputs); + + getShapesInference(arguments, shapes, "--shapes"); + getAndDelOption(arguments, "--batch", batch); +} + +void ReportingOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "--avgRuns", avgs); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "--dumpRefit", refit); + getAndDelOption(arguments, "--dumpOutput", output); + getAndDelOption(arguments, "--dumpRawBindingsToFile", dumpRawBindings); + getAndDelOption(arguments, "--dumpProfile", profile); + getAndDelOption(arguments, "--dumpLayerInfo", layerInfo); + getAndDelOption(arguments, "--exportTimes", exportTimes); + getAndDelOption(arguments, "--exportOutput", exportOutput); + getAndDelOption(arguments, "--exportProfile", exportProfile); + getAndDelOption(arguments, "--exportLayerInfo", exportLayerInfo); + 
+ std::string percentileString; + getAndDelOption(arguments, "--percentile", percentileString); + std::vector percentileStrings = splitToStringVec(percentileString, ','); + if (!percentileStrings.empty()) + { + percentiles.clear(); + } + for (const auto& p : percentileStrings) + { + percentiles.push_back(stringToValue(p)); + } + + for (auto percentile : percentiles) + { + if (percentile < 0.F || percentile > 100.F) + { + throw std::invalid_argument(std::string("Percentile ") + std::to_string(percentile) + "is not in [0,100]"); + } + } +} + +bool parseHelp(Arguments& arguments) +{ + bool helpLong{false}; + bool helpShort{false}; + getAndDelOption(arguments, "--help", helpLong); + getAndDelOption(arguments, "-h", helpShort); + return helpLong || helpShort; +} + +void AllOptions::parse(Arguments& arguments) +{ + model.parse(arguments); + build.parse(arguments); + system.parse(arguments); + inference.parse(arguments); + + // Use explicitBatch when input model is ONNX or when dynamic shapes are used. + const bool isOnnx{model.baseModel.format == ModelFormat::kONNX}; + const bool hasDynamicShapes{!build.shapes.empty() || !inference.shapes.empty()}; + const bool detectedExplicitBatch = isOnnx || hasDynamicShapes; + + // Throw an error if user tries to use --batch or --maxBatch when the engine has explicit batch dim. + const bool maxBatchWasSet{build.maxBatch != maxBatchNotProvided}; + const bool batchWasSet{inference.batch != batchNotProvided}; + if (detectedExplicitBatch && (maxBatchWasSet || batchWasSet)) + { + throw std::invalid_argument( + "The --batch and --maxBatch flags should not be used when the input model is ONNX or when dynamic shapes " + "are provided. 
Please use --optShapes and --shapes to set input shapes instead."); + } + + if (build.useRuntime != RuntimeMode::kFULL && inference.timeRefit) + { + throw std::invalid_argument("--timeRefit requires --useRuntime=full."); + } + + // If batch and/or maxBatch is not set and the engine has implicit batch dim, set them to default values. + if (!detectedExplicitBatch) + { + // If batch is not set, set it to default value. + if (!batchWasSet) + { + inference.batch = defaultBatch; + } + // If maxBatch is not set, set it to be equal to batch. + if (!maxBatchWasSet) + { + build.maxBatch = inference.batch; + } + // MaxBatch should not be less than batch. + if (build.maxBatch < inference.batch) + { + throw std::invalid_argument("Build max batch " + std::to_string(build.maxBatch) + + " is less than inference batch " + std::to_string(inference.batch)); + } + } + + // Propagate shape profile between builder and inference + for (auto const& s : build.shapes) + { + if (inference.shapes.find(s.first) == inference.shapes.end()) + { + insertShapesInference( + inference.shapes, s.first, s.second[static_cast(nvinfer1::OptProfileSelector::kOPT)]); + } + } + for (auto const& s : inference.shapes) + { + if (build.shapes.find(s.first) == build.shapes.end()) + { + // assume min/opt/max all the same + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMIN, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kOPT, s.first, s.second); + insertShapesBuild(build.shapes, nvinfer1::OptProfileSelector::kMAX, s.first, s.second); + } + } + + // Set nvtxVerbosity to be the same as build-time profilingVerbosity. 
+ inference.nvtxVerbosity = build.profilingVerbosity; + + reporting.parse(arguments); + helps = parseHelp(arguments); + + if (!helps) + { + if (!build.load && model.baseModel.format == ModelFormat::kANY) + { + throw std::invalid_argument("Model missing or format not recognized"); + } + if (build.safe && system.DLACore >= 0) + { + auto checkSafeDLAFormats = [](std::vector const& fmt) { + return fmt.empty() ? false : std::all_of(fmt.begin(), fmt.end(), [](IOFormat const& pair) { + bool supported{false}; + bool const isDLA_LINEAR{ + pair.second == 1U << static_cast(nvinfer1::TensorFormat::kDLA_LINEAR)}; + bool const isCHW4{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW4)}; + bool const isCHW32{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW32)}; + bool const isCHW16{pair.second == 1U << static_cast(nvinfer1::TensorFormat::kCHW16)}; + supported |= pair.first == nvinfer1::DataType::kINT8 && (isDLA_LINEAR || isCHW4 || isCHW32); + supported |= pair.first == nvinfer1::DataType::kHALF && (isDLA_LINEAR || isCHW4 || isCHW16); + return supported; + }); + }; + if (!checkSafeDLAFormats(build.inputFormats) || !checkSafeDLAFormats(build.outputFormats)) + { + throw std::invalid_argument( + "I/O formats for safe DLA capability are restricted to fp16/int8:dla_linear, fp16:chw16 or " + "int8:chw32"); + } + if (system.fallback) + { + throw std::invalid_argument("GPU fallback (--allowGPUFallback) not allowed for safe DLA capability"); + } + } + } +} + +void TaskInferenceOptions::parse(Arguments& arguments) +{ + getAndDelOption(arguments, "engine", engine); + getAndDelOption(arguments, "device", device); + getAndDelOption(arguments, "batch", batch); + getAndDelOption(arguments, "DLACore", DLACore); + getAndDelOption(arguments, "graph", graph); + getAndDelOption(arguments, "persistentCacheRatio", persistentCacheRatio); +} + +void SafeBuilderOptions::parse(Arguments& arguments) +{ + auto getFormats = [&arguments](std::vector& formatsVector, const char* 
argument) { + std::string list; + getAndDelOption(arguments, argument, list); + std::vector formats{splitToStringVec(list, ',')}; + for (const auto& f : formats) + { + formatsVector.push_back(stringToValue(f)); + } + }; + + getAndDelOption(arguments, "--serialized", serialized); + getAndDelOption(arguments, "--onnx", onnxModelFile); + getAndDelOption(arguments, "--help", help); + getAndDelOption(arguments, "-h", help); + getAndDelOption(arguments, "--verbose", verbose); + getAndDelOption(arguments, "-v", verbose); + getFormats(inputFormats, "--inputIOFormats"); + getFormats(outputFormats, "--outputIOFormats"); + getAndDelOption(arguments, "--int8", int8); + getAndDelOption(arguments, "--calib", calibFile); + getAndDelOption(arguments, "--consistency", consistency); + getAndDelOption(arguments, "--std", standard); + std::string pluginName; + while (getAndDelOption(arguments, "--plugins", pluginName)) + { + sample::gLogWarning << "--plugins flag has been deprecated, use --staticPlugins flag instead." 
<< std::endl; + plugins.emplace_back(pluginName); + } + while (getAndDelOption(arguments, "--staticPlugins", pluginName)) + { + plugins.emplace_back(pluginName); + } + bool noBuilderCache{false}; + getAndDelOption(arguments, "--noBuilderCache", noBuilderCache); + getAndDelOption(arguments, "--timingCacheFile", timingCacheFile); + getAndDelOption(arguments, "--minTiming", minTiming); + getAndDelOption(arguments, "--avgTiming", avgTiming); + if (noBuilderCache) + { + timingCacheMode = TimingCacheMode::kDISABLE; + } + else if (!timingCacheFile.empty()) + { + timingCacheMode = TimingCacheMode::kGLOBAL; + } + else + { + timingCacheMode = TimingCacheMode::kLOCAL; + } + getAndDelOption(arguments, "--sparsity", sparsity); +} + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options) +{ + os << "=== Model Options ===" << std::endl; + + os << "Format: "; + switch (options.format) + { + case ModelFormat::kCAFFE: + { + os << "Caffe"; + break; + } + case ModelFormat::kONNX: + { + os << "ONNX"; + break; + } + case ModelFormat::kUFF: + { + os << "UFF"; + break; + } + case ModelFormat::kANY: os << "*"; break; + } + os << std::endl << "Model: " << options.model << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, const UffInput& input) +{ + os << "Uff Inputs Layout: " << (input.NHWC ? 
"NHWC" : "NCHW") << std::endl; + for (const auto& i : input.inputs) + { + os << "Input: " << i.first << "," << i.second.d[0] << "," << i.second.d[1] << "," << i.second.d[2] << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ModelOptions& options) +{ + os << options.baseModel; + switch (options.baseModel.format) + { + case ModelFormat::kCAFFE: + { + os << "Prototxt: " << options.prototxt << std::endl; + break; + } + case ModelFormat::kUFF: + { + os << options.uffInputs; + break; + } + case ModelFormat::kONNX: // Fallthrough: No options to report for ONNX or the generic case + case ModelFormat::kANY: break; + } + + os << "Output:"; + for (const auto& o : options.outputs) + { + os << " " << o; + } + os << std::endl; + + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DataType dtype) +{ + switch (dtype) + { + case nvinfer1::DataType::kFLOAT: + { + os << "fp32"; + break; + } + case nvinfer1::DataType::kHALF: + { + os << "fp16"; + break; + } + case nvinfer1::DataType::kINT8: + { + os << "int8"; + break; + } + case nvinfer1::DataType::kINT32: + { + os << "int32"; + break; + } + case nvinfer1::DataType::kBOOL: + { + os << "bool"; + break; + } + case nvinfer1::DataType::kUINT8: + { + os << "uint8"; + break; + } + case nvinfer1::DataType::kFP8: + { + os << "fp8"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, IOFormat const& format) +{ + os << format.first << ":"; + + for (int32_t f = 0; f < nvinfer1::EnumMax(); ++f) + { + if ((1U << f) & format.second) + { + if (f) + { + os << "+"; + } + switch (nvinfer1::TensorFormat(f)) + { + case nvinfer1::TensorFormat::kLINEAR: + { + os << "chw"; + break; + } + case nvinfer1::TensorFormat::kCHW2: + { + os << "chw2"; + break; + } + case nvinfer1::TensorFormat::kHWC8: + { + os << "hwc8"; + break; + } + case nvinfer1::TensorFormat::kHWC16: + { + os << "hwc16"; + break; + } + case nvinfer1::TensorFormat::kCHW4: + { + os << "chw4"; + break; + } + case 
nvinfer1::TensorFormat::kCHW16: + { + os << "chw16"; + break; + } + case nvinfer1::TensorFormat::kCHW32: + { + os << "chw32"; + break; + } + case nvinfer1::TensorFormat::kDHWC8: + { + os << "dhwc8"; + break; + } + case nvinfer1::TensorFormat::kCDHW32: + { + os << "cdhw32"; + break; + } + case nvinfer1::TensorFormat::kHWC: + { + os << "hwc"; + break; + } + case nvinfer1::TensorFormat::kDHWC: + { + os << "dhwc"; + break; + } + case nvinfer1::TensorFormat::kDLA_LINEAR: + { + os << "dla_linear"; + break; + } + case nvinfer1::TensorFormat::kDLA_HWC4: + { + os << "dla_hwc4"; + break; + } + } + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, nvinfer1::DeviceType devType) +{ + switch (devType) + { + case nvinfer1::DeviceType::kGPU: + { + os << "GPU"; + break; + } + case nvinfer1::DeviceType::kDLA: + { + os << "DLA"; + break; + } + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims) +{ + int32_t i = 0; + for (const auto& d : dims) + { + if (!d.size()) + { + break; + } + os << (i ? "+" : "") << d; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, LayerPrecisions const& layerPrecisions) +{ + int32_t i = 0; + for (auto const& layerPrecision : layerPrecisions) + { + os << (i ? "," : "") << layerPrecision.first << ":" << layerPrecision.second; + ++i; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, LayerDeviceTypes const& layerDeviceTypes) +{ + int32_t i = 0; + for (auto const& layerDevicePair : layerDeviceTypes) + { + os << (i++ ? 
", " : "") << layerDevicePair.first << ":" << layerDevicePair.second; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options) +{ + // clang-format off + os << "=== Build Options ===" << std::endl << + + "Max batch: "; printBatch(os, options.maxBatch) << std::endl << + "Memory Pools: "; printMemoryPools(os, options) << std::endl << + "minTiming: " << options.minTiming << std::endl << + "avgTiming: " << options.avgTiming << std::endl << + "Precision: "; printPrecision(os, options) << std::endl << + "LayerPrecisions: " << options.layerPrecisions << std::endl << + "Layer Device Types: " << options.layerDeviceTypes << std::endl << + "Calibration: " << (options.int8 && options.calibration.empty() ? "Dynamic" : options.calibration.c_str()) << std::endl << + "Refit: " << boolToEnabled(options.refittable) << std::endl << + "Version Compatible: " << boolToEnabled(options.versionCompatible) << std::endl << + "TensorRT runtime: " << options.useRuntime << std::endl << + "Lean DLL Path: " << options.leanDLLPath << std::endl << + "Tempfile Controls: "; printTempfileControls(os, options.tempfileControls) << std::endl << + "Exclude Lean Runtime: " << boolToEnabled(options.excludeLeanRuntime) << std::endl << + "Sparsity: "; printSparsity(os, options) << std::endl << + "Safe mode: " << boolToEnabled(options.safe) << std::endl << + "DirectIO mode: " << boolToEnabled(options.directIO) << std::endl << + "Restricted mode: " << boolToEnabled(options.restricted) << std::endl << + "Skip inference: " << boolToEnabled(options.skipInference) << std::endl << + "Save engine: " << (options.save ? options.engine : "") << std::endl << + "Load engine: " << (options.load ? 
options.engine : "") << std::endl << + "Profiling verbosity: " << static_cast(options.profilingVerbosity) << std::endl << + "Tactic sources: "; printTacticSources(os, options.enabledTactics, options.disabledTactics) << std::endl << + "timingCacheMode: "; printTimingCache(os, options.timingCacheMode) << std::endl << + "timingCacheFile: " << options.timingCacheFile << std::endl << + "Heuristic: " << boolToEnabled(options.heuristic) << std::endl << + "Preview Features: "; printPreviewFlags(os, options) << std::endl << + "MaxAuxStreams: " << options.maxAuxStreams << std::endl << + "BuilderOptimizationLevel: " << options.builderOptimizationLevel << std::endl; + // clang-format on + + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for (const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + printIOFormats(os, "Input(s)", options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + printShapes(os, "build", options.shapes); + printShapes(os, "calibration", options.shapesCalib); + + return os; +} + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + + "Device: " << options.device << std::endl << + "DLACore: " << (options.DLACore != -1 ? std::to_string(options.DLACore) : "") << + (options.DLACore != -1 && options.fallback ? 
"(With GPU fallback)" : "") << std::endl; + os << "Plugins:"; + + for (const auto& p : options.plugins) + { + os << " " << p; + } + os << std::endl; + + os << "setPluginsToSerialize:"; + + for (const auto& p : options.setPluginsToSerialize) + { + os << " " << p; + } + os << std::endl; + + os << "dynamicPlugins:"; + + for (const auto& p : options.dynamicPlugins) + { + os << " " << p; + } + os << std::endl; + + os << "ignoreParsedPluginLibs: " << options.ignoreParsedPluginLibs << std::endl; + os << std::endl; + + return os; + // clang-format on +} + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + + "Batch: "; + if (options.batch && options.shapes.empty()) + { + os << options.batch << std::endl; + } + else + { + os << "Explicit" << std::endl; + } + printShapes(os, "inference", options.shapes); + os << "Iterations: " << options.iterations << std::endl << + "Duration: " << options.duration << "s (+ " + << options.warmup << "ms warm up)" << std::endl << + "Sleep time: " << options.sleep << "ms" << std::endl << + "Idle time: " << options.idle << "ms" << std::endl << + "Inference Streams: " << options.infStreams << std::endl << + "ExposeDMA: " << boolToEnabled(!options.overlap) << std::endl << + "Data transfers: " << boolToEnabled(!options.skipTransfers) << std::endl << + "Spin-wait: " << boolToEnabled(options.spin) << std::endl << + "Multithreading: " << boolToEnabled(options.threads) << std::endl << + "CUDA Graph: " << boolToEnabled(options.graph) << std::endl << + "Separate profiling: " << boolToEnabled(options.rerun) << std::endl << + "Time Deserialize: " << boolToEnabled(options.timeDeserialize) << std::endl << + "Time Refit: " << boolToEnabled(options.timeRefit) << std::endl << + "NVTX verbosity: " << static_cast(options.nvtxVerbosity) << std::endl << + "Persistent Cache Ratio: " << static_cast(options.persistentCacheRatio) << std::endl; + // clang-format on + 
+ os << "Inputs:" << std::endl; + for (const auto& input : options.inputs) + { + os << input.first << "<-" << input.second << std::endl; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options) +{ + // clang-format off + os << "=== Reporting Options ===" << std::endl << + "Verbose: " << boolToEnabled(options.verbose) << std::endl << + "Averages: " << options.avgs << " inferences" << std::endl << + "Percentiles: " << joinValuesToString(options.percentiles, ",") << std::endl << + "Dump refittable layers:" << boolToEnabled(options.refit) << std::endl << + "Dump output: " << boolToEnabled(options.output) << std::endl << + "Profile: " << boolToEnabled(options.profile) << std::endl << + "Export timing to JSON file: " << options.exportTimes << std::endl << + "Export output to JSON file: " << options.exportOutput << std::endl << + "Export profile to JSON file: " << options.exportProfile << std::endl; + // clang-format on + + return os; +} + +std::ostream& operator<<(std::ostream& os, const AllOptions& options) +{ + os << options.model << options.build << options.system << options.inference << options.reporting << std::endl; + return os; +} + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options) +{ + auto printIOFormats = [](std::ostream& os, const char* direction, const std::vector formats) { + if (formats.empty()) + { + os << direction << "s format: fp32:CHW" << std::endl; + } + else + { + for (const auto& f : formats) + { + os << direction << ": " << f << std::endl; + } + } + }; + + os << "=== Build Options ===" << std::endl; + os << "Model ONNX: " << options.onnxModelFile << std::endl; + + os << "Precision: FP16"; + if (options.int8) + { + os << " + INT8"; + } + if (options.fp8) + { + os << " + FP8"; + } + os << std::endl; + os << "Calibration file: " << options.calibFile << std::endl; + os << "Serialized Network: " << options.serialized << std::endl; + + printIOFormats(os, "Input(s)", 
options.inputFormats); + printIOFormats(os, "Output(s)", options.outputFormats); + + os << "Plugins:"; + for (const auto& p : options.plugins) + { + os << " " << p; + } + + os << "timingCacheMode: "; + printTimingCache(os, options.timingCacheMode) << std::endl; + os << "timingCacheFile: " << options.timingCacheFile << std::endl; + os << std::endl; + return os; +} + +void BaseModelOptions::help(std::ostream& os) +{ + // clang-format off + os << " --uff= UFF model" << std::endl << + " --onnx= ONNX model" << std::endl << + " --model= Caffe model (default = no model, random weights used)" << std::endl; + // clang-format on +} + +void UffInput::help(std::ostream& os) +{ + // clang-format off + os << " --uffInput=,X,Y,Z Input blob name and its dimensions (X,Y,Z=C,H,W), it can be specified " + "multiple times; at least one is required for UFF models" << std::endl << + " --uffNHWC Set if inputs are in the NHWC layout instead of NCHW (use " << + "X,Y,Z=H,W,C order in --uffInput)" << std::endl; + // clang-format on +} + +void ModelOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Model Options ===" << std::endl; + BaseModelOptions::help(os); + os << " --deploy= Caffe prototxt file" << std::endl << + " --output=[,]* Output names (it can be specified multiple times); at least one output " + "is required for UFF and Caffe" << std::endl; + UffInput::help(os); + // clang-format on +} + +void BuildOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Build Options ===" "\n" + " --maxBatch Set max batch size and build an implicit batch engine (default = same size as --batch)" "\n" + " This option should not be used when the input model is ONNX or when dynamic shapes are provided." 
"\n" + " --minShapes=spec Build with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapes=spec Build with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapes=spec Build with dynamic shapes using a profile with the max shapes provided" "\n" + " --minShapesCalib=spec Calibrate with dynamic shapes using a profile with the min shapes provided" "\n" + " --optShapesCalib=spec Calibrate with dynamic shapes using a profile with the opt shapes provided" "\n" + " --maxShapesCalib=spec Calibrate with dynamic shapes using a profile with the max shapes provided" "\n" + " Note: All three of min, opt and max shapes must be supplied." "\n" + " However, if only opt shapes is supplied then it will be expanded so" "\n" + " that min shapes and max shapes are set to the same values as opt shapes." "\n" + " Input names can be wrapped with escaped single quotes (ex: 'Input:0')." "\n" + " Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128" "\n" + " Each input shape is supplied as a key-value pair where key is the input name and" "\n" + " value is the dimensions (including the batch dimension) to be used for that input." "\n" + " Each key-value pair has the key and value separated using a colon (:)." "\n" + " Multiple input shapes can be provided via comma-separated key-value pairs." "\n" + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" "\n" + " See --outputIOFormats help for the grammar of type and format list." "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " inputs following the same order as network inputs ID (even if only one input" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." 
"\n" + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" "\n" + " Note: If this option is specified, please set comma-separated types and formats for all" "\n" + " outputs following the same order as network outputs ID (even if only one output" "\n" + " needs specifying IO format) or set the type and format once for broadcasting." "\n" + R"( IO Formats: spec ::= IOfmt[","spec])" "\n" + " IOfmt ::= type:fmt" "\n" + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" "\n" + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" "\n" + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" "\n" + " --workspace=N Set workspace size in MiB." "\n" + " --memPoolSize=poolspec Specify the size constraints of the designated memory pool(s) in MiB." "\n" + " Note: Also accepts decimal sizes, e.g. 0.25MiB. Will be rounded down to the nearest integer bytes." "\n" + R"( Pool constraint: poolspec ::= poolfmt[","poolspec])" "\n" + " poolfmt ::= pool:sizeInMiB" "\n" + R"( pool ::= "workspace"|"dlaSRAM"|"dlaLocalDRAM"|"dlaGlobalDRAM")" "\n" + " --profilingVerbosity=mode Specify profiling verbosity. mode ::= layer_names_only|detailed|none (default = layer_names_only)" "\n" + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " + << defaultMinTiming << ")" "\n" + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " + << defaultAvgTiming << ")" "\n" + " --refit Mark the engine as refittable. This will allow the inspection of refittable layers " "\n" + " and weights within the engine." "\n" + " --versionCompatible, --vc Mark the engine as version compatible. This allows the engine to be used with newer versions" "\n" + " of TensorRT on the same host OS, as well as TensorRT's dispatch and lean runtimes." "\n" + " Only supported with explicit batch." "\n" + R"( --useRuntime=runtime TensorRT runtime to execute engine. 
"lean" and "dispatch" require loading VC engine and do)" "\n" + " not support building an engine." "\n" + R"( runtime::= "full"|"lean"|"dispatch")" "\n" + " --leanDLLPath= External lean runtime DLL to use in version compatiable mode." "\n" + " --excludeLeanRuntime When --versionCompatible is enabled, this flag indicates that the generated engine should" "\n" + " not include an embedded lean runtime. If this is set, the user must explicitly specify a" "\n" + " valid lean runtime to use when loading the engine. Only supported with explicit batch" "\n" + " and weights within the engine." "\n" + " --sparsity=spec Control sparsity (default = disabled). " "\n" + R"( Sparsity: spec ::= "disable", "enable", "force")" "\n" + " Note: Description about each of these options is as below" "\n" + " disable = do not enable sparse tactics in the builder (this is the default)" "\n" + " enable = enable sparse tactics in the builder (but these tactics will only be" "\n" + " considered if the weights have the right sparsity pattern)" "\n" + " force = enable sparse tactics in the builder and force-overwrite the weights to have" "\n" + " a sparsity pattern (even if you loaded a model yourself)" "\n" + " --noTF32 Disable tf32 precision (default is to enable tf32, in addition to fp32)" "\n" + " --fp16 Enable fp16 precision, in addition to fp32 (default = disabled)" "\n" + " --int8 Enable int8 precision, in addition to fp32 (default = disabled)" "\n" + " --fp8 Enable fp8 precision, in addition to fp32 (default = disabled)" "\n" + " --best Enable all precisions to achieve the best performance (default = disabled)" "\n" + " --directIO Avoid reformatting at network boundaries. (default = disabled)" "\n" + " --precisionConstraints=spec Control precision constraint setting. 
(default = none)" "\n" + R"( Precision Constraints: spec ::= "none" | "obey" | "prefer")" "\n" + " none = no constraints" "\n" + " prefer = meet precision constraints set by --layerPrecisions/--layerOutputTypes if possible" "\n" + " obey = meet precision constraints set by --layerPrecisions/--layerOutputTypes or fail" "\n" + " otherwise" "\n" + " --layerPrecisions=spec Control per-layer precision constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none))" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. "*" can be used as a)" "\n" + " layerName to specify the default precision for all the unspecified layers." "\n" + R"( Per-layer precision spec ::= layerPrecision[","spec])" "\n" + R"( layerPrecision ::= layerName":"precision)" "\n" + R"( precision ::= "fp32"|"fp16"|"int32"|"int8")" "\n" + " --layerOutputTypes=spec Control per-layer output type constraints. Effective only when precisionConstraints is set to" "\n" + R"( "obey" or "prefer". (default = none)" "\n" + R"( The specs are read left-to-right, and later ones override earlier ones. "*" can be used as a)" "\n" + " layerName to specify the default precision for all the unspecified layers. If a layer has more than""\n" + R"( one output, then multiple types separated by "+" can be provided for this layer.)" "\n" + R"( Per-layer output type spec ::= layerOutputTypes[","spec])" "\n" + R"( layerOutputTypes ::= layerName":"type)" "\n" + R"( type ::= "fp32"|"fp16"|"int32"|"int8"["+"type])" "\n" + " --layerDeviceTypes=spec Specify layer-specific device type." "\n" + " The specs are read left-to-right, and later ones override earlier ones. If a layer does not have" "\n" + " a device type specified, the layer will opt for the default device type." 
"\n" + R"( Per-layer device type spec ::= layerDeviceTypePair[","spec])" "\n" + R"( layerDeviceTypePair ::= layerName":"deviceType)" "\n" + R"( deviceType ::= "GPU"|"DLA")" "\n" + " --calib= Read INT8 calibration cache file" "\n" + " --safe Enable build safety certified engine" "\n" + " --consistency Perform consistency checking on safety certified engine" "\n" + " --restricted Enable safety scope checking with kSAFETY_SCOPE build flag" "\n" + " --saveEngine= Save the serialized engine" "\n" + " --loadEngine= Load a serialized engine" "\n" + " --tacticSources=tactics Specify the tactics to be used by adding (+) or removing (-) tactics from the default " "\n" + " tactic sources (default = all available tactics)." "\n" + " Note: Currently only cuDNN, cuBLAS, cuBLAS-LT, and edge mask convolutions are listed as optional" "\n" + " tactics." "\n" + R"( Tactic Sources: tactics ::= [","tactic])" "\n" + " tactic ::= (+|-)lib" "\n" + R"( lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"|"EDGE_MASK_CONVOLUTIONS")" "\n" + R"( |"JIT_CONVOLUTIONS")" "\n" + " For example, to disable cudnn and enable cublas: --tacticSources=-CUDNN,+CUBLAS" "\n" + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" "\n" + " --heuristic Enable tactic selection heuristic in builder (default is to disable the heuristic)" "\n" + " --timingCacheFile= Save/load the serialized global timing cache" "\n" + " --preview=features Specify preview feature to be used by adding (+) or removing (-) preview features from the default" "\n" + R"( Preview Features: features ::= [","feature])" "\n" + " feature ::= (+|-)flag" "\n" + R"( flag ::= "fasterDynamicShapes0805")" "\n" + R"( |"disableExternalTacticSourcesForCore0805")" "\n" + R"( |"profileSharing0806")" "\n" + " --builderOptimizationLevel Set the builder optimization level. (default is 3" "\n" + " Higher level allows TensorRT to spend more building time for more optimization options." "\n" + " The default level is 3. 
Valid values include integers from 0 to the maximum optimization level," "\n" + " which is currently 5." "\n" + " --hardwareCompatibilityLevel=mode Make the engine file compatible with other GPU architectures. (default = none)" "\n" + R"( Hardware Compatibility Level: mode ::= "none" | "ampere+")" "\n" + " none = no compatibility" "\n" + " ampere+ = compatible with Ampere and newer GPUs" "\n" + " --tempdir= Overrides the default temporary directory TensorRT will use when creating temporary files." "\n" + " See IRuntime::setTemporaryDirectory API documentation for more information." "\n" + " --tempfileControls=controls Controls what TensorRT is allowed to use when creating temporary executable files." "\n" + " Should be a comma-separated list with entries in the format (in_memory|temporary):(allow|deny)." "\n" + " in_memory: Controls whether TensorRT is allowed to create temporary in-memory executable files." "\n" + " temporary: Controls whether TensorRT is allowed to create temporary executable files in the" "\n" + " filesystem (in the directory given by --tempdir)." "\n" + " For example, to allow in-memory files and disallow temporary files:" "\n" + " --tempfileControls=in_memory:allow,temporary:deny" "\n" + R"( If a flag is unspecified, the default behavior is "allow".)" "\n" + " --maxAuxStreams=N Set maximum number of auxiliary streams per inference stream that TRT is allowed to use to run " "\n" + " kernels in parallel if the network contains ops that can run in parallel, with the cost of more " "\n" + " memory usage. Set this to 0 for optimal memory usage. 
(default = using heuristics)" "\n" + ; + // clang-format on + os << std::flush; +} + +void SystemOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== System Options ===" << std::endl << + " --device=N Select cuda device N (default = " << defaultDevice << ")" << std::endl << + " --useDLACore=N Select DLA core N for layers that support DLA (default = none)" << std::endl << + " --allowGPUFallback When DLA is enabled, allow GPU fallback for unsupported layers " + "(default = disabled)" << std::endl << + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --dynamicPlugins Plugin library (.so) to load dynamically and may be serialized with the engine if they are included in --setPluginsToSerialize (can be specified multiple times)" << std::endl << + " --setPluginsToSerialize Plugin library (.so) to be serialized with the engine (can be specified multiple times)" << std::endl << + " --ignoreParsedPluginLibs By default, when building a version-compatible engine, plugin libraries specified by the ONNX parser " << std::endl << + " are implicitly serialized with the engine (unless --excludeLeanRuntime is specified) and loaded dynamically. " << std::endl << + " Enable this flag to ignore these plugin libraries instead." << std::endl; + // clang-format on +} + +void InferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Inference Options ===" << std::endl << + " --batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used when the engine is built from an ONNX model or when dynamic" << std::endl << + " shapes are provided when the engine is built." << std::endl << + " --shapes=spec Set input shapes for dynamic shapes inference inputs." 
<< std::endl << + R"( Note: Input names can be wrapped with escaped single quotes (ex: 'Input:0').)" << std::endl << + " Example input shapes spec: input0:1x3x256x256, input1:1x3x128x128" << std::endl << + " Each input shape is supplied as a key-value pair where key is the input name and" << std::endl << + " value is the dimensions (including the batch dimension) to be used for that input." << std::endl << + " Each key-value pair has the key and value separated using a colon (:)." << std::endl << + " Multiple input shapes can be provided via comma-separated key-value pairs." << std::endl << + " --loadInputs=spec Load input values from files (default = generate random inputs). Input names can be " + "wrapped with single quotes (ex: 'Input:0')" << std::endl << + R"( Input values spec ::= Ival[","spec])" << std::endl << + R"( Ival ::= name":"file)" << std::endl << + " --iterations=N Run at least N inference iterations (default = " << defaultIterations << ")" << std::endl << + " --warmUp=N Run for N milliseconds to warmup before measuring performance (default = " + << defaultWarmUp << ")" << std::endl << + " --duration=N Run performance measurements for at least N seconds wallclock time (default = " + << defaultDuration << ")" << std::endl << + " --sleepTime=N Delay inference start with a gap of N milliseconds between launch and compute " + "(default = " << defaultSleep << ")" << std::endl << + " --idleTime=N Sleep N milliseconds between two continuous iterations" + "(default = " << defaultIdle << ")" << std::endl << + " --infStreams=N Instantiate N engines to run inference concurrently (default = " << defaultStreams << ")" << std::endl << + " --exposeDMA Serialize DMA transfers to and from device (default = disabled)." << std::endl << + " --noDataTransfers Disable DMA transfers to and from device (default = enabled)." << std::endl << + " --useManagedMemory Use managed memory instead of separate host and device allocations (default = disabled)." 
<< std::endl << + " --useSpinWait Actively synchronize on GPU events. This option may decrease synchronization time but " + "increase CPU usage and power (default = disabled)" << std::endl << + " --threads Enable multithreading to drive engines with independent threads" + " or speed up refitting (default = disabled) " << std::endl << + " --useCudaGraph Use CUDA graph to capture engine execution and then launch inference (default = disabled)." << std::endl << + " This flag may be ignored if the graph capture fails." << std::endl << + " --timeDeserialize Time the amount of time it takes to deserialize the network and exit." << std::endl << + " --timeRefit Time the amount of time it takes to refit the engine before inference." << std::endl << + " --separateProfileRun Do not attach the profiler in the benchmark run; if profiling is enabled, a second " + "profile run will be executed (default = disabled)" << std::endl << + " --skipInference Exit after the engine has been built and skip inference perf measurement " + "(default = disabled)" << std::endl << + " --persistentCacheRatio Set the persistentCacheLimit in ratio, 0.5 represent half of max persistent L2 size " + "(default = 0)" << std::endl; + // clang-format on +} + +void ReportingOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Reporting Options ===" << std::endl << + " --verbose Use verbose logging (default = false)" << std::endl << + " --avgRuns=N Report performance measurements averaged over N consecutive " + "iterations (default = " << defaultAvgRuns << ")" << std::endl << + " --percentile=P1,P2,P3,... Report performance for the P1,P2,P3,... 
percentages (0<=P_i<=100, 0 " + "representing max perf, and 100 representing min perf; (default" + " = " << joinValuesToString(defaultPercentiles, ",") << "%)" << std::endl << + " --dumpRefit Print the refittable layers and weights from a refittable " + "engine" << std::endl << + " --dumpOutput Print the output tensor(s) of the last inference iteration " + "(default = disabled)" << std::endl << + " --dumpRawBindingsToFile Print the input/output tensor(s) of the last inference iteration to file" + "(default = disabled)" << std::endl << + " --dumpProfile Print profile information per layer (default = disabled)" << std::endl << + " --dumpLayerInfo Print layer information of the engine to console " + "(default = disabled)" << std::endl << + " --exportTimes= Write the timing results in a json file (default = disabled)" << std::endl << + " --exportOutput= Write the output tensors to a json file (default = disabled)" << std::endl << + " --exportProfile= Write the profile information per layer in a json file " + "(default = disabled)" << std::endl << + " --exportLayerInfo= Write the layer information of the engine in a json file " + "(default = disabled)" << std::endl; + // clang-format on +} + +void TaskInferenceOptions::help(std::ostream& os) +{ + // clang-format off + os << "=== Task Inference Options ===" << std::endl << + " engine= Specify a serialized engine for this task" << std::endl << + " device=N Specify a GPU device for this task" << std::endl << + " DLACore=N Specify a DLACore for this task" << std::endl << + " batch=N Set batch size for implicit batch engines (default = " << defaultBatch << ")" << std::endl << + " This option should not be used for explicit batch engines" << std::endl << + " graph=1 Use cuda graph for this task" << std::endl << + " persistentCacheRatio=[0-1] Set the persistentCacheLimit ratio for this task (default = 0)" << std::endl; + // clang-format on +} + +void helpHelp(std::ostream& os) +{ + // clang-format off + os << "=== Help ===" << 
std::endl << + " --help, -h Print this message" << std::endl; + // clang-format on +} + +void AllOptions::help(std::ostream& os) +{ + ModelOptions::help(os); + os << std::endl; + BuildOptions::help(os); + os << std::endl; + InferenceOptions::help(os); + os << std::endl; + // clang-format off + os << "=== Build and Inference Batch Options ===" << std::endl << + " When using implicit batch, the max batch size of the engine, if not given, " << std::endl << + " is set to the inference batch size;" << std::endl << + " when using explicit batch, if shapes are specified only for inference, they " << std::endl << + " will be used also as min/opt/max in the build profile; if shapes are " << std::endl << + " specified only for the build, the opt shapes will be used also for inference;" << std::endl << + " if both are specified, they must be compatible; and if explicit batch is " << std::endl << + " enabled but neither is specified, the model must provide complete static" << std::endl << + " dimensions, including batch size, for all inputs" << std::endl << + " Using ONNX models automatically forces explicit batch." << std::endl << + std::endl; + // clang-format on + ReportingOptions::help(os); + os << std::endl; + SystemOptions::help(os); + os << std::endl; + helpHelp(os); +} + +void SafeBuilderOptions::printHelp(std::ostream& os) +{ + // clang-format off + os << "=== Mandatory ===" << std::endl << + " --onnx= ONNX model" << std::endl << + " " << std::endl << + "=== Optional ===" << std::endl << + " --inputIOFormats=spec Type and format of each of the input tensors (default = all inputs in fp32:chw)" << std::endl << + " See --outputIOFormats help for the grammar of type and format list." 
<< std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " inputs following the same order as network inputs ID (even if only one input" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + " --outputIOFormats=spec Type and format of each of the output tensors (default = all outputs in fp32:chw)" << std::endl << + " Note: If this option is specified, please set comma-separated types and formats for all" << std::endl << + " outputs following the same order as network outputs ID (even if only one output" << std::endl << + " needs specifying IO format) or set the type and format once for broadcasting." << std::endl << + R"( IO Formats: spec ::= IOfmt[","spec])" << std::endl << + " IOfmt ::= type:fmt" << std::endl << + R"( type ::= "fp32"|"fp16"|"int32"|"int8")" << std::endl << + R"( fmt ::= ("chw"|"chw2"|"chw4"|"hwc8"|"chw16"|"chw32"|"dhwc8"|)" << std::endl << + R"( "cdhw32"|"hwc"|"dla_linear"|"dla_hwc4")["+"fmt])" << std::endl << + " --int8 Enable int8 precision, in addition to fp16 (default = disabled)" << std::endl << + " --consistency Enable consistency check for serialized engine, (default = disabled)" << std::endl << + " --std Build standard serialized engine, (default = disabled)" << std::endl << + " --calib= Read INT8 calibration cache file" << std::endl << + " --serialized= Save the serialized network" << std::endl << + " --staticPlugins Plugin library (.so) to load statically (can be specified multiple times)" << std::endl << + " --verbose or -v Use verbose logging (default = false)" << std::endl << + " --help or -h Print this message" << std::endl << + " --noBuilderCache Disable timing cache in builder (default is to enable timing cache)" << std::endl << + " --timingCacheFile= Save/load the serialized global timing cache" << std::endl << + " --sparsity=spec Control sparsity (default = disabled). 
" << std::endl << + R"( Sparsity: spec ::= "disable", "enable", "force")" << std::endl << + " Note: Description about each of these options is as below" << std::endl << + " disable = do not enable sparse tactics in the builder (this is the default)" << std::endl << + " enable = enable sparse tactics in the builder (but these tactics will only be" << std::endl << + " considered if the weights have the right sparsity pattern)" << std::endl << + " force = enable sparse tactics in the builder and force-overwrite the weights to have" << std::endl << + " a sparsity pattern" << std::endl << + " --minTiming=M Set the minimum number of iterations used in kernel selection (default = " << std::endl << + "" << defaultMinTiming << ")" << std::endl << + " --avgTiming=M Set the number of times averaged in each iteration for kernel selection (default = " << std::endl << + "" << defaultAvgTiming << ")" << std::endl << + "" << std::endl; + // clang-format on +} + +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.h new file mode 100644 index 0000000..efea2e4 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleOptions.h @@ -0,0 +1,456 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TRT_SAMPLE_OPTIONS_H +#define TRT_SAMPLE_OPTIONS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "NvInfer.h" + +namespace sample +{ + +// Build default params +constexpr int32_t maxBatchNotProvided{0}; +constexpr int32_t defaultMinTiming{1}; +constexpr int32_t defaultAvgTiming{8}; +constexpr int32_t defaultMaxAuxStreams{-1}; +constexpr int32_t defaultBuilderOptimizationLevel{3}; + +// System default params +constexpr int32_t defaultDevice{0}; + +// Inference default params +constexpr int32_t defaultBatch{1}; +constexpr int32_t batchNotProvided{0}; +constexpr int32_t defaultStreams{1}; +constexpr int32_t defaultIterations{10}; +constexpr float defaultWarmUp{200.F}; +constexpr float defaultDuration{3.F}; +constexpr float defaultSleep{}; +constexpr float defaultIdle{}; +constexpr float defaultPersistentCacheRatio{0}; + +// Reporting default params +constexpr int32_t defaultAvgRuns{10}; +constexpr std::array defaultPercentiles{90, 95, 99}; + +enum class PrecisionConstraints +{ + kNONE, + kOBEY, + kPREFER +}; + +enum class ModelFormat +{ + kANY, + kCAFFE, + kONNX, + kUFF +}; + +enum class SparsityFlag +{ + kDISABLE, + kENABLE, + kFORCE +}; + +enum class TimingCacheMode +{ + kDISABLE, + kLOCAL, + kGLOBAL +}; + +//! +//! \enum RuntimeMode +//! +//! \brief Used to dictate which TensorRT runtime library to dynamically load. +//! +enum class RuntimeMode +{ + //! Maps to libnvinfer.so or nvinfer.dll + kFULL, + + //! Maps to libnvinfer_dispatch.so or nvinfer_dispatch.dll + kDISPATCH, + + //! 
Maps to libnvinfer_lean.so or nvinfer_lean.dll + kLEAN, +}; + +inline std::ostream& operator<<(std::ostream& os, RuntimeMode const mode) +{ + switch (mode) + { + case RuntimeMode::kFULL: + { + os << "full"; + break; + } + case RuntimeMode::kDISPATCH: + { + os << "dispatch"; + break; + } + case RuntimeMode::kLEAN: + { + os << "lean"; + break; + } + } + + return os; +} + +using Arguments = std::unordered_multimap; + +using IOFormat = std::pair; + +using ShapeRange = std::array, nvinfer1::EnumMax()>; + +using LayerPrecisions = std::unordered_map; +using LayerOutputTypes = std::unordered_map>; +using LayerDeviceTypes = std::unordered_map; + +class Options +{ +public: + virtual ~Options() = default; + virtual void parse(Arguments& arguments) = 0; +}; + +class BaseModelOptions : public Options +{ +public: + ModelFormat format{ModelFormat::kANY}; + std::string model; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class UffInput : public Options +{ +public: + std::vector> inputs; + bool NHWC{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class ModelOptions : public Options +{ +public: + BaseModelOptions baseModel; + std::string prototxt; + std::vector outputs; + UffInput uffInputs; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +constexpr nvinfer1::TempfileControlFlags getTempfileControlDefaults() +{ + using F = nvinfer1::TempfileControlFlag; + return (1U << static_cast(F::kALLOW_TEMPORARY_FILES)) + | (1U << static_cast(F::kALLOW_IN_MEMORY_FILES)); +} + +class BuildOptions : public Options +{ +public: + int32_t maxBatch{maxBatchNotProvided}; + double workspace{-1.0}; + double dlaSRAM{-1.0}; + double dlaLocalDRAM{-1.0}; + double dlaGlobalDRAM{-1.0}; + int32_t minTiming{defaultMinTiming}; + int32_t avgTiming{defaultAvgTiming}; + bool tf32{true}; + bool fp16{false}; + bool int8{false}; + bool fp8{false}; + bool directIO{false}; 
+ PrecisionConstraints precisionConstraints{PrecisionConstraints::kNONE}; + LayerPrecisions layerPrecisions; + LayerOutputTypes layerOutputTypes; + LayerDeviceTypes layerDeviceTypes; + bool safe{false}; + bool consistency{false}; + bool restricted{false}; + bool skipInference{false}; + bool save{false}; + bool load{false}; + bool refittable{false}; + bool heuristic{false}; + bool versionCompatible{false}; + bool excludeLeanRuntime{false}; + int32_t builderOptimizationLevel{defaultBuilderOptimizationLevel}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + nvinfer1::ProfilingVerbosity profilingVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + std::string engine; + std::string calibration; + using ShapeProfile = std::unordered_map; + ShapeProfile shapes; + ShapeProfile shapesCalib; + std::vector inputFormats; + std::vector outputFormats; + nvinfer1::TacticSources enabledTactics{0}; + nvinfer1::TacticSources disabledTactics{0}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + // C++11 does not automatically generate hash function for enum class. + // Use int32_t to support C++11 compilers. 
+ std::unordered_map previewFeatures; + nvinfer1::HardwareCompatibilityLevel hardwareCompatibilityLevel{nvinfer1::HardwareCompatibilityLevel::kNONE}; + std::string tempdir{}; + nvinfer1::TempfileControlFlags tempfileControls{getTempfileControlDefaults()}; + RuntimeMode useRuntime{RuntimeMode::kFULL}; + std::string leanDLLPath{}; + int32_t maxAuxStreams{defaultMaxAuxStreams}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class SystemOptions : public Options +{ +public: + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + bool fallback{false}; + bool ignoreParsedPluginLibs{false}; + std::vector plugins; + std::vector setPluginsToSerialize; + std::vector dynamicPlugins; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class InferenceOptions : public Options +{ +public: + int32_t batch{batchNotProvided}; + int32_t iterations{defaultIterations}; + int32_t infStreams{defaultStreams}; + float warmup{defaultWarmUp}; + float duration{defaultDuration}; + float sleep{defaultSleep}; + float idle{defaultIdle}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + bool overlap{true}; + bool skipTransfers{false}; + bool useManaged{false}; + bool spin{false}; + bool threads{false}; + bool graph{false}; + bool rerun{false}; + bool timeDeserialize{false}; + bool timeRefit{false}; + std::unordered_map inputs; + using ShapeProfile = std::unordered_map>; + ShapeProfile shapes; + nvinfer1::ProfilingVerbosity nvtxVerbosity{nvinfer1::ProfilingVerbosity::kLAYER_NAMES_ONLY}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class ReportingOptions : public Options +{ +public: + bool verbose{false}; + int32_t avgs{defaultAvgRuns}; + std::vector percentiles{defaultPercentiles.begin(), defaultPercentiles.end()}; + bool refit{false}; + bool output{false}; + bool dumpRawBindings{false}; + bool profile{false}; + bool layerInfo{false}; + std::string 
exportTimes; + std::string exportOutput; + std::string exportProfile; + std::string exportLayerInfo; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class SafeBuilderOptions : public Options +{ +public: + std::string serialized{}; + std::string onnxModelFile{}; + bool help{false}; + bool verbose{false}; + std::vector inputFormats; + std::vector outputFormats; + bool int8{false}; + bool fp8{false}; + std::string calibFile{}; + std::vector plugins; + bool consistency{false}; + bool standard{false}; + TimingCacheMode timingCacheMode{TimingCacheMode::kLOCAL}; + std::string timingCacheFile{}; + SparsityFlag sparsity{SparsityFlag::kDISABLE}; + int32_t minTiming{defaultMinTiming}; + int32_t avgTiming{defaultAvgTiming}; + + void parse(Arguments& arguments) override; + + static void printHelp(std::ostream& out); +}; + +class AllOptions : public Options +{ +public: + ModelOptions model; + BuildOptions build; + SystemOptions system; + InferenceOptions inference; + ReportingOptions reporting; + bool helps{false}; + + void parse(Arguments& arguments) override; + + static void help(std::ostream& out); +}; + +class TaskInferenceOptions : public Options +{ +public: + std::string engine; + int32_t device{defaultDevice}; + int32_t DLACore{-1}; + int32_t batch{batchNotProvided}; + bool graph{false}; + float persistentCacheRatio{defaultPersistentCacheRatio}; + void parse(Arguments& arguments) override; + static void help(std::ostream& out); +}; + +Arguments argsToArgumentsMap(int32_t argc, char* argv[]); + +bool parseHelp(Arguments& arguments); + +void helpHelp(std::ostream& out); + +// Functions to print options + +std::ostream& operator<<(std::ostream& os, const BaseModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const UffInput& input); + +std::ostream& operator<<(std::ostream& os, const IOFormat& format); + +std::ostream& operator<<(std::ostream& os, const ShapeRange& dims); + +std::ostream& 
operator<<(std::ostream& os, const ModelOptions& options); + +std::ostream& operator<<(std::ostream& os, const BuildOptions& options); + +std::ostream& operator<<(std::ostream& os, const SystemOptions& options); + +std::ostream& operator<<(std::ostream& os, const InferenceOptions& options); + +std::ostream& operator<<(std::ostream& os, const ReportingOptions& options); + +std::ostream& operator<<(std::ostream& os, const AllOptions& options); + +std::ostream& operator<<(std::ostream& os, const SafeBuilderOptions& options); + +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::Dims& dims) +{ + for (int32_t i = 0; i < dims.nbDims; ++i) + { + os << (i ? "x" : "") << dims.d[i]; + } + return os; +} +inline std::ostream& operator<<(std::ostream& os, const nvinfer1::WeightsRole role) +{ + switch (role) + { + case nvinfer1::WeightsRole::kKERNEL: + { + os << "Kernel"; + break; + } + case nvinfer1::WeightsRole::kBIAS: + { + os << "Bias"; + break; + } + case nvinfer1::WeightsRole::kSHIFT: + { + os << "Shift"; + break; + } + case nvinfer1::WeightsRole::kSCALE: + { + os << "Scale"; + break; + } + case nvinfer1::WeightsRole::kCONSTANT: + { + os << "Constant"; + break; + } + case nvinfer1::WeightsRole::kANY: + { + os << "Any"; + break; + } + } + + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) +{ + for (int32_t i = 0, e = static_cast(vec.size()); i < e; ++i) + { + os << (i ? "x" : "") << vec[i]; + } + return os; +} + +} // namespace sample + +#endif // TRT_SAMPLES_OPTIONS_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.cpp new file mode 100644 index 0000000..10eb985 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.cpp @@ -0,0 +1,579 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleReporting.h" + +using namespace nvinfer1; + +namespace sample +{ + +namespace +{ + +//! +//! \brief Find percentile in an ascending sequence of timings +//! \note percentile must be in [0, 100]. Otherwise, an exception is thrown. +//! +template +float findPercentile(float percentile, std::vector const& timings, T const& toFloat) +{ + int32_t const all = static_cast(timings.size()); + int32_t const exclude = static_cast((1 - percentile / 100) * all); + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + if (percentile < 0.F || percentile > 100.F) + { + throw std::runtime_error("percentile is not in [0, 100]!"); + } + return toFloat(timings[std::max(all - 1 - exclude, 0)]); +} + +//! +//! \brief Find median in a sorted sequence of timings +//! +template +float findMedian(std::vector const& timings, T const& toFloat) +{ + if (timings.empty()) + { + return std::numeric_limits::infinity(); + } + + int32_t const m = timings.size() / 2; + if (timings.size() % 2) + { + return toFloat(timings[m]); + } + + return (toFloat(timings[m - 1]) + toFloat(timings[m])) / 2; +} + +//! +//! \brief Find coefficient of variance (which is std / mean) in a sorted sequence of timings given the mean +//! 
+template +float findCoeffOfVariance(std::vector const& timings, T const& toFloat, float mean) +{ + if (timings.empty()) + { + return 0; + } + + if (mean == 0.F) + { + return std::numeric_limits::infinity(); + } + + auto const metricAccumulator = [toFloat, mean](float acc, InferenceTime const& a) { + float const diff = toFloat(a) - mean; + return acc + diff * diff; + }; + float const variance = std::accumulate(timings.begin(), timings.end(), 0.F, metricAccumulator) / timings.size(); + + return std::sqrt(variance) / mean * 100.F; +} + +inline InferenceTime traceToTiming(const InferenceTrace& a) +{ + return InferenceTime((a.enqEnd - a.enqStart), (a.h2dEnd - a.h2dStart), (a.computeEnd - a.computeStart), + (a.d2hEnd - a.d2hStart)); +} + +} // namespace + +void printProlog(int32_t warmups, int32_t timings, float warmupMs, float benchTimeMs, std::ostream& os) +{ + os << "Warmup completed " << warmups << " queries over " << warmupMs << " ms" << std::endl; + os << "Timing trace has " << timings << " queries over " << benchTimeMs / 1000 << " s" << std::endl; +} + +void printTiming(std::vector const& timings, int32_t runsPerAvg, std::ostream& os) +{ + int32_t count = 0; + InferenceTime sum; + + os << std::endl; + os << "=== Trace details ===" << std::endl; + os << "Trace averages of " << runsPerAvg << " runs:" << std::endl; + for (auto const& t : timings) + { + sum += t; + + if (++count == runsPerAvg) + { + // clang-format off + os << "Average on " << runsPerAvg << " runs - GPU latency: " << sum.compute / runsPerAvg + << " ms - Host latency: " << sum.latency() / runsPerAvg << " ms (enqueue " << sum.enq / runsPerAvg + << " ms)" << std::endl; + // clang-format on + count = 0; + sum.enq = 0; + sum.h2d = 0; + sum.compute = 0; + sum.d2h = 0; + } + } +} + +void printMetricExplanations(std::ostream& os) +{ + os << std::endl; + os << "=== Explanations of the performance metrics ===" << std::endl; + os << "Total Host Walltime: the host walltime from when the first query (after 
warmups) is enqueued to when the " + "last query is completed." + << std::endl; + os << "GPU Compute Time: the GPU latency to execute the kernels for a query." << std::endl; + os << "Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly " + "shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data " + "transfers." + << std::endl; + os << "Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. " + "If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized " + "because of host-side overheads or data transfers." + << std::endl; + os << "Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be " + "under-utilized." + << std::endl; + os << "H2D Latency: the latency for host-to-device data transfers for input tensors of a single query." + << std::endl; + os << "D2H Latency: the latency for device-to-host data transfers for output tensors of a single query." + << std::endl; + os << "Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a " + "single query." 
+ << std::endl; +} + +PerformanceResult getPerformanceResult(std::vector const& timings, + std::function metricGetter, std::vector const& percentiles) +{ + auto const metricComparator + = [metricGetter](InferenceTime const& a, InferenceTime const& b) { return metricGetter(a) < metricGetter(b); }; + auto const metricAccumulator = [metricGetter](float acc, InferenceTime const& a) { return acc + metricGetter(a); }; + std::vector newTimings = timings; + std::sort(newTimings.begin(), newTimings.end(), metricComparator); + PerformanceResult result; + result.min = metricGetter(newTimings.front()); + result.max = metricGetter(newTimings.back()); + result.mean = std::accumulate(newTimings.begin(), newTimings.end(), 0.0f, metricAccumulator) / newTimings.size(); + result.median = findMedian(newTimings, metricGetter); + for (auto percentile : percentiles) + { + result.percentiles.emplace_back(findPercentile(percentile, newTimings, metricGetter)); + } + result.coeffVar = findCoeffOfVariance(newTimings, metricGetter, result.mean); + return result; +} + +void printEpilog(std::vector const& timings, float walltimeMs, std::vector const& percentiles, + int32_t batchSize, int32_t infStreams, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +{ + float const throughput = batchSize * timings.size() / walltimeMs * 1000; + + auto const getLatency = [](InferenceTime const& t) { return t.latency(); }; + auto const latencyResult = getPerformanceResult(timings, getLatency, percentiles); + + auto const getEnqueue = [](InferenceTime const& t) { return t.enq; }; + auto const enqueueResult = getPerformanceResult(timings, getEnqueue, percentiles); + + auto const getH2d = [](InferenceTime const& t) { return t.h2d; }; + auto const h2dResult = getPerformanceResult(timings, getH2d, percentiles); + + auto const getCompute = [](InferenceTime const& t) { return t.compute; }; + auto const gpuComputeResult = getPerformanceResult(timings, getCompute, percentiles); + + auto const 
getD2h = [](InferenceTime const& t) { return t.d2h; }; + auto const d2hResult = getPerformanceResult(timings, getD2h, percentiles); + + auto const toPerfString = [&](const PerformanceResult& r) { + std::stringstream s; + s << "min = " << r.min << " ms, max = " << r.max << " ms, mean = " << r.mean << " ms, " + << "median = " << r.median << " ms"; + for (int32_t i = 0, n = percentiles.size(); i < n; ++i) + { + s << ", percentile(" << percentiles[i] << "%) = " << r.percentiles[i] << " ms"; + } + return s.str(); + }; + + osInfo << std::endl; + osInfo << "=== Performance summary ===" << std::endl; + osInfo << "Throughput: " << throughput << " qps" << std::endl; + osInfo << "Latency: " << toPerfString(latencyResult) << std::endl; + osInfo << "Enqueue Time: " << toPerfString(enqueueResult) << std::endl; + osInfo << "H2D Latency: " << toPerfString(h2dResult) << std::endl; + osInfo << "GPU Compute Time: " << toPerfString(gpuComputeResult) << std::endl; + osInfo << "D2H Latency: " << toPerfString(d2hResult) << std::endl; + osInfo << "Total Host Walltime: " << walltimeMs / 1000 << " s" << std::endl; + osInfo << "Total GPU Compute Time: " << gpuComputeResult.mean * timings.size() / 1000 << " s" << std::endl; + + // Report warnings if the throughput is bound by other factors than GPU Compute Time. + constexpr float kENQUEUE_BOUND_REPORTING_THRESHOLD{0.8F}; + if (enqueueResult.median > kENQUEUE_BOUND_REPORTING_THRESHOLD * gpuComputeResult.median) + { + osWarning + << "* Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized." + << std::endl; + osWarning << " If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the " + "throughput." + << std::endl; + } + if (h2dResult.median >= gpuComputeResult.median) + { + osWarning << "* Throughput may be bound by host-to-device transfers for the inputs rather than GPU Compute and " + "the GPU may be under-utilized." 
+ << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." << std::endl; + } + if (d2hResult.median >= gpuComputeResult.median) + { + osWarning << "* Throughput may be bound by device-to-host transfers for the outputs rather than GPU Compute " + "and the GPU may be under-utilized." + << std::endl; + osWarning << " Add --noDataTransfers flag to disable data transfers." << std::endl; + } + + // Report warnings if the GPU Compute Time is unstable. + constexpr float kUNSTABLE_PERF_REPORTING_THRESHOLD{1.0F}; + if (gpuComputeResult.coeffVar > kUNSTABLE_PERF_REPORTING_THRESHOLD) + { + osWarning << "* GPU compute time is unstable, with coefficient of variance = " << gpuComputeResult.coeffVar + << "%." << std::endl; + osWarning << " If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the " + << "stability." << std::endl; + } + + // Report warnings if multiple inference streams are used. + if (infStreams > 1) + { + osWarning << "* Multiple inference streams are used. Latencies may not be accurate since inferences may run in " + << " parallel. Please use \"Throughput\" as the performance metric instead." << std::endl; + } + + // Explain what the metrics mean. + osInfo << "Explanations of the performance metrics are printed in the verbose logs." 
<< std::endl; + printMetricExplanations(osVerbose); + + osInfo << std::endl; +} + +void printPerformanceReport(std::vector const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose) +{ + int32_t batchSize = infOpts.batch; + float const warmupMs = infOpts.warmup; + auto const isNotWarmup = [&warmupMs](const InferenceTrace& a) { return a.computeStart >= warmupMs; }; + auto const noWarmup = std::find_if(trace.begin(), trace.end(), isNotWarmup); + int32_t const warmups = noWarmup - trace.begin(); + float const benchTime = trace.back().d2hEnd - noWarmup->h2dStart; + // when implicit batch used, batchSize = options.inference.batch, which is parsed through --batch + // when explicit batch used, batchSize = options.inference.batch = 0 + // treat inference with explicit batch as a single query and report the throughput + batchSize = batchSize ? batchSize : 1; + printProlog(warmups * batchSize, (trace.size() - warmups) * batchSize, warmupMs, benchTime, osInfo); + + std::vector timings(trace.size() - warmups); + std::transform(noWarmup, trace.end(), timings.begin(), traceToTiming); + printTiming(timings, reportingOpts.avgs, osInfo); + printEpilog( + timings, benchTime, reportingOpts.percentiles, batchSize, infOpts.infStreams, osInfo, osWarning, osVerbose); + + if (!reportingOpts.exportTimes.empty()) + { + exportJSONTrace(trace, reportingOpts.exportTimes, warmups); + } +} + +//! Printed format: +//! [ value, ...] +//! value ::= { "start enq : time, "end enq" : time, "start h2d" : time, "end h2d" : time, "start compute" : time, +//! "end compute" : time, "start d2h" : time, "end d2h" : time, "h2d" : time, "compute" : time, +//! "d2h" : time, "latency" : time } +//! 
+void exportJSONTrace(std::vector const& trace, std::string const& fileName, int32_t const nbWarmups) +{ + std::ofstream os(fileName, std::ofstream::trunc); + os << "[" << std::endl; + char const* sep = " "; + for (auto iter = trace.begin() + nbWarmups; iter < trace.end(); ++iter) + { + auto const& t = *iter; + InferenceTime const it(traceToTiming(t)); + os << sep << "{ "; + sep = ", "; + // clang-format off + os << "\"startEnqMs\" : " << t.enqStart << sep << "\"endEnqMs\" : " << t.enqEnd << sep + << "\"startH2dMs\" : " << t.h2dStart << sep << "\"endH2dMs\" : " << t.h2dEnd << sep + << "\"startComputeMs\" : " << t.computeStart << sep << "\"endComputeMs\" : " << t.computeEnd << sep + << "\"startD2hMs\" : " << t.d2hStart << sep << "\"endD2hMs\" : " << t.d2hEnd << sep + << "\"h2dMs\" : " << it.h2d << sep << "\"computeMs\" : " << it.compute << sep + << "\"d2hMs\" : " << it.d2h << sep << "\"latencyMs\" : " << it.latency() << " }" + << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +void Profiler::reportLayerTime(char const* layerName, float timeMs) noexcept +{ + if (mIterator == mLayers.end()) + { + bool const first = !mLayers.empty() && mLayers.begin()->name == layerName; + mUpdatesCount += mLayers.empty() || first; + if (first) + { + mIterator = mLayers.begin(); + } + else + { + mLayers.emplace_back(); + mLayers.back().name = layerName; + mIterator = mLayers.end() - 1; + } + } + + mIterator->timeMs.push_back(timeMs); + ++mIterator; +} + +void Profiler::print(std::ostream& os) const noexcept +{ + std::string const nameHdr("Layer"); + std::string const timeHdr(" Time (ms)"); + std::string const avgHdr(" Avg. 
Time (ms)"); + std::string const medHdr(" Median Time (ms)"); + std::string const percentageHdr(" Time %"); + + float const totalTimeMs = getTotalTime(); + + auto const cmpLayer = [](LayerProfile const& a, LayerProfile const& b) { return a.name.size() < b.name.size(); }; + auto const longestName = std::max_element(mLayers.begin(), mLayers.end(), cmpLayer); + auto const nameLength = std::max(longestName->name.size() + 1, nameHdr.size()); + auto const timeLength = timeHdr.size(); + auto const avgLength = avgHdr.size(); + auto const medLength = medHdr.size(); + auto const percentageLength = percentageHdr.size(); + + os << std::endl + << "=== Profile (" << mUpdatesCount << " iterations ) ===" << std::endl + << std::setw(nameLength) << nameHdr << timeHdr << avgHdr << medHdr << percentageHdr << std::endl; + + for (auto const& p : mLayers) + { + if (p.timeMs.empty() || getTotalTime(p) == 0.F) + { + // there is no point to print profiling for layer that didn't run at all + continue; + } + // clang-format off + os << std::setw(nameLength) << p.name << std::setw(timeLength) << std::fixed << std::setprecision(2) << getTotalTime(p) + << std::setw(avgLength) << std::fixed << std::setprecision(4) << getAvgTime(p) + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime(p) + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << getTotalTime(p) / totalTimeMs * 100 + << std::endl; + } + { + os << std::setw(nameLength) << "Total" << std::setw(timeLength) << std::fixed << std::setprecision(2) + << totalTimeMs << std::setw(avgLength) << std::fixed << std::setprecision(4) << totalTimeMs / mUpdatesCount + << std::setw(medLength) << std::fixed << std::setprecision(4) << getMedianTime() + << std::setw(percentageLength) << std::fixed << std::setprecision(1) << 100.0 << std::endl; + // clang-format on + } + os << std::endl; +} + +void Profiler::exportJSONProfile(std::string const& fileName) const noexcept +{ + std::ofstream os(fileName, 
std::ofstream::trunc); + os << "[" << std::endl << " { \"count\" : " << mUpdatesCount << " }" << std::endl; + + auto const totalTimeMs = getTotalTime(); + + for (auto const& l : mLayers) + { + // clang-format off + os << ", {" << R"( "name" : ")" << l.name << R"(")" + R"(, "timeMs" : )" << getTotalTime(l) + << R"(, "averageMs" : )" << getAvgTime(l) + << R"(, "medianMs" : )" << getMedianTime(l) + << R"(, "percentage" : )" << getTotalTime(l) / totalTimeMs * 100 + << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os) +{ + os << "Input Tensors:" << std::endl; + bindings.dumpInputs(context, os); +} + +template +void dumpOutputs(ContextType const& context, Bindings const& bindings, std::ostream& os) +{ + os << "Output Tensors:" << std::endl; + bindings.dumpOutputs(context, os); +} + +template +void dumpOutputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); +template +void dumpOutputs(nvinfer1::safe::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +template +void dumpRawBindingsToFiles(ContextType const& context, Bindings const& bindings, std::ostream& os) +{ + bindings.dumpRawBindingToFiles(context, os); +} + +template +void dumpRawBindingsToFiles(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +template +void dumpRawBindingsToFiles(nvinfer1::safe::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +template +void exportJSONOutput( + ContextType const& context, Bindings const& bindings, std::string const& fileName, int32_t batch) +{ + std::ofstream os(fileName, std::ofstream::trunc); + std::string sep = " "; + auto const output = bindings.getOutputBindings(); + os << "[" << std::endl; + for (auto const& binding : output) + { + // clang-format off + os << sep << R"({ "name" : ")" << binding.first << 
"\"" << std::endl; + sep = ", "; + os << " " << sep << R"("dimensions" : ")"; + bindings.dumpBindingDimensions(binding.second, context, os); + os << "\"" << std::endl; + os << " " << sep << "\"values\" : [ "; + bindings.dumpBindingValues(context, binding.second, os, sep, batch); + os << " ]" << std::endl << " }" << std::endl; + // clang-format on + } + os << "]" << std::endl; +} + +template +void exportJSONOutput(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + +template void exportJSONOutput(nvinfer1::safe::IExecutionContext const& context, Bindings const& bindings, + std::string const& fileName, int32_t batch); + +bool printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context) +{ + if (reporting.layerInfo) + { + sample::gLogInfo << "Layer Information:" << std::endl; + sample::gLogInfo << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kONELINE) + << std::flush; + } + if (!reporting.exportLayerInfo.empty()) + { + std::ofstream os(reporting.exportLayerInfo, std::ofstream::trunc); + os << getLayerInformation(engine, context, nvinfer1::LayerInformationFormat::kJSON) << std::flush; + } + return true; +} + +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv) +{ + if (reporting.profile) + { + iEnv.profiler->print(sample::gLogInfo); + } + if (!reporting.exportProfile.empty()) + { + iEnv.profiler->exportJSONProfile(reporting.exportProfile); + } + + // Print an warning about total per-layer latency when auxiliary streams are used. 
+ if (!iEnv.safe && (reporting.profile || !reporting.exportProfile.empty())) + { + int32_t const nbAuxStreams = iEnv.engine.get()->getNbAuxStreams(); + if (nbAuxStreams > 0) + { + sample::gLogWarning << "The engine uses " << nbAuxStreams << " auxiliary streams, so the \"Total\" latency " + << "may not be accurate because some layers may have run in parallel!" << std::endl; + } + } +} + +namespace details +{ +template +void dump(std::unique_ptr const& context, std::unique_ptr const& binding, + ReportingOptions const& reporting, int32_t batch) +{ + if (!context) + { + sample::gLogError << "Empty context! Skip printing outputs." << std::endl; + return; + } + if (reporting.output) + { + dumpOutputs(*context, *binding, sample::gLogInfo); + } + if (reporting.dumpRawBindings) + { + dumpRawBindingsToFiles(*context, *binding, sample::gLogInfo); + } + if (!reporting.exportOutput.empty()) + { + exportJSONOutput(*context, *binding, reporting.exportOutput, batch); + } +} +} // namespace details + +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch) +{ + auto const& binding = iEnv.bindings.at(0); + if (!binding) + { + sample::gLogError << "Empty bindings! Skip printing outputs." << std::endl; + return; + } + + if (iEnv.safe) + { + auto const& context = iEnv.safeContexts.at(0); + details::dump(context, binding, reporting, batch); + } + else + { + auto const& context = iEnv.contexts.at(0); + details::dump(context, binding, reporting, batch); + } +} + +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.h new file mode 100644 index 0000000..fa0d706 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleReporting.h @@ -0,0 +1,302 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_REPORTING_H +#define TRT_SAMPLE_REPORTING_H + +#include +#include +#include + +#include "NvInfer.h" + +#include "sampleDevice.h" +#include "sampleInference.h" +#include "sampleOptions.h" +#include "sampleUtils.h" + +namespace sample +{ + +class Bindings; + +//! +//! \struct InferenceTime +//! \brief Measurement times in milliseconds +//! +struct InferenceTime +{ + InferenceTime(float q, float i, float c, float o) + : enq(q) + , h2d(i) + , compute(c) + , d2h(o) + { + } + + InferenceTime() = default; + InferenceTime(InferenceTime const&) = default; + InferenceTime(InferenceTime&&) = default; + InferenceTime& operator=(InferenceTime const&) = default; + InferenceTime& operator=(InferenceTime&&) = default; + ~InferenceTime() = default; + + float enq{0}; // Enqueue + float h2d{0}; // Host to Device + float compute{0}; // Compute + float d2h{0}; // Device to Host + + // ideal latency + float latency() const + { + return h2d + compute + d2h; + } +}; + +//! +//! \struct InferenceTrace +//! \brief Measurement points in milliseconds +//! 
+struct InferenceTrace +{ + InferenceTrace(int32_t s, float es, float ee, float is, float ie, float cs, float ce, float os, float oe) + : stream(s) + , enqStart(es) + , enqEnd(ee) + , h2dStart(is) + , h2dEnd(ie) + , computeStart(cs) + , computeEnd(ce) + , d2hStart(os) + , d2hEnd(oe) + { + } + + InferenceTrace() = default; + InferenceTrace(InferenceTrace const&) = default; + InferenceTrace(InferenceTrace&&) = default; + InferenceTrace& operator=(InferenceTrace const&) = default; + InferenceTrace& operator=(InferenceTrace&&) = default; + ~InferenceTrace() = default; + + int32_t stream{0}; + float enqStart{0}; + float enqEnd{0}; + float h2dStart{0}; + float h2dEnd{0}; + float computeStart{0}; + float computeEnd{0}; + float d2hStart{0}; + float d2hEnd{0}; +}; + +inline InferenceTime operator+(InferenceTime const& a, InferenceTime const& b) +{ + return InferenceTime(a.enq + b.enq, a.h2d + b.h2d, a.compute + b.compute, a.d2h + b.d2h); +} + +inline InferenceTime operator+=(InferenceTime& a, InferenceTime const& b) +{ + return a = a + b; +} + +//! +//! \struct PerformanceResult +//! \brief Performance result of a performance metric +//! +struct PerformanceResult +{ + float min{0.F}; + float max{0.F}; + float mean{0.F}; + float median{0.F}; + std::vector percentiles; + float coeffVar{0.F}; // coefficient of variation +}; + +//! +//! \brief Print benchmarking time and number of traces collected +//! +void printProlog(int32_t warmups, int32_t timings, float warmupMs, float walltime, std::ostream& os); + +//! +//! \brief Print a timing trace +//! +void printTiming(std::vector const& timings, int32_t runsPerAvg, std::ostream& os); + +//! +//! \brief Print the performance summary of a trace +//! +void printEpilog(std::vector const& timings, std::vector const& percentiles, int32_t batchSize, + std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); + +//! +//! \brief Get the result of a specific performance metric from a trace +//! 
+PerformanceResult getPerformanceResult(std::vector const& timings, + std::function metricGetter, std::vector const& percentiles); + +//! +//! \brief Print the explanations of the performance metrics printed in printEpilog() function. +//! +void printMetricExplanations(std::ostream& os); + +//! +//! \brief Print and summarize a timing trace +//! +void printPerformanceReport(std::vector const& trace, ReportingOptions const& reportingOpts, + InferenceOptions const& infOpts, std::ostream& osInfo, std::ostream& osWarning, std::ostream& osVerbose); + +//! +//! \brief Export a timing trace to JSON file +//! +void exportJSONTrace( + std::vector const& InferenceTime, std::string const& fileName, int32_t const nbWarmups); + +//! +//! \brief Print input tensors to stream +//! +void dumpInputs(nvinfer1::IExecutionContext const& context, Bindings const& bindings, std::ostream& os); + +//! +//! \brief Print output tensors to stream +//! +template +void dumpOutputs(ContextType const& context, Bindings const& bindings, std::ostream& os); + +template +void dumpRawBindingsToFiles(ContextType const& context, Bindings const& bindings, std::ostream& os); + +//! +//! \brief Export output tensors to JSON file +//! +template +void exportJSONOutput( + ContextType const& context, Bindings const& bindings, std::string const& fileName, int32_t batch); + + +//! +//! \struct LayerProfile +//! \brief Layer profile information +//! +struct LayerProfile +{ + std::string name; + std::vector timeMs; +}; + +//! +//! \class Profiler +//! \brief Collect per-layer profile information, assuming times are reported in the same order +//! +class Profiler : public nvinfer1::IProfiler +{ + +public: + void reportLayerTime(char const* layerName, float timeMs) noexcept override; + + void print(std::ostream& os) const noexcept; + + //! + //! \brief Export a profile to JSON file + //! 
+ void exportJSONProfile(std::string const& fileName) const noexcept; + +private: + float getTotalTime() const noexcept + { + auto const plusLayerTime = [](float accumulator, LayerProfile const& lp) { + return accumulator + std::accumulate(lp.timeMs.begin(), lp.timeMs.end(), 0.F, std::plus()); + }; + return std::accumulate(mLayers.begin(), mLayers.end(), 0.0F, plusLayerTime); + } + + float getMedianTime() const noexcept + { + if (mLayers.empty()) + { + return 0.F; + } + std::vector totalTime; + for (size_t run = 0; run < mLayers[0].timeMs.size(); ++run) + { + auto const layerTime + = [&run](float accumulator, LayerProfile const& lp) { return accumulator + lp.timeMs[run]; }; + auto t = std::accumulate(mLayers.begin(), mLayers.end(), 0.F, layerTime); + totalTime.push_back(t); + } + return median(totalTime); + } + + float getMedianTime(LayerProfile const& p) const noexcept + { + return median(p.timeMs); + } + + static float median(std::vector vals) + { + if (vals.empty()) + { + return 0.F; + } + std::sort(vals.begin(), vals.end()); + if (vals.size() % 2U == 1U) + { + return vals[vals.size() / 2U]; + } + return (vals[vals.size() / 2U - 1U] + vals[vals.size() / 2U]) * 0.5F; + } + + //! return the total runtime of given layer profile + float getTotalTime(LayerProfile const& p) const noexcept + { + auto const& vals = p.timeMs; + return std::accumulate(vals.begin(), vals.end(), 0.F, std::plus()); + } + + float getAvgTime(LayerProfile const& p) const noexcept + { + return getTotalTime(p) / p.timeMs.size(); + } + + std::vector mLayers; + std::vector::iterator mIterator{mLayers.begin()}; + int32_t mUpdatesCount{0}; +}; + +//! +//! \brief Print layer info to logger or export it to output JSON file. +//! +bool printLayerInfo( + ReportingOptions const& reporting, nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context); + +//! Forward declaration. +struct InferenceEnvironment; + +//! +//! 
\brief Print per-layer perf profile data to logger or export it to output JSON file. +//! +void printPerformanceProfile(ReportingOptions const& reporting, InferenceEnvironment& iEnv); + +//! +//! \brief Print binding output values to logger or export them to output JSON file. +//! +void printOutput(ReportingOptions const& reporting, InferenceEnvironment const& iEnv, int32_t batch); + +} // namespace sample + +#endif // TRT_SAMPLE_REPORTING_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.cpp b/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.cpp new file mode 100644 index 0000000..8e55991 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.cpp @@ -0,0 +1,528 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sampleUtils.h" +#include "half.h" + +using namespace nvinfer1; + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType) +{ + switch (dataType) + { + case nvinfer1::DataType::kINT32: + case nvinfer1::DataType::kFLOAT: return 4U; + case nvinfer1::DataType::kHALF: return 2U; + case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kUINT8: + case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kFP8: return 1U; + } + return 0; +} + +int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch) +{ + int32_t maxNbElems = 1; + for (int32_t i = 0; i < dims.nbDims; ++i) + { + // Get effective length of axis. + int32_t d = dims.d[i]; + // Any dimension is 0, it is an empty tensor. + if (d == 0) + { + return 0; + } + if (i == vecDim) + { + d = samplesCommon::divUp(d, comps); + } + maxNbElems = std::max(maxNbElems, d * strides.d[i]); + } + return static_cast(maxNbElems) * batch * (vecDim < 0 ? 1 : comps); +} + +nvinfer1::Dims toDims(std::vector const& vec) +{ + int32_t limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) + { + sample::gLogWarning << "Vector too long, only first 8 elements are used in dimension." << std::endl; + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +void loadFromFile(std::string const& fileName, char* dst, size_t size) +{ + ASSERT(dst); + + std::ifstream file(fileName, std::ios::in | std::ios::binary); + if (file.is_open()) + { + file.read(dst, size); + size_t const nbBytesRead = file.gcount(); + file.close(); + if (nbBytesRead != size) + { + std::ostringstream msg; + msg << "Unexpected file size for input file: " << fileName << ". 
Note: Expected: " << size + << " bytes but only read: " << nbBytesRead << " bytes"; + throw std::invalid_argument(msg.str()); + } + } + else + { + std::ostringstream msg; + msg << "Cannot open file " << fileName << "!"; + throw std::invalid_argument(msg.str()); + } +} + +std::vector splitToStringVec(std::string const& s, char separator) +{ + std::vector splitted; + + for (size_t start = 0; start < s.length();) + { + size_t separatorIndex = s.find(separator, start); + if (separatorIndex == std::string::npos) + { + separatorIndex = s.length(); + } + splitted.emplace_back(s.substr(start, separatorIndex - start)); + start = separatorIndex + 1; + } + + return splitted; +} + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput /*= true*/) +{ + bool broadcast = formats.size() == 1; + bool validFormatsCount = broadcast || (formats.size() == nbBindings); + if (!formats.empty() && !validFormatsCount) + { + if (isInput) + { + throw std::invalid_argument( + "The number of inputIOFormats must match network's inputs or be one for broadcasting."); + } + + throw std::invalid_argument( + "The number of outputIOFormats must match network's outputs or be one for broadcasting."); + } + return broadcast; +} + +void sparsifyMatMulKernelWeights(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + using TensorToLayer = std::unordered_map; + using LayerToTensor = std::unordered_map; + + // 1. Collect layers and tensors information from the network. + TensorToLayer matmulI2L; + TensorToLayer constO2L; + TensorToLayer shuffleI2L; + LayerToTensor shuffleL2O; + auto collectMappingInfo = [&](int32_t const idx) + { + ILayer* l = network.getLayer(idx); + switch (l->getType()) + { + case nvinfer1::LayerType::kMATRIX_MULTIPLY: + { + // assume weights on the second input. 
+ matmulI2L.insert({l->getInput(1), l}); + break; + } + case nvinfer1::LayerType::kCONSTANT: + { + DataType const dtype = static_cast(l)->getWeights().type; + if (dtype == nvinfer1::DataType::kFLOAT || dtype == nvinfer1::DataType::kHALF) + { + // Sparsify float only. + constO2L.insert({l->getOutput(0), l}); + } + break; + } + case nvinfer1::LayerType::kSHUFFLE: + { + shuffleI2L.insert({l->getInput(0), l}); + shuffleL2O.insert({l, l->getOutput(0)}); + break; + } + default: break; + } + }; + int32_t const nbLayers = network.getNbLayers(); + for (int32_t i = 0; i < nbLayers; ++i) + { + collectMappingInfo(i); + } + if (matmulI2L.size() == 0 || constO2L.size() == 0) + { + // No MatrixMultiply or Constant layer found, no weights to sparsify. + return; + } + + // Helper for analysis + auto isTranspose + = [](nvinfer1::Permutation const& perm) -> bool { return (perm.order[0] == 1 && perm.order[1] == 0); }; + auto is2D = [](nvinfer1::Dims const& dims) -> bool { return dims.nbDims == 2; }; + auto isIdenticalReshape = [](nvinfer1::Dims const& dims) -> bool + { + for (int32_t i = 0; i < dims.nbDims; ++i) + { + if (dims.d[i] != i || dims.d[i] != -1) + { + return false; + } + } + return true; + }; + auto tensorReachedViaTranspose = [&](nvinfer1::ITensor* t, bool& needTranspose) -> ITensor* + { + while (shuffleI2L.find(t) != shuffleI2L.end()) + { + nvinfer1::IShuffleLayer* s = static_cast(shuffleI2L.at(t)); + if (!is2D(s->getInput(0)->getDimensions()) || !is2D(s->getReshapeDimensions()) + || !isIdenticalReshape(s->getReshapeDimensions())) + { + break; + } + + if (isTranspose(s->getFirstTranspose())) + { + needTranspose = !needTranspose; + } + if (isTranspose(s->getSecondTranspose())) + { + needTranspose = !needTranspose; + } + + t = shuffleL2O.at(s); + } + return t; + }; + + // 2. 
Forward analysis to collect the Constant layers connected to MatMul via Transpose + std::unordered_map constantLayerToSparse; + for (auto& o2l : constO2L) + { + // If need to transpose the weights of the Constant layer. + // Need to transpose by default due to semantic difference. + bool needTranspose{true}; + ITensor* t = tensorReachedViaTranspose(o2l.first, needTranspose); + if (matmulI2L.find(t) == matmulI2L.end()) + { + continue; + } + + // check MatMul params... + IMatrixMultiplyLayer* mm = static_cast(matmulI2L.at(t)); + bool const twoInputs = mm->getNbInputs() == 2; + bool const all2D = is2D(mm->getInput(0)->getDimensions()) && is2D(mm->getInput(1)->getDimensions()); + bool const isSimple = mm->getOperation(0) == nvinfer1::MatrixOperation::kNONE + && mm->getOperation(1) != nvinfer1::MatrixOperation::kVECTOR; + if (!(twoInputs && all2D && isSimple)) + { + continue; + } + if (mm->getOperation(1) == nvinfer1::MatrixOperation::kTRANSPOSE) + { + needTranspose = !needTranspose; + } + + constantLayerToSparse.insert({static_cast(o2l.second), needTranspose}); + } + + // 3. Finally, sparsify the weights + auto sparsifyConstantWeights = [&sparseWeights](nvinfer1::IConstantLayer* layer, bool const needTranspose) + { + Dims dims = layer->getOutput(0)->getDimensions(); + ASSERT(dims.nbDims == 2); + int32_t const idxN = needTranspose ? 1 : 0; + int32_t const n = dims.d[idxN]; + int32_t const k = dims.d[1 - idxN]; + sparseWeights.emplace_back(); + std::vector& spw = sparseWeights.back(); + Weights w = layer->getWeights(); + DataType const dtype = w.type; + ASSERT(dtype == nvinfer1::DataType::kFLOAT + || dtype == nvinfer1::DataType::kHALF); // non-float weights should have been ignored. 
+ + if (needTranspose) + { + if (dtype == nvinfer1::DataType::kFLOAT) + { + spw.resize(w.count * sizeof(float)); + transpose2DWeights(spw.data(), w.values, k, n); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + spw.resize(w.count * sizeof(half_float::half)); + transpose2DWeights(spw.data(), w.values, k, n); + } + + w.values = spw.data(); + std::vector tmpW; + sparsify(w, n, 1, tmpW); + + if (dtype == nvinfer1::DataType::kFLOAT) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + else if (dtype == nvinfer1::DataType::kHALF) + { + transpose2DWeights(spw.data(), tmpW.data(), n, k); + } + } + else + { + sparsify(w, n, 1, spw); + } + + w.values = spw.data(); + layer->setWeights(w); + }; + for (auto& l : constantLayerToSparse) + { + sparsifyConstantWeights(l.first, l.second); + } +} + +template +void setSparseWeights(L& l, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto weights = l.getKernelWeights(); + sparsify(weights, k, trs, sparseWeights); + weights.values = sparseWeights.data(); + l.setKernelWeights(weights); +} + +// Explicit instantiation +template void setSparseWeights( + IConvolutionLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); +template void setSparseWeights( + IFullyConnectedLayer& l, int32_t k, int32_t trs, std::vector& sparseWeights); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights) +{ + for (int32_t l = 0; l < network.getNbLayers(); ++l) + { + auto* layer = network.getLayer(l); + auto const t = layer->getType(); + if (t == nvinfer1::LayerType::kCONVOLUTION) + { + auto& conv = *static_cast(layer); + auto const& dims = conv.getKernelSizeNd(); + ASSERT(dims.nbDims == 2 || dims.nbDims == 3); + auto const k = conv.getNbOutputMaps(); + auto const trs = std::accumulate(dims.d, dims.d + dims.nbDims, 1, std::multiplies()); + sparseWeights.emplace_back(); + setSparseWeights(conv, k, trs, sparseWeights.back()); + } + else if (t == nvinfer1::LayerType::kFULLY_CONNECTED) + { + auto& 
fc = *static_cast(layer); + auto const k = fc.getNbOutputChannels(); + sparseWeights.emplace_back(); + setSparseWeights(fc, k, 1, sparseWeights.back()); + } + } + + sparsifyMatMulKernelWeights(network, sparseWeights); +} + +void sparsify(Weights const& weights, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + switch (weights.type) + { + case DataType::kFLOAT: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kHALF: + sparsify(static_cast(weights.values), weights.count, k, trs, sparseWeights); + break; + case DataType::kINT8: + case DataType::kINT32: + case DataType::kUINT8: + case DataType::kBOOL: + case DataType::kFP8: break; + } +} + +template +void print(std::ostream& os, T v) +{ + os << v; +} + +void print(std::ostream& os, int8_t v) +{ + os << static_cast(v); +} + +template +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv) +{ + auto const vol = volume(dims); + T const* typedBuffer = static_cast(buffer); + std::string sep; + for (int64_t v = 0; v < vol; ++v) + { + int64_t curV = v; + int32_t dataOffset = 0; + for (int32_t dimIndex = dims.nbDims - 1; dimIndex >= 0; --dimIndex) + { + int32_t dimVal = curV % dims.d[dimIndex]; + if (dimIndex == vectorDim) + { + dataOffset += (dimVal / spv) * strides.d[dimIndex] * spv + dimVal % spv; + } + else + { + dataOffset += dimVal * strides.d[dimIndex] * (vectorDim == -1 ? 
1 : spv); + } + curV /= dims.d[dimIndex]; + ASSERT(curV >= 0); + } + + os << sep; + sep = separator; + print(os, typedBuffer[dataOffset]); + } +} + +// Explicit instantiation +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer<__half>(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); +template void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, Dims const& dims, + Dims const& strides, int32_t vectorDim, int32_t spv); + +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights) +{ + auto const c = count / (k * trs); + sparseWeights.resize(count * sizeof(T)); + auto* sparseValues = reinterpret_cast(sparseWeights.data()); + + constexpr int32_t window = 4; + constexpr int32_t nonzeros = 2; + + int32_t const crs = c * trs; + auto const getIndex = [=](int32_t ki, int32_t ci, int32_t rsi) { return ki * crs + ci * trs + rsi; }; + + for (int64_t ki = 0; ki < k; ++ki) + { + for (int64_t rsi = 0; rsi < trs; ++rsi) + { + int32_t w = 0; + int32_t nz = 0; + for (int64_t ci = 0; ci < c; ++ci) + { + auto const index = getIndex(ki, ci, rsi); + if (nz < nonzeros) + { + sparseValues[index] = values[index]; + ++nz; + } + else + { + sparseValues[index] = 0; + } + if (++w == window) 
+ { + w = 0; + nz = 0; + } + } + } + } +} + +// Explicit instantiation +template void sparsify( + float const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); +template void sparsify( + half_float::half const* values, int64_t count, int32_t k, int32_t trs, std::vector& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n) +{ + ASSERT(dst != src); + T* tdst = reinterpret_cast(dst); + T const* tsrc = reinterpret_cast(src); + for (int32_t mi = 0; mi < m; ++mi) + { + for (int32_t ni = 0; ni < n; ++ni) + { + int32_t const isrc = mi * n + ni; + int32_t const idst = ni * m + mi; + tdst[idst] = tsrc[isrc]; + } + } +} + +// Explicit instantiation +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); +template void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +template ::value, bool>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_int_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +template ::value, int32_t>::type> +void fillBuffer(void* buffer, int64_t volume, T min, T max) +{ + T* typedBuffer = static_cast(buffer); + std::default_random_engine engine; + std::uniform_real_distribution distribution(min, max); + auto generator = [&engine, &distribution]() { return static_cast(distribution(engine)); }; + std::generate(typedBuffer, typedBuffer + volume, generator); +} + +// Explicit instantiation +template void fillBuffer(void* buffer, int64_t volume, bool min, bool max); +template void fillBuffer(void* buffer, int64_t volume, float min, float max); +template void fillBuffer(void* buffer, int64_t volume, int32_t min, int32_t max); +template void 
fillBuffer(void* buffer, int64_t volume, int8_t min, int8_t max); +template void fillBuffer<__half>(void* buffer, int64_t volume, __half min, __half max); +template void fillBuffer(void* buffer, int64_t volume, uint8_t min, uint8_t max); + +} // namespace sample diff --git a/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.h b/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.h new file mode 100644 index 0000000..618c278 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/common/sampleUtils.h @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TRT_SAMPLE_UTILS_H +#define TRT_SAMPLE_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "NvInfer.h" + +#include "common.h" +#include "logger.h" + +#define SMP_RETVAL_IF_FALSE(condition, msg, retval, err) \ + { \ + if ((condition) == false) \ + { \ + (err) << (msg) << std::endl; \ + return retval; \ + } \ + } + +namespace sample +{ + +size_t dataTypeSize(nvinfer1::DataType dataType); + +template +inline T roundUp(T m, T n) +{ + return ((m + n - 1) / n) * n; +} + +//! comps is the number of components in a vector. Ignored if vecDim < 0. 
+int64_t volume(nvinfer1::Dims const& dims, nvinfer1::Dims const& strides, int32_t vecDim, int32_t comps, int32_t batch); + +using samplesCommon::volume; + +nvinfer1::Dims toDims(std::vector const& vec); + +template ::value, bool>::type = true> +void fillBuffer(void* buffer, int64_t volume, T min, T max); + +template ::value, int32_t>::type = 0> +void fillBuffer(void* buffer, int64_t volume, T min, T max); + +template +void dumpBuffer(void const* buffer, std::string const& separator, std::ostream& os, nvinfer1::Dims const& dims, + nvinfer1::Dims const& strides, int32_t vectorDim, int32_t spv); + +void loadFromFile(std::string const& fileName, char* dst, size_t size); + +std::vector splitToStringVec(std::string const& option, char separator); + +bool broadcastIOFormats(std::vector const& formats, size_t nbBindings, bool isInput = true); + +int32_t getCudaDriverVersion(); + +int32_t getCudaRuntimeVersion(); + +void sparsify(nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); +void sparsify(nvinfer1::Weights const& weights, int32_t k, int32_t rs, std::vector& sparseWeights); + +// Walk the weights elements and overwrite (at most) 2 out of 4 elements to 0. +template +void sparsify(T const* values, int64_t count, int32_t k, int32_t rs, std::vector& sparseWeights); + +template +void setSparseWeights(L& l, int32_t k, int32_t rs, std::vector& sparseWeights); + +// Sparsify the weights of Constant layers that are fed to MatMul via Shuffle layers. +// Forward analysis on the API graph to determine which weights to sparsify. 
+void sparsifyMatMulKernelWeights( + nvinfer1::INetworkDefinition& network, std::vector>& sparseWeights); + +template +void transpose2DWeights(void* dst, void const* src, int32_t const m, int32_t const n); + +} // namespace sample + +#endif // TRT_SAMPLE_UTILS_H diff --git a/Code/TestTRTInterDll/trtinfer_lib/include/MI_Interface.h b/Code/TestTRTInterDll/trtinfer_lib/include/MI_Interface.h new file mode 100644 index 0000000..45780ba --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/include/MI_Interface.h @@ -0,0 +1,32 @@ +#pragma once + +#include "utils.h" +#include "kernel_function.cuh" + + +class MI_VisionInterface +{ +public: + ~MI_VisionInterface() = default; + + // ʼengine + virtual bool initEngine(const std::string& _onnxFileName) = 0; + // ģ + virtual bool check() = 0; + // + virtual bool doTRTInfer(const std::vector& _bufImg, std::vector* _detectRes, int* _user) = 0; + // + virtual bool doTRTInfer(const std::vector& _matImgs, std::vector* _detectRes, int* _user) = 0; + // ȡϢ + virtual std::string getError() = 0; + // ͷ/ڴ + virtual void freeMemeory() = 0; + + + virtual bool measureAxis(std::vector& measureRes, const MN_VisionImage::MS_ImageParam& _bufImg) = 0; + +}; + +// ӿ +MI_ALGORITHM_EXPORT MI_VisionInterface* getInterfacePtr(const utils::InitParameter& _params); + diff --git a/Code/TestTRTInterDll/trtinfer_lib/include/MS_Image_Param.h b/Code/TestTRTInterDll/trtinfer_lib/include/MS_Image_Param.h new file mode 100644 index 0000000..3589af1 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/include/MS_Image_Param.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +typedef unsigned char uchar; + +namespace MN_VisionImage { + + enum class ME_ImageType + { + E_GRAY = 0, + E_RGB, + E_RGBA + }; + + struct MS_ImageParam + { + //޲ι + MS_ImageParam() : + m_width(-1), + m_height(-1), + m_channels(0), + mImgType(MN_VisionImage::ME_ImageType::E_RGB) + {} + + //вι캯 + MS_ImageParam(uchar* _buffer, int _nW, int _nH, const ME_ImageType& _imgType) + { + int 
_nChannels = 0; + if (_imgType == ME_ImageType::E_GRAY) + { + _nChannels = 1; + } + else if (_imgType == ME_ImageType::E_RGBA) + { + _nChannels = 4; + } + else + { + _nChannels = 3; + } + + m_width = _nW; + m_height = _nH; + m_channels = _nChannels; + mImgType = _imgType; + int iSize = _nW * _nH * _nChannels; //ͼ + m_data = std::shared_ptr(new uchar[iSize], [](uchar* p) { + if (p != nullptr) + { + delete[] p; + p = nullptr; + } + }); + + memcpy(m_data.get(), _buffer, iSize); + } + + std::shared_ptr m_data; // ͼ + int m_width; // ͼ + int m_height; // ͼ߶ + int m_channels; // ͼͨ + ME_ImageType mImgType; // ͼ + }; +} \ No newline at end of file diff --git a/Code/TestTRTInterDll/trtinfer_lib/include/common_include.h b/Code/TestTRTInterDll/trtinfer_lib/include/common_include.h new file mode 100644 index 0000000..e1fa3fb --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/include/common_include.h @@ -0,0 +1,35 @@ +#pragma once +// tensorrt +#include "argsParser.h" +#include "buffers.h" +#include "common.h" +#include "logger.h" +#include "parserOnnxConfig.h" +#include "NvOnnxParser.h" +#include +// cuda +#include +#include +#include +#include +#include +#include +#include +#include +#include +// opencv +#include +// cpp std +#include "Windows.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MS_Image_Param.h" \ No newline at end of file diff --git a/Code/TestTRTInterDll/trtinfer_lib/include/kernel_function.cuh b/Code/TestTRTInterDll/trtinfer_lib/include/kernel_function.cuh new file mode 100644 index 0000000..dbc2e89 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/include/kernel_function.cuh @@ -0,0 +1,43 @@ +#pragma once +#include "common_include.h" +#include "utils.h" + +#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__) + +bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line); + +#define BLOCK_SIZE 8 + +//note: resize rgb with padding 
+void resizeDevice(const int& batch_size, float* src, int src_width, int src_height, + float* dst, int dstWidth, int dstHeight, + float paddingValue, utils::AffineMat matrix); + +//overload:resize rgb with padding, but src's type is uin8 +void resizeDevice(const int& batch_size, unsigned char* src, int src_width, int src_height, + float* dst, int dstWidth, int dstHeight, + float paddingValue, utils::AffineMat matrix); + +// overload: resize rgb/gray without padding +void resizeDevice(const int& batchSize, float* src, int srcWidth, int srcHeight, + float* dst, int dstWidth, int dstHeight, + utils::ColorMode mode, utils::AffineMat matrix); + +void bgr2rgbDevice(const int& batch_size, float* src, int srcWidth, int srcHeight, + float* dst, int dstWidth, int dstHeight); + +void normDevice(const int& batch_size, float* src, int srcWidth, int srcHeight, + float* dst, int dstWidth, int dstHeight, + utils::InitParameter norm_param); + +void hwc2chwDevice(const int& batch_size, float* src, int srcWidth, int srcHeight, + float* dst, int dstWidth, int dstHeight); + +void decodeDevice(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcLength, float* dst, int dstWidth, int dstHeight); + +// nms fast +void nmsDeviceV1(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcArea); + +// nms sort +void nmsDeviceV2(utils::InitParameter param, float* src, int srcWidth, int srcHeight, int srcArea, + int* idx, float* conf); \ No newline at end of file diff --git a/Code/TestTRTInterDll/trtinfer_lib/include/utils.h b/Code/TestTRTInterDll/trtinfer_lib/include/utils.h new file mode 100644 index 0000000..5b3dea6 --- /dev/null +++ b/Code/TestTRTInterDll/trtinfer_lib/include/utils.h @@ -0,0 +1,280 @@ +#pragma once +#include "common_include.h" + + +// ӿڵ +#ifndef BUILD_STATIC +# if defined(MI_ALGORITHM_LIB) +# define MI_ALGORITHM_EXPORT __declspec(dllexport) +# else +# define MI_ALGORITHM_EXPORT __declspec(dllimport) +# endif +#else +# define 
MI_ALGORITHM_EXPORT +#endif + + +namespace utils +{ + namespace dataSets + { + const std::vector coco80 = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush" + }; + const std::vector coco91 = { + "person", "bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light", + "fire hydrant","street sign","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe", + "hat","backpack","umbrella","shoe","eye glasses","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat", + "baseball glove","skateboard","surfboard","tennis racket","bottle","plate","wine glass","cup","fork","knife","spoon","bowl","banana","apple", + "sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","mirror","dining table","window", + "desk","toilet","door","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","blender", + "book","clock","vase","scissors","teddy bear","hair drier","toothbrush","hair brush" + }; + const std::vector voc20 = { + 
"aeroplane","bicycle","bird","boat","bottle","bus","car","cat","chair","cow","diningtable", + "dog","horse","motorbike","person","pottedplant","sheep","sofa","train","tvmonitor" + }; + + const std::vector face2 = { "non-face", "face" }; + + // flower_data + const std::vector flower_labels = { "dailsy", "dandelion", "rose", "sunflower", "tulip" }; + } + namespace Colors + { + const std::vector color80{ + cv::Scalar(128, 77, 207),cv::Scalar(65, 32, 208),cv::Scalar(0, 224, 45),cv::Scalar(3, 141, 219),cv::Scalar(80, 239, 253),cv::Scalar(239, 184, 12), + cv::Scalar(7, 144, 145),cv::Scalar(161, 88, 57),cv::Scalar(0, 166, 46),cv::Scalar(218, 113, 53),cv::Scalar(193, 33, 128),cv::Scalar(190, 94, 113), + cv::Scalar(113, 123, 232),cv::Scalar(69, 205, 80),cv::Scalar(18, 170, 49),cv::Scalar(89, 51, 241),cv::Scalar(153, 191, 154),cv::Scalar(27, 26, 69), + cv::Scalar(20, 186, 194),cv::Scalar(210, 202, 167),cv::Scalar(196, 113, 204),cv::Scalar(9, 81, 88),cv::Scalar(191, 162, 67),cv::Scalar(227, 73, 120), + cv::Scalar(177, 31, 19),cv::Scalar(133, 102, 137),cv::Scalar(146, 72, 97),cv::Scalar(145, 243, 208),cv::Scalar(2, 184, 176),cv::Scalar(219, 220, 93), + cv::Scalar(238, 153, 134),cv::Scalar(197, 169, 160),cv::Scalar(204, 201, 106),cv::Scalar(13, 24, 129),cv::Scalar(40, 38, 4),cv::Scalar(5, 41, 34), + cv::Scalar(46, 94, 129),cv::Scalar(102, 65, 107),cv::Scalar(27, 11, 208),cv::Scalar(191, 240, 183),cv::Scalar(225, 76, 38),cv::Scalar(193, 89, 124), + cv::Scalar(30, 14, 175),cv::Scalar(144, 96, 90),cv::Scalar(181, 186, 86),cv::Scalar(102, 136, 34),cv::Scalar(158, 71, 15),cv::Scalar(183, 81, 247), + cv::Scalar(73, 69, 89),cv::Scalar(123, 73, 232),cv::Scalar(4, 175, 57),cv::Scalar(87, 108, 23),cv::Scalar(105, 204, 142),cv::Scalar(63, 115, 53), + cv::Scalar(105, 153, 126),cv::Scalar(247, 224, 137),cv::Scalar(136, 21, 188),cv::Scalar(122, 129, 78),cv::Scalar(145, 80, 81),cv::Scalar(51, 167, 149), + cv::Scalar(162, 173, 20),cv::Scalar(252, 202, 17),cv::Scalar(10, 40, 3),cv::Scalar(150, 
90, 254),cv::Scalar(169, 21, 68),cv::Scalar(157, 148, 180), + cv::Scalar(131, 254, 90),cv::Scalar(7, 221, 102),cv::Scalar(19, 191, 184),cv::Scalar(98, 126, 199),cv::Scalar(210, 61, 56),cv::Scalar(252, 86, 59), + cv::Scalar(102, 195, 55),cv::Scalar(160, 26, 91),cv::Scalar(60, 94, 66),cv::Scalar(204, 169, 193),cv::Scalar(126, 4, 181),cv::Scalar(229, 209, 196), + cv::Scalar(195, 170, 186),cv::Scalar(155, 207, 148) + }; + const std::vector color91{ + cv::Scalar(148, 99, 164),cv::Scalar(65, 172, 90),cv::Scalar(18, 117, 190),cv::Scalar(173, 208, 229),cv::Scalar(37, 162, 147),cv::Scalar(121, 99, 42), + cv::Scalar(218, 173, 104),cv::Scalar(193, 213, 138),cv::Scalar(142, 168, 45),cv::Scalar(107, 143, 94),cv::Scalar(242, 89, 7),cv::Scalar(87, 218, 248), + cv::Scalar(126, 168, 9),cv::Scalar(86, 152, 105),cv::Scalar(155, 135, 251),cv::Scalar(73, 234, 44),cv::Scalar(177, 37, 42),cv::Scalar(219, 215, 54), + cv::Scalar(124, 207, 143),cv::Scalar(7, 81, 209),cv::Scalar(254, 18, 130),cv::Scalar(71, 54, 73),cv::Scalar(172, 198, 63),cv::Scalar(64, 217, 224), + cv::Scalar(105, 224, 25),cv::Scalar(41, 52, 130),cv::Scalar(220, 27, 193),cv::Scalar(65, 222, 86),cv::Scalar(250, 150, 201),cv::Scalar(201, 150, 105), + cv::Scalar(104, 96, 142),cv::Scalar(111, 230, 54),cv::Scalar(105, 24, 22),cv::Scalar(42, 226, 101),cv::Scalar(67, 26, 144),cv::Scalar(155, 113, 106), + cv::Scalar(152, 196, 216),cv::Scalar(58, 68, 152),cv::Scalar(68, 230, 213),cv::Scalar(169, 143, 129),cv::Scalar(191, 102, 41),cv::Scalar(5, 73, 170), + cv::Scalar(15, 73, 233),cv::Scalar(95, 13, 71),cv::Scalar(25, 92, 218),cv::Scalar(85, 173, 16),cv::Scalar(247, 158, 17),cv::Scalar(36, 28, 8), + cv::Scalar(31, 100, 134),cv::Scalar(131, 71, 45),cv::Scalar(158, 190, 91),cv::Scalar(90, 207, 220),cv::Scalar(125, 77, 228),cv::Scalar(40, 156, 67), + cv::Scalar(35, 250, 69),cv::Scalar(229, 61, 245),cv::Scalar(210, 201, 106),cv::Scalar(184, 35, 131),cv::Scalar(47, 124, 120),cv::Scalar(1, 114, 23), + cv::Scalar(99, 181, 17),cv::Scalar(77, 
141, 151),cv::Scalar(79, 33, 95),cv::Scalar(194, 111, 146),cv::Scalar(187, 199, 138),cv::Scalar(129, 215, 40), + cv::Scalar(160, 209, 144),cv::Scalar(139, 121, 58),cv::Scalar(97, 208, 197),cv::Scalar(185, 105, 171),cv::Scalar(160, 96, 136),cv::Scalar(232, 26, 26), + cv::Scalar(34, 165, 109),cv::Scalar(19, 86, 215),cv::Scalar(205, 209, 199),cv::Scalar(131, 91, 25),cv::Scalar(51, 201, 16),cv::Scalar(64, 35, 128), + cv::Scalar(120, 161, 247),cv::Scalar(123, 164, 190),cv::Scalar(15, 191, 40),cv::Scalar(11, 44, 117),cv::Scalar(198, 136, 70),cv::Scalar(14, 224, 240), + cv::Scalar(60, 186, 193),cv::Scalar(253, 190, 129),cv::Scalar(134, 228, 173),cv::Scalar(219, 156, 214),cv::Scalar(137, 67, 254),cv::Scalar(178, 223, 250), + cv::Scalar(219, 199, 139) + }; + const std::vector color20{ + cv::Scalar(128, 77, 207),cv::Scalar(65, 32, 208),cv::Scalar(0, 224, 45),cv::Scalar(3, 141, 219),cv::Scalar(80, 239, 253),cv::Scalar(239, 184, 12), + cv::Scalar(7, 144, 145),cv::Scalar(161, 88, 57),cv::Scalar(0, 166, 46),cv::Scalar(218, 113, 53),cv::Scalar(193, 33, 128),cv::Scalar(190, 94, 113), + cv::Scalar(113, 123, 232),cv::Scalar(69, 205, 80),cv::Scalar(18, 170, 49),cv::Scalar(89, 51, 241),cv::Scalar(153, 191, 154),cv::Scalar(27, 26, 69), + cv::Scalar(20, 186, 194),cv::Scalar(210, 202, 167),cv::Scalar(196, 113, 204),cv::Scalar(9, 81, 88),cv::Scalar(191, 162, 67),cv::Scalar(227, 73, 120) + }; + } + + // JC_Xiong-20240424 + // ģ/ܣͨöʵģͶ + enum class ME_ModelType + { + E_RESNET34 = 0, + E_RESNET50, + E_YOLOV8, + }; + + // ǰƷ + enum class ME_DetectRes + { + E_DETECT_OK = 0, + E_DETECT_NG, + E_DETECT_NONE, + }; + + // ͼ෵ؽ + typedef struct MS_Classification + { + MS_Classification() :mDetectRes(ME_DetectRes::E_DETECT_OK), mConfidence(0.0), mLabel("") + {} + + ME_DetectRes mDetectRes; + double mConfidence; + std::string mLabel; + }MS_ClassificationParam; + + struct Box + { + float left, top, right, bottom, confidence; + int label; + std::vector land_marks; + + Box() = default; + Box(float left, 
float top, float right, float bottom, float confidence, int label) : + left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label) {} + + Box(float left, float top, float right, float bottom, float confidence, int label, int numLandMarks) : + left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label) + { + land_marks.reserve(numLandMarks); + } + }; + + // Ŀⷵؽ + typedef struct MS_ObjectDetect + { + MS_ObjectDetect() :mDetectRes(ME_DetectRes::E_DETECT_OK), mBoxVec(std::vector()) + {} + + ME_DetectRes mDetectRes; + std::vector mBoxVec; + }MS_ObjectDetectParam; + + + struct MR_Result + { + MR_Result() : mObjectDecRes(), mClassifyDecRes() + {} + + MS_ClassificationParam mClassifyDecRes; + std::vector> mObjectDecRes; + }; + + struct InitParameter + { + InitParameter() :num_class(5), dynamic_batch(false), batch_size(1), dst_h(0), dst_w(0), scale(255.0f), + meanVec{ 0.0f,0.0f,0.0f }, stdVec{ 1.0f,1.0f,1.0f }, iou_thresh(0.5), conf_thresh(0.5), topK(1000), + save_path(""), char_width(11), det_info_render_width(15), font_scale(0.6), is_show(false), is_save(false) + {} + + + ME_ModelType m_modelType; // ģ + int num_class; // flower_data + std::vector class_names; + std::vector input_output_names; + + bool dynamic_batch; + int batch_size; // + MN_VisionImage::MS_ImageParam mImage; + int dst_h, dst_w; // 뵽ģ͵ͼ + + float scale; + std::vector stdVec; + std::vector meanVec; + + float iou_thresh; + float conf_thresh; + + int topK; + std::string save_path; + + std::string winname = "TensorRT-Infer"; + int char_width; + int det_info_render_width; + double font_scale; + bool is_show; + bool is_save; + }; + + // legacy + struct CandidateObject + { + float mBboxAndkeyPoints[14]; // bbox:[x y w h] + 5 facial key points:[x1 y1 x2 y2 ...x5 y5] + float mScore; + bool mIsGood; + CandidateObject() + { + std::fill_n(mBboxAndkeyPoints, 14, FLT_MAX); + mScore = FLT_MAX; + mIsGood = true; + } + CandidateObject(float* bboxAndkeyPoints, 
float score, bool isGood) : + mScore(score), + mIsGood(isGood) + { + memcpy(mBboxAndkeyPoints, bboxAndkeyPoints, 14 * sizeof(float)); + } + }; + + + + enum class InputStream { IMAGE, VIDEO, CAMERA }; + + enum class ColorMode { RGB, GRAY }; + + struct AffineMat + { + float v0, v1, v2; + float v3, v4, v5; + }; + + + void saveBinaryFile(float* vec, size_t len, const std::string& file); + + std::vector readBinaryFile(const std::string& file); + + std::vector loadModel(const std::string& file); + + std::string getSystemTimeStr(); + + bool setInputStream(const InputStream& source, const std::string& imagePath, const std::string& videoPath, const int& cameraID, + cv::VideoCapture& capture, int& totalBatches, int& delayTime, InitParameter& param); + + void setRenderWindow(InitParameter& param); + + std::string getTimeStamp(); + + void show(const std::vector>& objectss, + const std::vector& classNames, + const int& cvDelayTime, std::vector& imgsBatch); + + void save(const std::vector>& objectss, + const std::vector& classNames, + const std::string& savePath, std::vector& imgsBatch, + const int& batchSize, const int& batchi); + + class HostTimer + { + public: + HostTimer(); + float getUsedTime(); // while timing for cuda code, add "cudaDeviceSynchronize();" before this + ~HostTimer(); + + private: + std::chrono::steady_clock::time_point t1; + std::chrono::steady_clock::time_point t2; + }; + + + class DeviceTimer + { + public: + DeviceTimer(); + float getUsedTime(); + // overload + DeviceTimer(cudaStream_t ctream); + float getUsedTime(cudaStream_t ctream); + + ~DeviceTimer(); + + private: + cudaEvent_t start, end; + }; +} diff --git a/Code/TestTRTInterDll/vs2019-opencv-release-X64.props b/Code/TestTRTInterDll/vs2019-opencv-release-X64.props new file mode 100644 index 0000000..7b6a329 --- /dev/null +++ b/Code/TestTRTInterDll/vs2019-opencv-release-X64.props @@ -0,0 +1,16 @@ + + + + + + + + 
..\MF_TRTInfer\lib\opencv_lib\include\opencv2;..\MF_TRTInfer\lib\opencv_lib\include;%(AdditionalIncludeDirectories) + + + ..\MF_TRTInfer\lib\opencv_lib\x64\vc15\lib;%(AdditionalLibraryDirectories) + opencv_world453.lib;%(AdditionalDependencies) + + + + \ No newline at end of file