pytorch經(jīng)onnx轉(zhuǎn)tensorrt初體驗上、下中學(xué)習(xí)了tensorrt如何調(diào)用onnx模型,但其中遇到的問題是tensorrt7沒有辦法直接輸入動態(tài)batchsize的數(shù)據(jù),當(dāng)batchsize>1時只有第一個sample的結(jié)果是正確的,而其后的samples的輸出都為0. 本文主要是探索如何進行批量化的處理。
1. 添加輔助引擎。
這是TensorRT/samples/sampleDynamicReshape/sampleDynamicReshape.cpp 中給出的一個解決方案,其主要思路是在原有的INetwork 之前再創(chuàng)建一個用于input resize的Network, 該Network的主要功能是對可變的輸入進行resize,以及設(shè)置配置文件和參數(shù)綁定。
其中,最主要的部分如下:
// Finally, configure and build the preprocessor engine.
auto preprocessorConfig = makeUnique(builder->createBuilderConfig());
// Create an optimization profile so that we can specify a range of input dimensions.
auto profile = builder->createOptimizationProfile();
// This profile will be valid for all images whose size falls in the range of [(1, 1, 1, 1), (1, 1, 56, 56)]
// but TensorRT will optimize for (1, 1, 28, 28)
profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{1, 1, 1, 1});
profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{1, 1, 28, 28});
profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{1, 1, 56, 56});
preprocessorConfig->addOptimizationProfile(profile);
mPreprocessorEngine = makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig));
其中配置器profile指定了輸入的最小尺寸、最優(yōu)尺寸和最大尺寸。那么真實輸入時,只要輸入尺寸處在最小和最大尺寸之間即可。
2. 直接給context 配置profile文件。
參考:# TensorRT 7 ONNX models with variable batch size

另一個比較簡潔的使用Python API的: 示例
- 按照文檔,首先我們需要生成一個onnx模型,我們這里生成兩個模型,分別對應(yīng)batchsize固定的
resnet18.onnx和batchsize可變resnet18_dynamic.onnx
# -*- coding: utf-8 -*-
import onnx
import torch
import torchvision
import netron


def export_resnet18_onnx(net, export_path, dynamic=False):
    """Export *net* to ONNX at *export_path*.

    When *dynamic* is True, dimension 0 (the batch dimension) of both the
    "input" and "output" nodes is marked variable and named "batch_size",
    so TensorRT can later build an engine accepting variable batch sizes.
    """
    dynamic_axes = None
    if dynamic:
        # Declare dim 0 of input/output as a variable axis named "batch_size".
        dynamic_axes = {"input": {0: "batch_size"}, "output": {0: "batch_size"}}
    torch.onnx.export(
        net,
        # Dummy input: fixes the input size and the shape of every node in
        # the traced inference graph (except axes declared dynamic above).
        torch.randn(1, 3, 224, 224, device='cuda'),
        export_path,                  # output file name
        verbose=False,                # print the graph as a string if True
        input_names=["input"],        # name of the input node
        output_names=["output"],      # name of the output node
        opset_version=10,             # ONNX operator set; depends on the PyTorch version
        do_constant_folding=True,     # fold constants at export time
        dynamic_axes=dynamic_axes,
    )


net = torchvision.models.resnet18(pretrained=True).cuda()
net.eval()
# Fixed-batch model and dynamic-batch model share one export path.
export_resnet18_onnx(net, "./resnet18.onnx")
export_resnet18_onnx(net, "./resnet18_dynamic.onnx", dynamic=True)
- 然后我們使用 TensorRT 提供的
trtexec工具由onnx模型直接生成并保存cuda引擎。
trtexec指令的位置:<path-to-TensorRT>/bin, 所以把該路徑添加到PATH環(huán)境變量中
export PATH=/home/zwzhou/packages/TensorRT-7.0.0.11/bin:$PATH
測試 trtexec -h 發(fā)現(xiàn)

查看cuda版本
nvcc -V 以及對應(yīng)PATH變量 echo $PATH均未發(fā)現(xiàn)異常。然后輸出
echo $LD_LIBRARY_PATH 發(fā)現(xiàn):
發(fā)現(xiàn)LD_LIBRARY_PATH的第一個cuda路徑對應(yīng)著 cuda9.2, 這是因為之前root環(huán)境安裝的是cuda環(huán)境,所以此時需要將自己安裝的cuda版本放在第一個搜索的路徑上:
export LD_LIBRARY_PATH=/home/zwzhou/cuda-9.0/lib64:${LD_LIBRARY_PATH}
再次 trtexec -h 即正確顯示幫助信息。其中給出了 model options、build options、 inference options和system options等。
a. 從固定尺寸的onnx轉(zhuǎn)cudaEngine
···
trtexec --explicitBatch --onnx=./resnet18.onnx --saveEngine=resnet18.engine
···
b.從可變尺寸的onnx轉(zhuǎn)cudaEngine,需要指定profile。
trtexec --onnx=./resnet18_dynamic.onnx --explicitBatch \
    --minShapes="input":1x3x224x224 \
    --optShapes="input":16x3x224x224 \
    --maxShapes="input":32x3x224x224 \
    --shapes="input":1x3x224x224 \
    --saveEngine=resnet18_dynamic.engine
c. 接下來看一下python API的調(diào)用
import argparse
import time
from typing import Tuple, List

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
BatchSize = 32
# Helpers that tell whether a binding shape is fixed or dynamic.
# (The original used a C++-style `//` comment here, which is a SyntaxError
# in Python; replaced with `#`.)
def is_fixed(shape: Tuple[int]):
    """Return True when every dimension of *shape* is a concrete size."""
    return not is_dynamic(shape)


def is_dynamic(shape: Tuple[int]):
    """Return True when any dimension of *shape* is None or negative (e.g. -1)."""
    return any(dim is None or dim < 0 for dim in shape)
def load_engine(filename: str):
    """Read a serialized engine file and deserialize it into a CUDA engine."""
    with open(filename, "rb") as engine_file:
        with trt.Runtime(TRT_LOGGER) as runtime:
            serialized = engine_file.read()
            return runtime.deserialize_cuda_engine(serialized)
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    """Return (input_binding_idxs, output_binding_idxs) for one profile.

    With multiple optimization profiles, the engine's bindings are laid out
    profile after profile, so the bindings belonging to *profile_index* form
    a contiguous index range.
    """
    per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * per_profile
    end_binding = start_binding + per_profile  # exclusive

    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(engine.num_optimization_profiles))
    print("\tNumber of bindings per profile: {}".format(per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index, start_binding))
    print("\tLast binding for profile {}: {}".format(profile_index, end_binding - 1))

    # Partition this profile's binding indices into inputs and outputs.
    profile_bindings = range(start_binding, end_binding)
    input_binding_idxs = [b for b in profile_bindings if engine.binding_is_input(b)]
    output_binding_idxs = [b for b in profile_bindings if not engine.binding_is_input(b)]
    return input_binding_idxs, output_binding_idxs
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    """Fix the dynamic input shapes on *context* and allocate output buffers.

    Once every input binding has a concrete shape, TensorRT can compute the
    output shapes internally; a host (numpy) buffer and a device buffer of
    matching byte size are then allocated per output binding.
    Returns (host_outputs, device_outputs).
    """
    # Pin down each dynamic input shape so the output shapes become known.
    for host_array, in_idx in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(in_idx, host_array.shape)
    assert context.all_binding_shapes_specified

    host_outputs, device_outputs = [], []
    for out_idx in output_binding_idxs:
        resolved_shape = context.get_binding_shape(out_idx)
        # Host-side buffer that will receive results copied back from the GPU.
        host_buf = np.empty(resolved_shape, dtype=np.float32)
        host_outputs.append(host_buf)
        # Device-side allocation of the same byte size.
        device_outputs.append(cuda.mem_alloc(host_buf.nbytes))
    return host_outputs, device_outputs
def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
    seed: int = 42,
):
    """Build seeded random numpy inputs for every input binding.

    Fixed binding shapes are used as-is. Dynamic shapes are resolved to a
    concrete (BatchSize, 3, 224, 224); any shape between the profile's kMIN
    and kMAX would be equally valid.
    """
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)

    host_inputs = []
    for in_idx in input_binding_idxs:
        binding_shape = context.get_binding_shape(in_idx)
        binding_name = engine.get_binding_name(in_idx)
        print("\tInput [{}] shape: {}".format(binding_name, binding_shape))
        if is_dynamic(binding_shape):
            # Look up the active profile's min/opt/max only for reporting;
            # the inference shape is then fixed to the module-level BatchSize.
            active_profile = context.active_optimization_profile
            profile_shapes = engine.get_profile_shape(active_profile, in_idx)
            print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".format(binding_name, *profile_shapes))
            # 0=min, 1=opt, 2=max, or choose any shape, (min <= shape <= max)
            binding_shape = (BatchSize, 3, 224, 224)
            print("\tInput [{}] shape was dynamic, setting inference shape to {}".format(binding_name, binding_shape))
        host_inputs.append(np.random.random(binding_shape).astype(np.float32))
    return host_inputs
主函數(shù):
def main():
    """Deserialize a TensorRT engine, run 1000 inferences on random inputs, report timing.

    Fixes over the original: `time` was used without being imported
    (NameError), and two C++-style `//` comments were Python SyntaxErrors.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--engine", required=True, type=str,
                        help="Path to TensorRT engine file.")
    parser.add_argument("-s", "--seed", type=int, default=42,
                        help="Random seed for reproducibility.")
    args = parser.parse_args()

    # Load a serialized engine into memory and deserialize it.
    engine = load_engine(args.engine)
    print("Loaded engine: {}".format(args.engine))

    # Create an execution context; it can be re-used across inferences.
    context = engine.create_execution_context()
    # A context may carry several profiles, each bounding the valid range of
    # input shapes; profile 0 (the first) is used by default.
    context.active_optimization_profile = 0
    print("Active Optimization Profile: {}".format(context.active_optimization_profile))

    # These binding indices can change if either the context or the
    # active_optimization_profile is changed.
    input_binding_idxs, output_binding_idxs = get_binding_idxs(
        engine, context.active_optimization_profile
    )
    # Names of the input bindings.
    input_names = [engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs]

    # Generate random inputs based on the profile shapes.
    host_inputs = get_random_inputs(engine, context, input_binding_idxs, seed=args.seed)

    # Allocate device memory for inputs. This can be re-used as long as the
    # input shapes don't change.
    device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]
    # Copy host inputs to device; this must be done for each new input.
    for h_input, d_input in zip(host_inputs, device_inputs):
        cuda.memcpy_htod(d_input, h_input)

    print("Input Metadata")
    print("\tNumber of Inputs: {}".format(len(input_binding_idxs)))
    print("\tInput Bindings for Profile {}: {}".format(context.active_optimization_profile, input_binding_idxs))
    print("\tInput names: {}".format(input_names))
    print("\tInput shapes: {}".format([inp.shape for inp in host_inputs]))

    # This must be called every time the input shapes change; if they are
    # always the same (same batch size, etc.), once is enough.
    host_outputs, device_outputs = setup_binding_shapes(
        engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
    )
    output_names = [engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs]

    print("Output Metadata")
    print("\tNumber of Outputs: {}".format(len(output_binding_idxs)))
    print("\tOutput names: {}".format(output_names))
    print("\tOutput shapes: {}".format([out.shape for out in host_outputs]))
    print("\tOutput Bindings for Profile {}: {}".format(context.active_optimization_profile, output_binding_idxs))

    # Bindings are a list of device pointers for inputs and outputs.
    bindings = device_inputs + device_outputs

    # Inference: 1000 synchronous runs (execute_v2 blocks until done,
    # unlike execute_async_v2).
    t1 = time.time()
    for i in range(1000):
        context.execute_v2(bindings)
    t2 = time.time()
    print("Inference iterations: {}".format(((t2-t1))))
    print("Inference iterations per sample: {}".format(((t2-t1)/BatchSize)))

    # Copy outputs back to host to view results.
    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)
    # View outputs
    # print("Inference Outputs:", host_outputs)

    # Cleanup (can also use context managers instead).
    del context
    del engine
下表給出的是V100上resnet18的前向推斷時間(ms)
| bs | 1 | 2 | 4 | 6 | 8 | 12 | 16 | 24 | 32 |
|---|---|---|---|---|---|---|---|---|---|
| all time | 1.57 | 1.67 | 1.78 | 2.67 | 2.81 | 3.83 | 4.80 | 6.43 | 8.65 |
| avg time | 1.57 | 0.84 | 0.45 | 0.44 | 0.35 | 0.32 | 0.30 | 0.27 | 0.27 |
總結(jié)一下,dynamic batchsize的處理流程:
- 生成 batch可變化的onnx,當(dāng)然這一步非必須,可以后面tensorrt中修改
- 將onnx模型保存成 engine文件,可以使用trtexec工具
- 輸入在profile限定尺寸范圍內(nèi)的數(shù)據(jù),并分配host和device空間
- 根據(jù)輸入的尺寸,推導(dǎo)輸出變量的尺寸,并分配host和device空間
- execute_v2進行推理
- 將輸出由cuda拷貝到cpu進行處理