A quick word on why converting PyTorch to ONNX matters. After training a deep learning model in PyTorch, deploying it with TensorRT or OpenVINO requires converting the PyTorch model to an ONNX model first, and only then performing the downstream conversion. So whenever a trained PyTorch model is to be deployed with TensorRT, OpenVINO, or OpenCV plus onnxruntime, the PyTorch-to-ONNX step is unavoidable. This article walks through the pitfalls I ran into while converting a PyTorch model to ONNX in Python.
Configuration
Ubuntu 16.04
python 3.6
onnx 1.6
pytorch 1.5
pycuda 2019.1.2
torchvision 0.1.8
It is recommended to read the following carefully and set up the environment first: https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#import_onnx_python
Steps
1. Convert the PyTorch model to an ONNX model
The PyTorch model used here was generated from Darknet.
import torch
from torch.autograd import Variable
import onnx
input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 544, 544)).cuda()
model = x.model.cuda()  # x.model is the model I generated
# model = torch.load('', map_location='cuda:0')
torch.onnx.export(model, input, 'model.onnx', input_names=input_name, output_names=output_name, verbose=True)
Note: if .cuda() is not added, i.e.
# model = x.model.cuda()
model = x.model
the following error is raised:
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
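The error means the model weights are still on the CPU while the dummy input is on the GPU; both must live on the same device. Exporting entirely on the CPU works just as well. A minimal sketch (x.model again stands for your own model):
# keep both the model and the dummy input on the CPU
model = x.model.cpu().eval()
input = torch.randn(1, 3, 544, 544)  # no .cuda()
torch.onnx.export(model, input, 'model.onnx', input_names=input_name, output_names=output_name)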
2. Check the model
model = onnx.load('model.onnx')
onnx.checker.check_model(model)
print('==> Passed')
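Since onnxruntime deployment was mentioned above, a quick sanity check is to run the exported model with onnxruntime before moving on to TensorRT. A minimal sketch (this assumes onnxruntime is installed; it is not listed in the configuration above):
import onnxruntime as ort
import numpy as np
session = ort.InferenceSession('model.onnx')
dummy = np.random.randn(1, 3, 544, 544).astype(np.float32)
# 'input' and 'output' are the names passed to torch.onnx.export earlier
ort_out = session.run(['output'], {'input': dummy})[0]
print('onnxruntime output shape:', ort_out.shape)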
3. Test the ONNX model: compare TensorRT inference against the original PyTorch model
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision
filename = '000000.jpg'
max_batch_size = 1
onnx_model_path = 'yolo.onnx'
TRT_LOGGER = trt.Logger() # This logger is required to build an engine
def get_img_np_nchw(filename):
image = cv2.imread(filename)
image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_cv = cv2.resize(image_cv, (1920, 1080))
miu = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
img_np = np.array(image_cv, dtype=float) / 255.
r = (img_np[:, :, 0] - miu[0]) / std[0]
g = (img_np[:, :, 1] - miu[1]) / std[1]
b = (img_np[:, :, 2] - miu[2]) / std[2]
img_np_t = np.array([r, g, b])
img_np_nchw = np.expand_dims(img_np_t, axis=0)
return img_np_nchw
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
'''Within this context, host means the CPU memory and device means the GPU memory
'''
self.host = host_mem
self.device = device_mem
def __str__(self):
return 'Host:\n' + str(self.host) + '\nDevice:\n' + str(self.device)
def __repr__(self):
return self.__str__()
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path='', engine_file_path='', \
fp16_mode=False, int8_mode=False, save_engine=False,
):
'''Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it.'''
def build_engine(max_batch_size, save_engine):
'''Takes an ONNX file and creates a TensorRT engine to run inference with'''
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(EXPLICIT_BATCH) as network, \
trt.OnnxParser(network, TRT_LOGGER) as parser:
builder.max_workspace_size = 1 << 30 # Your workspace size
builder.max_batch_size = max_batch_size
# pdb.set_trace()
builder.fp16_mode = fp16_mode # Default: False
builder.int8_mode = int8_mode # Default: False
if int8_mode:
# To be updated
raise NotImplementedError
# Parse model file
if not os.path.exists(onnx_file_path):
quit('ONNX file {} not found'.format(onnx_file_path))
print('Loading ONNX file from path {}...'.format(onnx_file_path))
with open(onnx_file_path, 'rb') as model:
print('Beginning ONNX file parsing')
if not parser.parse(model.read()):
for error in range(parser.num_errors):
print(parser.get_error(error))
print('=========== Parsing failed ===========')
else:
print('Completed parsing of ONNX file')
print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
engine = builder.build_cuda_engine(network)
print('Completed creating Engine')
if save_engine:
with open(engine_file_path, 'wb') as f:
f.write(engine.serialize())
return engine
if os.path.exists(engine_file_path):
# If a serialized engine exists, load it instead of building a new one.
print('Reading engine from file {}'.format(engine_file_path))
with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
else:
return build_engine(max_batch_size, save_engine)
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
# Transfer data from CPU to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
def postprocess_the_outputs(h_outputs, shape_of_output):
h_outputs = h_outputs.reshape(*shape_of_output)
return h_outputs
img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)
# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # host/device buffer pairs plus the device bindings
# Do inference
shape_of_output = (max_batch_size, 1000)
# Load data to the buffer
inputs[0].host = img_np_nchw.reshape(-1)
# inputs[1].host = ... for multiple inputs
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')
# Replace model with your own model; here torchvision's resnet50 is used as a stand-in, downloaded automatically (requires network access)
model = torchvision.models.resnet50(pretrained=True).cuda()
resnet_model = model.eval()
input_for_torch = torch.from_numpy(img_np_nchw).cuda()
t3 = time.time()
feat_2 = resnet_model(input_for_torch)
t4 = time.time()
feat_2 = feat_2.cpu().data.numpy()
print('Pytorch ok!')
mse = np.mean((feat - feat_2)**2)
print('Inference time with the TensorRT engine: {}'.format(t2-t1))
print('Inference time with the PyTorch model: {}'.format(t4-t3))
print('MSE Error = {}'.format(mse))
print('All completed!')
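One more note on the timing above: measuring a single forward pass is only a rough comparison, since the first CUDA call also pays initialization costs. A fairer measurement would warm up first and average over several runs, e.g. for the TensorRT side (a sketch):
# warm-up runs, then average the latency over n_runs
for _ in range(5):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
n_runs = 20
t_start = time.time()
for _ in range(n_runs):
    do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print('Mean TensorRT latency: {:.4f} s'.format((time.time() - t_start) / n_runs))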
Errors I hit while running the above, and how to fix them.
Error:
In node -1 (importModel): INVALID_VALUE: Assertion failed: !_importer_ctx.network()->hasImplicitBatchDimension() && 'This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH NetworkDefinitionCreationFlag.'
Solution:
def build_engine(max_batch_size, save_engine):
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(EXPLICIT_BATCH) as network, \
trt.OnnxParser(network, TRT_LOGGER) as parser:
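A related point: execute_async(batch_size=...) belongs to TensorRT's implicit-batch API. Once the network is built with the EXPLICIT_BATCH flag, my understanding is that the inference call in do_inference should switch to execute_async_v2, roughly like this:
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # same host/device copies as before, but using the explicit-batch execution API
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]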
Error:
Traceback (most recent call last):
line 126, in <listcomp>
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
Solution:
def get_img_np_nchw(filename):
image = cv2.imread(filename)
image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_cv = cv2.resize(image_cv, (1920, 1080))
The input image must be resized to the model's expected input size (544×544 here, matching the dummy input used for torch.onnx.export), not an arbitrary resolution. Change it to:
def get_img_np_nchw(filename):
image = cv2.imread(filename)
image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_cv = cv2.resize(image_cv, (544,544))
Error:
line 139, in postprocess_the_outputs
h_outputs = h_outputs.reshape(*shape_of_output)
ValueError: cannot reshape array of size 5780 into shape (1,1000)
Solution:
# shape_of_output = (max_batch_size, 1000)
# change this to the actual output size of your own model
shape_of_output = (1,20,17,17)
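Rather than hard-coding the output size, it can also be read from the engine itself, reusing the binding loop from allocate_buffers (a sketch):
# query the output shape directly from the engine bindings
for binding in engine:
    if not engine.binding_is_input(binding):
        shape_of_output = tuple(engine.get_binding_shape(binding))
        print('output binding {}: {}'.format(binding, shape_of_output))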