iree: ResNet18 Wrong Results in FP16 Mode on GPU RTX 4000
What happened?
A PyTorch FP16 ResNet18 model compiled with IREE returns wrong results on an RTX 4000 GPU.
Steps to reproduce your issue
- Create a file with the following content:
import torch
import os
import io
import numpy as np
import time
import torch_mlir
import iree.compiler as ireec
import iree.runtime as ireert
# Load ResNet18 through torch.hub with pretrained=True and put it in
# evaluation mode. `eval()` returns the module itself, so it can be chained.
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'resnet18',
    pretrained=True,
).eval()
# Download an example image from the pytorch/hub repository.
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
# `urllib.URLopener` does not exist on Python 3, so the previous
# try/bare-except always fell through to this call while also hiding any
# unrelated error raised in the first branch. Call the Python 3 API directly
# so that genuine download failures surface.
urllib.request.urlretrieve(url, filename)
# sample execution (requires torchvision)
from PIL import Image
from torchvision import transforms
# Per-channel statistics handed to Normalize below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Resize, center-crop, convert to a tensor, then normalize each channel.
preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
)

input_image = Image.open(filename)
input_tensor = preprocess(input_image)
# create a mini-batch as expected by the model (adds a leading axis of size 1)
input_batch = input_tensor.unsqueeze(0)
# Move the input and model to the GPU and convert both to FP16.
device = "cuda"
input_batch = input_batch.to(device).half()
model.to(device)
model.half()

# Both forward passes run under no_grad: this is inference only, and the
# reference log prints the result tensor without a grad_fn.
with torch.no_grad():
    # warmup
    output = model(input_batch)
    # CUDA kernels are launched asynchronously. Synchronize before starting
    # and before stopping the timer so that it brackets exactly the second
    # forward pass, and nothing from the warmup or from printing.
    torch.cuda.synchronize()
    start = time.perf_counter()
    output = model(input_batch)
    torch.cuda.synchronize()
    torch_time = time.perf_counter() - start

print("torch result", output[0, 0])
print("torch time", torch_time)
# IREE
# Convert the FP16 model to linalg-on-tensors MLIR via torch_mlir, passing
# the sample batch as the example input (use_tracing=True).
mlir = torch_mlir.compile(
    model,
    input_batch,
    output_type="linalg-on-tensors",
    use_tracing=True,
)

# Serialize the module to MLIR bytecode in an in-memory buffer.
bytecode_stream = io.BytesIO()
mlir.operation.write_bytecode(bytecode_stream)

# Compile the bytecode with IREE for the same backend name used for torch.
iree_input_type = "tm_tensor"
compile_flags = [
    "--iree-hal-cuda-llvm-target-arch=sm_75",
    "--iree-flow-dump-dispatch-graph",
    "--iree-flow-dump-dispatch-graph-output-file=foo.dot",
]
flatbuffer = ireec.compile_str(
    bytecode_stream.getvalue(),
    target_backends=[device],
    input_type=iree_input_type,
    extra_args=compile_flags,
)
# Set up an IREE runtime context on the target device and register the
# compiled module with it.
iree_device = ireert.get_device(device)
ctx = ireert.SystemContext(config=ireert.Config(device=iree_device))
ctx.add_vm_module(ireert.VmModule.from_flatbuffer(ctx.instance, flatbuffer))
invoker = ctx.modules.module

# Hand the same FP16 batch to IREE: copy it to host memory as a NumPy array,
# then wrap it as an IREE device array.
host_input = input_batch.cpu().numpy()
iree_input_batch = ireert.asdevicearray(iree_device, host_input)

# warmup
result = invoker.forward(iree_input_batch)

# Timed run; the result is converted to a NumPy array before printing.
start = time.time()
result = invoker.forward(iree_input_batch)
numpy_result = np.asarray(result)
print("iree result", numpy_result[0][0])
print("iree time", time.time() - start)
- Execute it and check the log. The first output value printed for PyTorch and for IREE should match, but they differ:
torch result tensor(0.0119, device='cuda:0', dtype=torch.float16)
torch time 0.029006481170654297
iree result 0.0005054
iree time 0.016841650009155273
What component(s) does this issue relate to?
No response
Version information
6a46afd4f82715979b54c57fb41a8505466cd68f
Additional context
No response
About this issue
- Original URL
- State: open
- Created a year ago
- Comments: 23 (15 by maintainers)
It’s too early to know if my team needs to be involved. Let me try to get the IR like @jpienaar said to see if it helps narrow down what needs to happen here.