OpenLLM: bug: 40GB is not enough for llama-2-7b
### Describe the bug
Starting a server with an int4-quantized model on a 40 GB GPU throws OOM errors. If I am not mistaken, int4 quantization should result in a ~3.5 GB model, and even the original ~28 GB model should easily fit in memory.
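For reference, a quick back-of-envelope check of the numbers above (a sketch; the ~6.7B parameter count for Llama-2-7b is an assumption, not taken from the logs):

```python
# Rough weight-footprint estimate for Llama-2-7b at different precisions.
# Assumes ~6.7e9 parameters; int4 stores two weights per byte.
n_params = 6.7e9
for precision, bytes_per_param in [("fp32", 4), ("fp16", 2), ("int4", 0.5)]:
    gib = n_params * bytes_per_param / 1024**3
    print(f"{precision}: ~{gib:.1f} GiB of weights")
# fp32: ~25 GiB, fp16: ~12.5 GiB, int4: ~3.1 GiB -- all well under 40 GB,
# before activations and KV cache.
```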
### To reproduce
- `openllm start llama --model-id meta-llama/Llama-2-7b-hf --quantize int4`
- `curl -XPOST localhost:3000/v1/generate -H 'content-type: application/json' -d '{"prompt": "hello there, my name is "}'`
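For convenience, the same request issued from Python (this only mirrors the curl above; nothing beyond the `/v1/generate` endpoint on port 3000 is assumed):

```python
# Mirror of the curl reproduce step: POST a prompt to the running OpenLLM
# server's /v1/generate endpoint and print the response.
import requests

resp = requests.post(
    "http://localhost:3000/v1/generate",
    json={"prompt": "hello there, my name is "},
    timeout=300,
)
print(resp.status_code)
print(resp.text)
```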
### Logs
2023-08-28T22:20:12+0000 [ERROR] [runner:llm-llama-runner:1] Exception in ASGI application
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/h11_impl.py", line 408, in run_asgi
result = await app( # type: ignore[func-returns-value]
File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 84, in __call__
return await self.app(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 122, in __call__
await self.middleware_stack(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 184, in __call__
raise exc
File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 162, in __call__
await self.app(scope, receive, _send)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/http/traffic.py", line 26, in __call__
await self.app(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/opentelemetry/instrumentation/asgi/__init__.py", line 580, in __call__
await self.app(scope, otel_receive, otel_send)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/http/instruments.py", line 252, in __call__
await self.app(scope, receive, wrapped_send)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/http/access.py", line 126, in __call__
await self.app(scope, receive, wrapped_send)
File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/exceptions.py", line 62, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 57, in wrapped_app
raise exc
File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 46, in wrapped_app
await app(scope, receive, sender)
File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 727, in __call__
await route.handle(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 285, in handle
await self.app(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 74, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 57, in wrapped_app
raise exc
File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 46, in wrapped_app
await app(scope, receive, sender)
File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 69, in app
response = await func(request)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/runner_app.py", line 291, in _request_handler
payload = await infer(params)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/marshal/dispatcher.py", line 182, in _func
raise r
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/marshal/dispatcher.py", line 377, in outbound_call
outputs = await self.callback(
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/runner_app.py", line 271, in infer_single
ret = await runner_method.async_run(*params.args, **params.kwargs)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/runner/runner.py", line 55, in async_run
return await self.runner._runner_handle.async_run_method(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/runner/runner_handle/local.py", line 62, in async_run_method
return await anyio.to_thread.run_sync(
File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/runner/runnable.py", line 143, in method
return self.func(obj, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/openllm/_llm.py", line 1168, in generate
return self.generate(prompt, **attrs)
File "/usr/local/lib/python3.10/dist-packages/openllm/_llm.py", line 986, in generate
for it in self.generate_iterator(prompt, **attrs):
File "/usr/local/lib/python3.10/dist-packages/openllm/_llm.py", line 1024, in generate_iterator
out = self.model(torch.as_tensor([input_ids], device=self.device), use_cache=True)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 809, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 697, in forward
layer_outputs = decoder_layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 426, in forward
hidden_states = self.mlp(hidden_states)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 220, in forward
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1502, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/bitsandbytes/nn/modules.py", line 248, in forward
out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
File "/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py", line 579, in matmul_4bit
return MatMul4Bit.apply(A, B, out, bias, quant_state)
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py", line 516, in forward
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, state).to(A.dtype).t(), bias)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacty of 39.39 GiB of which 85.12 MiB is free. Process 17224 has 39.31 GiB memory in use. Of the allocated memory 36.93 GiB is allocated by PyTorch, and 1.79 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
2023-08-28T22:20:12+0000 [ERROR] [api_server:llm-llama-service:6] Exception on /v1/generate [POST] (trace=64f1f2855c26af6f1d6212b60faeab5d,span=2f1792bb2f5b298e,sampled=1,service.name=llm-llama-service)
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/server/http_app.py", line 341, in api_func
output = await api.func(*args)
File "/usr/local/lib/python3.10/dist-packages/openllm/_service.py", line 36, in generate_v1
responses = await runner.generate.async_run(qa_inputs.prompt, **{'adapter_name': qa_inputs.adapter_name, **config})
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/runner/runner.py", line 55, in async_run
return await self.runner._runner_handle.async_run_method(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/bentoml/_internal/runner/runner_handle/remote.py", line 242, in async_run_method
raise RemoteException(
bentoml.exceptions.RemoteException: An unexpected exception occurred in remote runner llm-llama-runner: [500] Internal Server Error
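The ~37 GiB that PyTorch reports as allocated is far more than a 4-bit 7B model should need, so one diagnostic is to load the same checkpoint directly with transformers/bitsandbytes and tally parameter bytes by dtype. This is a sketch under assumptions (recent transformers, bitsandbytes, and accelerate; HF access to meta-llama/Llama-2-7b-hf), not the OpenLLM code path:

```python
# Load Llama-2-7b in 4-bit outside OpenLLM and report how much GPU memory the
# weights actually take, broken down by parameter dtype. If most bytes show up
# as fp16/fp32 rather than packed uint8, the 4-bit quantization did not apply.
from collections import Counter

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)

bytes_by_dtype = Counter()
for p in model.parameters():
    bytes_by_dtype[str(p.dtype)] += p.numel() * p.element_size()
for dtype, nbytes in sorted(bytes_by_dtype.items()):
    print(f"{dtype}: {nbytes / 1024**3:.2f} GiB")

print(f"cuda allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
print(f"cuda reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GiB")
```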
### Environment
"openllm[llama,gptq]==0.2.27"
protobuf==3.20.3
bitsandbytes==0.41.0
### System information (Optional)
_No response_
Yes, so this is currently a bug that has also been reported elsewhere; I'm taking a look atm.