transformers: device_map='auto' doesn't use MPS backend on Apple M2
With the following program:
```python
import os
import time
import readline
import textwrap

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["HF_ENDPOINT"] = "https://huggingface.co"
os.environ["ACCELERATE_USE_MPS_DEVICE"] = "True"

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch, Accelerator


def main():
    print('Pytorch version', torch.__version__)
    if torch.backends.mps.is_available():
        active_device = torch.device('mps')
    elif torch.cuda.is_available():
        active_device = torch.device('cuda', 0)
    else:
        active_device = torch.device('cpu')

    accelerator = Accelerator()
    print('Accelerator device: ', accelerator.device)

    checkpoint = "bigscience/bloom"
    tm_start = time.time()
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        offload_folder="offload",
        offload_state_dict=True,
    )
    tm_end = time.time()
    print(f'Loaded in {tm_end - tm_start} seconds.')

    while True:
        prompt = input('Request to LLM: ')
        tm_start = time.time()
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(active_device)
        tm_end = time.time()
        print(f'Encoded in {tm_end - tm_start} seconds.')
        tm_start = time.time()
        outputs = model.generate(
            inputs, max_new_tokens=2048, pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2)
        tm_end = time.time()
        print(f'Generated in {tm_end - tm_start} seconds.')
        tm_start = time.time()
        response = tokenizer.decode(outputs[0])
        tm_end = time.time()
        print(f'Decoded in {tm_end - tm_start} seconds.')
        print("\n".join(textwrap.wrap(response, width=120)))


if __name__ == '__main__':
    main()
```
the CPU backend is used by transformers/accelerate, even though the program prints `Accelerator device: mps`.

I know this because generation is slow (below NVMe bandwidth) and the following warning is printed:
```
/Users/serge/PycharmProjects/macLLM/venv/lib/python3.9/site-packages/transformers/generation/utils.py:1359:
UserWarning: You are calling .generate() with the `input_ids` being on a device type different than your model's device.
`input_ids` is on mps, whereas the model is on cpu. You may experience unexpected behaviors or slower generation.
Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('cpu')
before running `.generate()`.
  warnings.warn(
```
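For the record, one way to confirm the placement directly is to inspect the dispatch map and the parameter devices. This is a minimal sketch added for illustration, where `model` stands for the object returned by `from_pretrained` above; `hf_device_map` is the attribute transformers sets when a `device_map` is used:

```python
import torch

# Where the first parameter actually lives -- reported as cpu in this setup.
print(next(model.parameters()).device)

# Per-module placement that accelerate computed for device_map="auto".
print(getattr(model, "hf_device_map", None))

# MPS is available and built, yet it does not appear in the map above.
print(torch.backends.mps.is_available(), torch.backends.mps.is_built())
```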
Environment: transformers v4.26.1, accelerate v0.17.0, PyTorch v1.13.1, macOS 13.2.1 (22D68), Python 3.9.6
About this issue
- State: closed
- Created a year ago
- Comments: 15 (1 by maintainers)
Hi @moradisina, since version v0.20.0 of accelerate, the mps device is supported with `device_map="auto"`. It should automatically map your model to the mps device if you are using an M2 chip. You can also do it manually by setting `device_map={"": "mps"}`.

Yes, you need to load it without `device_map="auto"`.

This is solved in the latest version of Accelerate (cc @SunMarc).
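For completeness, a minimal sketch of what that fix looks like after upgrading to accelerate >= 0.20.0. The smaller `bigscience/bloom-560m` checkpoint and the `float16` dtype are illustrative choices, not from the original report:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "bigscience/bloom-560m"  # illustrative smaller checkpoint

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# With accelerate >= 0.20.0, device_map="auto" can resolve to mps on Apple silicon;
# device_map={"": "mps"} pins the whole model to the MPS device explicitly.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map={"": "mps"},
    torch_dtype=torch.float16,
)

inputs = tokenizer("Request to LLM: hello", return_tensors="pt").to("mps")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```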
Hi, I am on an M2 Max (macOS) with a 12-core CPU and a 38-core GPU. I am having issues with every modification of this code snippet. Would you please tell me how I can correct it?
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b-instruct", trust_remote_code=True)
model = model.to('mps')
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # device = torch.device('mps'),
    # device_map="auto",
)
```
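Not an official answer, but a hedged sketch of how that snippet could be corrected, assuming enough unified memory for the checkpoint. The main fixes are loading the tokenizer from the checkpoint name rather than from the model object, and giving the pipeline an explicit device instead of calling `.to('mps')` yourself:

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "tiiuae/falcon-40b-instruct"

# Load the tokenizer from the checkpoint name, not from the model object.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# float16 here, since bfloat16 support on MPS depends on the PyTorch version.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=torch.device("mps"),  # moves the already-loaded model to MPS
)

print(pipeline("Write a haiku about unified memory.", max_new_tokens=40)[0]["generated_text"])
```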