axolotl: The error maze of deepspeed + qlora + falcon

I’ve been trying to get the combination deepspeed + qlora + falcon to work, but for reasons I can’t pin down I’m stuck in an error maze.

Setup

  • Docker image: winglian/axolotl-runpod:main-py3.9-cu118-2.0.0
  • Entry script: bash -c "curl -H 'Cache-Control: no-cache' https://raw.githubusercontent.com/utensil/llm-playground/main/scripts/entry/prepare_ax.sh -sSf | bash"
  • ds_config.json (final version, modified from the default one in axolotl):
{
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 0,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 0,
        "stage3_max_reuse_distance": 0,
        "stage3_gather_16bit_weights_on_model_save": true
    },
   "bf16": {
        "enabled": "auto"
    },
    "fp16": {
        "enabled": "auto",
        "auto_cast": false,
        "loss_scale": 0,
        "initial_scale_power": 32,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
          "lr": "auto",
          "betas": "auto",
          "eps": "auto",
          "weight_decay": "auto"
        }
    },
    "scheduler": {
      "type": "WarmupDecayLR",
      "params": {
        "total_num_steps": "auto",
        "warmup_min_lr": "auto",
        "warmup_max_lr": "auto",
        "warmup_num_steps": "auto"
       }
    },
    "gradient_accumulation_steps": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
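The "auto" values above are deliberate: the HF Trainer fills them in from its TrainingArguments at launch, which is what resolves most of the mismatch errors listed below. A minimal sketch to sanity-check the file before a long run (the ds_config.json path is an assumption):

# Sanity-check ds_config.json before launching (path is hypothetical).
# The "auto" values are resolved later by the HF Trainer, so only the
# structure is verified here, not the final values.
import json

with open("ds_config.json") as f:
    cfg = json.load(f)

assert cfg["zero_optimization"]["stage"] == 3, "expected ZeRO stage 3"
# Leaving both precision blocks on "auto" lets the Trainer enable exactly
# one of bf16/fp16 based on the axolotl config (bf16: true below).
assert cfg["bf16"]["enabled"] == "auto"
assert cfg["fp16"]["enabled"] == "auto"
print("config OK: ZeRO stage", cfg["zero_optimization"]["stage"])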
  • examples/falcon/config-40b-qlora.yml
# 1b: tiiuae/falcon-rw-1b
# 7b: tiiuae/falcon-7b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-40b
base_model_config: tiiuae/falcon-40b
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false

push_dataset_to_hub: utensil
hf_use_auth_token: true

datasets:
  - path: QingyiSi/Alpaca-CoT
    data_files:
      - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
    type: "alpaca:chat"

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project: falcon-qlora
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: /content/axolotl-trained/falcon-qlora-40b-gsm8k/

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 65b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 1
num_epochs: 3
# Optimizer for QLoRA
# optimizer: paged_adamw_32bit
torchdistx_path:
# lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 65b
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10
eval_steps: 5
save_steps: 10
debug:
deepspeed:
weight_decay: 0.01
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: ">>ABSTRACT<<"
  eos_token: "<|endoftext|>"
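These special tokens appear to come from the Falcon tokenizer’s own vocabulary rather than being newly added. A quick hedged check that they resolve to existing token ids (a sketch, assuming the same tiiuae/falcon-40b checkpoint):

# Quick check (sketch) that the special tokens above already exist in the
# Falcon tokenizer's vocabulary, so no embedding resize should be needed.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/falcon-40b", trust_remote_code=True)
for token in ["<|endoftext|>", ">>ABSTRACT<<"]:
    print(repr(token), "->", tok.convert_tokens_to_ids(token))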
  • Environment reported by ds_report
Setting ds_accelerator to cuda (auto detect)
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
async_io ............... [YES] ...... [OKAY]
cpu_adagrad ............ [YES] ...... [OKAY]
cpu_adam ............... [YES] ...... [OKAY]
fused_adam ............. [YES] ...... [OKAY]
fused_lamb ............. [YES] ...... [OKAY]
quantizer .............. [YES] ...... [OKAY]
random_ltd ............. [YES] ...... [OKAY]
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
 [WARNING]  using untested triton version (2.0.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [YES] ...... [OKAY]
transformer ............ [YES] ...... [OKAY]
stochastic_transformer . [YES] ...... [OKAY]
transformer_inference .. [YES] ...... [OKAY]
utils .................. [YES] ...... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/torch']
torch version .................... 2.0.1+cu118
deepspeed install path ........... ['/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/deepspeed']
deepspeed info ................... 0.9.3+52907a66, 52907a66, master
torch cuda version ............... 11.8
torch hip version ................ None
nvcc version ..................... 11.8
deepspeed wheel compiled w. ...... torch 2.0, cuda 11.8

Errors

  • Error: RuntimeError: CUDA version mismatch! DeepSpeed ops were compiled and installed with a different version than what is being used at runtime. Please re-install DeepSpeed or switch torch versions. Install CUDA version=11.8, Runtime CUDA version=11.7, together with AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam' (#138)
    Cause: torch 2.0.1 got reinstalled for CUDA 11.7 for an unknown reason ❓
    Solution: pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118

  • Error: RuntimeError: expected there to be only one unique element in <generator object Init._convert_to_deepspeed_param.<locals>.all_gather_coalesced.<locals>.<genexpr> at 0x7f0f04211eb0>
    Cause: training starts, then this error is raised during the forward pass for an unknown reason ❓

  • Error: ValueError: Found optimizer configured in the DeepSpeed config, but no scheduler. Please configure a scheduler in the DeepSpeed config.
    Cause: an optimizer config needs a matching scheduler config, although other setups work fine without either
    Solution: add the optimizer and scheduler configs as in the final config above

  • Error: many mismatch errors
    Cause: the HF and DeepSpeed configs disagree
    Solution: the axolotl config must not set optimizer and lr_scheduler, many DeepSpeed values need to be set to "auto", and missing DeepSpeed config keys must be added; see the final config above

  • Error: compile errors when reinstalling deepspeed with TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 pip install deepspeed --global-option="build_ext" --global-option="-j8" # --global-option="bdist_wheel"
    Cause: maybe the image is not a complete environment for compilation ❓

  • Error: ValueError: Can't find a valid checkpoint at /content/axolotl-trained/falcon-qlora-40b-gsm8k/checkpoint-50
    Cause: with deepspeed disabled and only accelerate handling multiple GPUs, training is normal but resuming from a checkpoint fails (tried each of the latest 3 checkpoints) ❓

  • Error: ValueError: ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config
    Cause: with deepspeed on a single A100, training is normal but eval fails because is_zero3() returns false during eval ❓
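For the CUDA mismatch in the first row, the runtime/compile split is easy to confirm from inside the container before kicking off a run; a minimal check using only standard torch/deepspeed introspection, nothing axolotl-specific:

# Confirm the torch wheel in the image is the cu118 build DeepSpeed was
# compiled against; seeing 11.7 here reproduces the mismatch RuntimeError.
import torch
import deepspeed

print("torch:", torch.__version__)                # expect 2.0.1+cu118
print("torch CUDA runtime:", torch.version.cuda)  # expect 11.8, not 11.7
print("deepspeed:", deepspeed.__version__)        # 0.9.3 in this image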

About this issue

  • State: open
  • Created a year ago
  • Reactions: 2
  • Comments: 15 (9 by maintainers)

Most upvoted comments

The title should be changed to include llama models; I get the RuntimeError: expected there to be only one unique element in <generator object Init._convert_to_deepspeed_param.<locals>.all_gather_coalesced.<locals>.<genexpr> at 0x7f0f04211eb0> error on startup with 4x 4090s and a ZeRO-3 config.

Hi @utensil, I’m working with DeepSpeed and am having similar issues. Although my process is still broken, I’ll share my current config in case it helps. For testing, I have been able to start the training process on 1 node with 3x A6000s under ZeRO stage 2. Here is my Makefile target:

WORKSPACE_HOST_PATH:=...
MODELS_HOST_PATH:=...
DATA_HOST_PATH:=...
WORK_HOST_PATH:=...

train:
	docker run --gpus='all' -it --rm \
		--volume=$(WORKSPACE_HOST_PATH):/workspace \
		--volume=$(MODELS_HOST_PATH):/models \
		--volume=$(DATA_HOST_PATH):/data \
		--volume=$(WORK_HOST_PATH):/work \
		--volume=$(WORKSPACE_HOST_PATH)/extern/axolotl:/opt/axolotl \
		--env-file=$(CURDIR)/.env \
		--entrypoint=accelerate \
		quay.io/theobjectivedad/axolotl-main:latest \
		launch \
			--config_file /work/accelerate/basic.yaml \
			/opt/axolotl/scripts/finetune.py \
				/work/atheos/config.yaml

My accelerate config:

compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_config_file: /SET_IN_AXOLOTL_CONFIG.yaml
  zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: "no"
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

Deepspeed config:

{
    "optimizer": {
        "type": "auto"
    },
    "scheduler": {
        "type": "auto"
    },
    "activation_checkpointing": {
        "partition_activations": "auto"
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "auto"
        },
        "offload_param": {
            "device": "auto"
        },
        "allgather_bucket_size": "auto",
        "allgather_bucket_dtype": "auto",
        "dp_bucket_size": "auto",
        "overlap_comm": "auto",
        "contiguous_gradients": "auto",
        "sub_group_size": "auto",
        "reduce_bucket_size": "auto"
    },
    "gradient_clipping": "auto",
    "fp16": {
        "enabled": "auto"
    },
    "bf16": {
        "enabled": "auto"
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
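Since nearly everything in this file is "auto", a small helper (hypothetical; the path matches the deepspeed: entry in the axolotl config below) can list which values are still pinned and could therefore disagree with the HF TrainingArguments:

# Hypothetical helper: list which DeepSpeed config values are pinned versus
# left on "auto"; pinned values that disagree with the HF TrainingArguments
# are what surface as the "many mismatch errors" above.
import json

def walk(node, prefix=""):
    for key, value in node.items():
        path = prefix + key
        if isinstance(value, dict):
            walk(value, path + ".")
        else:
            status = "auto" if value == "auto" else "pinned"
            print(f"{status}\t{path} = {value}")

with open("/work/accelerate/ds_stage2_auto.json") as f:
    walk(json.load(f))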

Axolotl config:

###############################################################################
# Model
###############################################################################
base_model: /models/llama-7b-hf
base_model_config: /models/llama-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer

output_dir: /work/atheos/output1

sequence_len: 2048
max_packed_sequence_len: 1024

tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
  pad_token: "<unk>"
special_tokens:

###############################################################################
# Precision & Model loading
###############################################################################

bf16: full
bfloat16: true

fp16: false
float16: false

tf32: true

load_in_8bit: false
load_in_4bit: false

lora_model_dir:

###############################################################################
# Dataset
###############################################################################
datasets:
  - path: /data/GPTeacher/Instruct
    type: gpteacher

dataset_prepared_path: /work/last_run_prepared
val_set_size: 0.02

###############################################################################
# Training
###############################################################################

deepspeed: /work/accelerate/ds_stage2_auto.json

adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
lora_fan_in_fan_out: false

# WanDB configuration
wandb_project: smoketest
wandb_watch:
wandb_run_id:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 8
optimizer:
torchdistx_path:

lr_scheduler: cosine
learning_rate: 5.0e-5
train_on_inputs: false
group_by_length: false

early_stopping_patience: 3

auto_resume_from_checkpoints: true
resume_from_checkpoint:

logging_steps: 500
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
eval_steps: 500
save_steps: 500
debug: false

weight_decay: 0.1
fsdp:
fsdp_config: