Pointcept: GPU runs out of memory when training on a large-scale dataset
We configured the Toronto-3D dataset based on the S3DIS config, but training still reports that the GPU is out of memory, even though batch_size has already been reduced to the minimum.
```
RuntimeError: CUDA out of memory. Tried to allocate 16.32 GiB (GPU 0; 47.35 GiB total capacity; 37.84 GiB already allocated; 8.27 GiB free; 37.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
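As the message itself suggests, when reserved memory far exceeds allocated memory the allocator may be fragmenting, and capping the allocator split size is worth trying before touching the model. A minimal sketch (the 128 MiB value is an illustrative choice, not one from this thread):

```python
import os

# Must be set before the CUDA allocator is initialized, i.e. before the
# training script makes its first CUDA call. 128 is an illustrative value.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # imported after setting the env var on purpose

print(torch.cuda.is_available())
```

Equivalently, run `export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128` in the shell before launching training.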
The config file is as follows:

```python
_base_ = ["../_base_/default_runtime.py"]

# misc custom setting
batch_size = 1  # 12  # bs: total bs in all gpus
mix_prob = 0.8
empty_cache = False
enable_amp = True

# model settings
model = dict(
    type="DefaultSegmentor",
    backbone=dict(
        type="PT-v2m2",
        in_channels=6,
        num_classes=8,
        patch_embed_depth=2,
        patch_embed_channels=48,
        patch_embed_groups=6,
        patch_embed_neighbours=16,
        enc_depths=(2, 6, 2),
        enc_channels=(96, 192, 384),
        enc_groups=(12, 24, 48),
        enc_neighbours=(16, 16, 16),
        dec_depths=(1, 1, 1),
        dec_channels=(48, 96, 192),
        dec_groups=(6, 12, 24),
        dec_neighbours=(16, 16, 16),
        grid_sizes=(0.1, 0.2, 0.4),
        attn_qkv_bias=True,
        pe_multiplier=False,
        pe_bias=True,
        attn_drop_rate=0.0,
        drop_path_rate=0.3,
        enable_checkpoint=False,
        unpool_backend="interp",  # map / interp
    ),
    criteria=[dict(type="CrossEntropyLoss", loss_weight=1.0, ignore_index=-1)],
)

# scheduler settings
epoch = 3000
optimizer = dict(type="AdamW", lr=0.006, weight_decay=0.05)
scheduler = dict(type="MultiStepLR", milestones=[0.6, 0.8], gamma=0.1)

# dataset settings
dataset_type = "TorontoDataset"
data_root = "/root/autodl-tmp/toronto1"

data = dict(
    num_classes=8,
    ignore_index=-1,
    names=[
        "road",
        "road_marking",
        "natural",
        "building",
        "utility_line",
        "pole",
        "car",
        "fence",
    ],
    train=dict(
        type=dataset_type,
        split=("L001", "L002", "L003", "L004"),
        data_root=data_root,
        transform=[
            dict(type="CenterShift", apply_z=True),
            # dict(type="RandomDropout", dropout_ratio=0.2, dropout_application_ratio=0.2),
            # dict(type="RandomRotateTargetAngle", angle=(1/2, 1, 3/2), center=[0, 0, 0], axis="z", p=0.75),
            # dict(type="RandomRotate", angle=[-1, 1], axis="z", center=[0, 0, 0], p=0.5),
            # dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="x", p=0.5),
            # dict(type="RandomRotate", angle=[-1 / 64, 1 / 64], axis="y", p=0.5),
            dict(type="RandomScale", scale=[0.9, 1.1]),
            # dict(type="RandomShift", shift=[0.2, 0.2, 0.2]),
            dict(type="RandomFlip", p=0.5),
            dict(type="RandomJitter", sigma=0.005, clip=0.02),
            # dict(type="ElasticDistortion", distortion_params=[[0.2, 0.4], [0.8, 1.6]]),
            dict(type="ChromaticAutoContrast", p=0.2, blend_factor=None),
            dict(type="ChromaticTranslation", p=0.95, ratio=0.05),
            dict(type="ChromaticJitter", p=0.95, std=0.05),
            # dict(type="HueSaturationTranslation", hue_max=0.2, saturation_max=0.2),
            # dict(type="RandomColorDrop", p=0.2, color_augment=0.0),
            dict(
                type="GridSample",
                grid_size=0.04,  # 0.04
                hash_type="fnv",
                mode="train",
                keys=("coord", "color", "segment"),
                return_discrete_coord=True,
            ),
            dict(type="SphereCrop", point_max=80000, mode="random"),
            dict(type="CenterShift", apply_z=False),
            dict(type="NormalizeColor"),
            # dict(type="ShufflePoint"),
            dict(type="ToTensor"),
            dict(
                type="Collect",
                keys=("coord", "discrete_coord", "segment"),
                feat_keys=["coord", "color"],
            ),
        ],
        test_mode=False,
    ),
    val=dict(
        type=dataset_type,
        split="L002",
        data_root=data_root,
        transform=[
            dict(type="CenterShift", apply_z=True),
            dict(
                type="Copy",
                keys_dict={"coord": "origin_coord", "segment": "origin_segment"},
            ),
            dict(
                type="GridSample",
                grid_size=0.04,  # 0.04
                hash_type="fnv",
                mode="train",
                keys=("coord", "color", "segment"),
                return_discrete_coord=True,
            ),
            dict(type="CenterShift", apply_z=False),
            dict(type="NormalizeColor"),
            dict(type="ToTensor"),
            dict(
                type="Collect",
                keys=("coord", "discrete_coord", "segment"),
                offset_keys_dict=dict(offset="coord"),
                feat_keys=["coord", "color"],
            ),
        ],
        test_mode=False,
    ),
    test=dict(
        type=dataset_type,
        split="L002",
        data_root=data_root,
        transform=[dict(type="CenterShift", apply_z=True), dict(type="NormalizeColor")],
        test_mode=True,
        test_cfg=dict(
            voxelize=dict(
                type="GridSample",
                grid_size=0.04,  # 0.04
                hash_type="fnv",
                mode="test",
                keys=("coord", "color"),
                return_discrete_coord=True,
            ),
            crop=None,
            post_transform=[
                dict(type="CenterShift", apply_z=False),
                dict(type="ToTensor"),
                dict(
                    type="Collect",
                    keys=("coord", "discrete_coord", "index"),
                    feat_keys=("coord", "color"),
                ),
            ],
            aug_transform=[
                [dict(type="RandomScale", scale=[0.9, 0.9])],
                [dict(type="RandomScale", scale=[0.95, 0.95])],
                [dict(type="RandomScale", scale=[1, 1])],
                [dict(type="RandomScale", scale=[1.05, 1.05])],
                [dict(type="RandomScale", scale=[1.1, 1.1])],
                [dict(type="RandomScale", scale=[0.9, 0.9]), dict(type="RandomFlip", p=1)],
                [dict(type="RandomScale", scale=[0.95, 0.95]), dict(type="RandomFlip", p=1)],
                [dict(type="RandomScale", scale=[1, 1]), dict(type="RandomFlip", p=1)],
                [dict(type="RandomScale", scale=[1.05, 1.05]), dict(type="RandomFlip", p=1)],
                [dict(type="RandomScale", scale=[1.1, 1.1]), dict(type="RandomFlip", p=1)],
            ],
        ),
    ),
)
```

What parameters should we adjust?
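For context, the knobs in this config that most directly control peak GPU memory are batch_size (already 1), SphereCrop's point_max, GridSample's grid_size, enable_checkpoint, and enable_amp. A hedged sketch of the kind of overrides typically tried; every value below is illustrative, not a tuned recommendation for Toronto-3D:

```python
# Illustrative memory-saving overrides for the config above.

# Fewer points per training sample: SphereCrop bounds the input size, which
# bounds attention/activation memory in the PT-v2 backbone.
sphere_crop = dict(type="SphereCrop", point_max=40000, mode="random")  # was 80000

# A coarser voxel grid means fewer points survive GridSample.
grid_size = 0.08  # was 0.04

# Activation checkpointing in the backbone trades extra compute for memory.
enable_checkpoint = True  # was False

# Mixed precision roughly halves activation memory (already enabled above).
enable_amp = True
```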
About this issue
- State: open
- Created 9 months ago
- Comments: 26 (8 by maintainers)
Hi, the validation results during training cannot be used as the final result, as they are not precise. With this setting, you can add a crop for validation if you encounter an OOM during validation (though it is actually strange to hit OOM during validation).
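In config terms, "add a crop for validation" would mean inserting a SphereCrop into the val transform list from the config above. A sketch; point_max is an illustrative cap, not a maintainer recommendation, and mode="center" crops around the cloud center rather than a random seed point:

```python
val_transform = [
    dict(type="CenterShift", apply_z=True),
    dict(
        type="Copy",
        keys_dict={"coord": "origin_coord", "segment": "origin_segment"},
    ),
    dict(
        type="GridSample",
        grid_size=0.04,
        hash_type="fnv",
        mode="train",
        keys=("coord", "color", "segment"),
        return_discrete_coord=True,
    ),
    # Added: cap the number of points fed to the model during validation.
    # point_max is an illustrative value.
    dict(type="SphereCrop", point_max=100000, mode="center"),
    dict(type="CenterShift", apply_z=False),
    dict(type="NormalizeColor"),
    dict(type="ToTensor"),
    dict(
        type="Collect",
        keys=("coord", "discrete_coord", "segment"),
        offset_keys_dict=dict(offset="coord"),
        feat_keys=["coord", "color"],
    ),
]
```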