
Multiple GPU training in PyTorch and Gradient Accumulation as an alternative to it | by Alexey Kravets | Jul, 2023

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print(os.environ["CUDA_VISIBLE_DEVICES"])

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Sampler
import argparse
import torch.optim as optim
import numpy as np
import random
import torch.backends.cudnn as cudnn
import torch.nn.functional as F

from torch.distributed import init_process_group
import torch.distributed as dist

class data_set(Dataset):

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sample = self.df[index]
        return index, sample

class NeuralNetwork(nn.Module):
    def __init__(self, dsize):
        super().__init__()
        self.linear = nn.Linear(dsize, 1, bias=False)
        self.linear.weight.data[:] = 1.

    def forward(self, x):
        x = self.linear(x)
        loss = x.sum()
        return loss

class DummySampler(Sampler):
    def __init__(self, data, batch_size, n_gpus=2):
        self.num_samples = len(data)
        self.b_size = batch_size
        self.n_gpus = n_gpus

    def __iter__(self):
        ids = []
        for i in range(0, self.num_samples, self.b_size * self.n_gpus):
            ids.append(np.arange(self.num_samples)[i: i + self.b_size * self.n_gpus: self.n_gpus])
            ids.append(np.arange(self.num_samples)[i + 1: (i + 1) + self.b_size * self.n_gpus: self.n_gpus])
        return iter(np.concatenate(ids))

    def __len__(self):
        # print('\tcalling Sampler:__len__')
        return self.num_samples
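# Not part of the original script: a minimal sanity check (under the default
# data_size=16, batch_size=4, n_gpus=2 used below) of the index order
# DummySampler yields:
#
#     sampler = DummySampler(list(range(16)), batch_size=4)
#     print([int(i) for i in sampler])
#     # [0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15]
#
# Consecutive batches of 4 therefore alternate between the indices that
# DistributedSampler(shuffle=False) would assign to rank 0 and rank 1, so the
# single-GPU run consumes the same per-batch splits as the 2-GPU run.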

def main(args=None):

    d_size = args.data_size

    if args.distributed:
        init_process_group(backend="nccl")
        device = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device)
    else:
        device = "cuda:0"

    # fix the seed for reproducibility
    seed = args.seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.benchmark = True

    # generate data
    data = torch.rand(d_size, d_size)

    model = NeuralNetwork(args.data_size)
    model = model.to(device)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device])

    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    dataset = data_set(data)

    if args.distributed:
        sampler = torch.utils.data.DistributedSampler(dataset, shuffle=False)
    else:
        # we define `DummySampler` for exact reproducibility with `DistributedSampler`,
        # which splits the data as described in the article
        sampler = DummySampler(dataset, args.batch_size)

    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=0,
        pin_memory=True,
        sampler=sampler,
        shuffle=False,
        collate_fn=None,
    )

    if not args.distributed:
        grads = []

    # ACC_STEPS same as the number of GPUs, as we need to divide the loss by
    # this number to obtain the same gradient as from multiple GPUs that are
    # averaged together
    ACC_STEPS = args.acc_steps
    optimizer.zero_grad()
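    # Not in the original: with 2 GPUs and ACC_STEPS = 2, DDP averages the two
    # per-GPU gradients, (g1 + g2) / 2, while accumulation adds g1/2 and g2/2
    # into .grad before stepping, which gives the same value; this is why the
    # loss is divided by ACC_STEPS below.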

    for epoch in range(args.epochs):

        if args.distributed:
            loader.sampler.set_epoch(epoch)

        for i, (idxs, row) in enumerate(loader):

            if args.distributed:
                optimizer.zero_grad()

            row = row.to(device, non_blocking=True)

            if args.distributed:
                rank = dist.get_rank() == 0
            else:
                rank = True

            loss = model(row)

            if args.distributed:
                # gradients are averaged automatically thanks to the model being
                # wrapped in `DistributedDataParallel`
                loss.backward()
            else:
                # scale the loss according to the number of accumulation steps
                loss = loss / ACC_STEPS
                loss.backward()

            if i == 0 and rank:
                print(f"Epoch {epoch} {100 * '='}")

            if not args.distributed:
                if (i + 1) % ACC_STEPS == 0:  # only step once we have done ACC_STEPS
                    # accumulate grads for the entire epoch
                    optimizer.step()
                    optimizer.zero_grad()
            else:
                optimizer.step()

    if not args.distributed and args.verbose:
        print(100 * "=")
        print("Model weights : ", model.linear.weight)
        print(100 * "=")
    elif args.distributed and args.verbose and rank:
        print(100 * "=")
        print("Model weights : ", model.module.linear.weight)
        print(100 * "=")

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', action='store_true')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--epochs', default=2, type=int)
    parser.add_argument('--batch_size', default=4, type=int)
    parser.add_argument('--data_size', default=16, type=int)
    parser.add_argument('--acc_steps', default=3, type=int)
    parser.add_argument('--verbose', action='store_true')

    args = parser.parse_args()

    print(args)

    main(args)
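The excerpt does not show how the script is launched, so the following is only a suggested invocation, with `train.py` as a placeholder file name. The two-GPU DDP run would be started with torchrun, e.g. `torchrun --nproc_per_node=2 train.py --distributed --verbose` (torchrun sets the `LOCAL_RANK` variable the script reads), while the single-GPU gradient-accumulation run would be `python train.py --acc_steps 2 --verbose`, with `--acc_steps` equal to the number of GPUs so that, per the comments in the script, both runs should print the same final model weights.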