- AIPressRoom
- Posts
- Multiple GPU training in PyTorch and Gradient Accumulation as an alternative to it | by Alexey Kravets | Jul, 2023
Multiple GPU training in PyTorch and Gradient Accumulation as an alternative to it | by Alexey Kravets | Jul, 2023
# CUDA_VISIBLE_DEVICES must be set before torch initializes CUDA, so the
# os import and the environment assignment come before any torch import.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print(os.environ["CUDA_VISIBLE_DEVICES"])

import argparse
import random

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributed import init_process_group
from torch.utils.data import DataLoader, Dataset, Sampler
class data_set(Dataset):
    """Minimal Dataset wrapper that returns (index, sample) pairs.

    Returning the index alongside the sample makes it easy to verify
    exactly which rows each process receives under distributed sampling.
    """

    def __init__(self, df):
        # `df` is any sized, indexable container (here: a 2-D tensor of rows).
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        sample = self.df[index]
        return index, sample
class NeuralNetwork(nn.Module):
    """A single bias-free linear layer whose weights are fixed to 1.

    forward() returns the sum of the layer outputs, so the "loss" is a
    deterministic function of the input — convenient for comparing the
    gradients produced by DDP against gradient accumulation.
    """

    def __init__(self, dsize):
        super().__init__()
        self.linear = nn.Linear(dsize, 1, bias=False)
        # Deterministic weights so separate runs are exactly comparable.
        self.linear.weight.data[:] = 1.0

    # Must be named `forward` — nn.Module.__call__ dispatches to it, so
    # the scraped name ("ahead") would have raised at model(row).
    def forward(self, x):
        x = self.linear(x)
        loss = x.sum()
        return loss
class DummySampler(Sampler):
    """Yield indices in the order a 2-GPU DistributedSampler would.

    For each contiguous chunk of ``batch_size * n_gpus`` samples it first
    emits the even-offset ("rank 0") indices, then the odd-offset
    ("rank 1") ones, so a single process reproduces the multi-GPU data
    split exactly.  NOTE: the two hard-coded offsets (i and i+1) mean the
    interleave is only correct for n_gpus == 2, matching the article.
    """

    def __init__(self, data, batch_size, n_gpus=2):
        self.num_samples = len(data)
        self.b_size = batch_size
        self.n_gpus = n_gpus

    def __iter__(self):
        ids = []
        chunk = self.b_size * self.n_gpus
        for i in range(0, self.num_samples, chunk):
            # Indices rank 0 would receive from this chunk ...
            ids.append(np.arange(self.num_samples)[i:i + chunk:self.n_gpus])
            # ... followed by the indices rank 1 would receive.
            ids.append(np.arange(self.num_samples)[i + 1:(i + 1) + chunk:self.n_gpus])
        return iter(np.concatenate(ids))

    def __len__(self):
        return self.num_samples
def major(args=None):
    """Train the toy model with DDP or with gradient accumulation.

    (Function name kept as in the original source; it is the script's
    main entry point.)  In distributed mode each rank processes its data
    shard and DistributedDataParallel averages gradients across ranks; in
    single-process mode the loss is divided by ``args.acc_steps`` and
    gradients are accumulated so the resulting update matches the
    multi-GPU run exactly.

    Requires a CUDA device; in distributed mode it must be launched via
    torchrun so that LOCAL_RANK is set.
    """
    d_size = args.data_size

    if args.distributed:
        init_process_group(backend="nccl")
        device = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device)
    else:
        device = "cuda:0"

    # Fix the seed for reproducibility.
    seed = args.seed
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.benchmark = True

    # Generate data: a square matrix of random rows.
    data = torch.rand(d_size, d_size)

    model = NeuralNetwork(args.data_size)
    model = model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device])

    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    dataset = data_set(data)

    if args.distributed:
        sampler = torch.utils.data.DistributedSampler(dataset, shuffle=False)
    else:
        # DummySampler reproduces the exact row-to-rank split that
        # DistributedSampler would produce, for an apples-to-apples
        # comparison as described in the article.
        sampler = DummySampler(dataset, args.batch_size)

    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=0,
        pin_memory=True,
        sampler=sampler,
        shuffle=False,
        collate_fn=None,
    )

    # ACC_STEPS equals the number of GPUs being emulated: dividing the
    # loss by it makes the accumulated gradients equal the gradients that
    # multiple GPUs would have averaged together.
    ACC_STEPS = args.acc_steps
    optimizer.zero_grad()

    for epoch in range(args.epochs):
        if args.distributed:
            loader.sampler.set_epoch(epoch)

        for i, (idxs, row) in enumerate(loader):
            if args.distributed:
                optimizer.zero_grad()

            row = row.to(device, non_blocking=True)

            # `rank` is True only on rank 0 (or always, single-process),
            # gating the prints below to one process.
            if args.distributed:
                rank = dist.get_rank() == 0
            else:
                rank = True

            loss = model(row)

            if args.distributed:
                # Gradients are averaged across ranks automatically by the
                # DistributedDataParallel wrapper.
                loss.backward()
            else:
                # Scale the loss by the accumulation steps so the summed
                # gradients equal the multi-GPU average.
                loss = loss / ACC_STEPS
                loss.backward()

            if i == 0 and rank:
                print(f"Epoch {epoch} {100 * '='}")

            if not args.distributed:
                # Only step once ACC_STEPS micro-batches have accumulated.
                if (i + 1) % ACC_STEPS == 0:
                    optimizer.step()
                    optimizer.zero_grad()
            else:
                optimizer.step()

    if not args.distributed and args.verbose:
        print(100 * "=")
        print("Model weights : ", model.linear.weight)
        print(100 * "=")
    elif args.distributed and args.verbose and rank:
        print(100 * "=")
        # Under DDP the underlying module is reached via `.module`.
        print("Model weights : ", model.module.linear.weight)
        print(100 * "=")
if __name__ == "__main__":
    # CLI: pass --distributed when launching under torchrun; otherwise a
    # single process emulates the multi-GPU run via gradient accumulation.
    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', action='store_true')
    parser.add_argument('--seed', default=0, type=int)
    parser.add_argument('--epochs', default=2, type=int)
    parser.add_argument('--batch_size', default=4, type=int)
    parser.add_argument('--data_size', default=16, type=int)
    parser.add_argument('--acc_steps', default=3, type=int)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()
    print(args)
    major(args)