In this paper, we present GossipGraD, a Stochastic Gradient Descent (SGD)
algorithm based on a gossip communication protocol, for scaling Deep Learning
(DL) algorithms on large-scale systems. The salient features of GossipGraD
are: 1) reduction in overall communication complexity from Θ(log(p)) for p
compute nodes in well-studied SGD to O(1); 2) model diffusion, such that
compute nodes exchange their updates (gradients) indirectly after every
log(p) steps; 3) rotation of communication partners to facilitate direct
diffusion of gradients; 4) asynchronous distributed shuffling of samples
during the feedforward phase of SGD to prevent overfitting; and 5)
asynchronous communication of gradients to further reduce the communication
cost of SGD and GossipGraD. We implement GossipGraD for GPU and CPU clusters,
using NVIDIA Pascal P100 GPUs connected with InfiniBand and Intel Knights
Landing (KNL) processors connected with the Aries network. We evaluate
GossipGraD on the well-studied ImageNet-1K dataset (~250 GB) and widely
studied neural network topologies such as GoogLeNet and ResNet50 (winner of
the ImageNet Large Scale Visual Recognition Challenge, ILSVRC). Our
performance evaluation on both KNL and Pascal GPUs indicates that GossipGraD
achieves near-perfect scaling efficiency for these datasets and their
associated neural network topologies. Specifically, for ResNet50, GossipGraD
achieves ~100% compute efficiency on 128 NVIDIA Pascal P100 GPUs while
matching the top-1 classification accuracy reported in the literature.
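
To make the diffusion claims concrete, the following is a minimal,
self-contained Python/NumPy sketch (not the authors' implementation) of one
plausible partner-rotation scheme: hypercube-style XOR pairing with pairwise
averaging. Under these assumptions, each node communicates with exactly one
partner per step (the O(1) communication in feature 1), and rotating the
partner dimension (feature 3) lets every node's update reach all p nodes
within log2(p) steps (the indirect diffusion in feature 2):

    # Minimal sketch (assumptions: power-of-two node count, XOR hypercube
    # partner schedule, pairwise model averaging). This is NOT the authors'
    # code; it only illustrates the O(1)-per-step gossip and log2(p)-step
    # diffusion claims from the abstract.
    import numpy as np

    p = 8                        # number of compute nodes (power of two)
    log_p = p.bit_length() - 1   # log2(p) = 3 gossip steps to full diffusion

    # Each "node" holds a one-hot vector standing in for its local gradient,
    # so we can watch every node's contribution spread through the group.
    models = np.eye(p)

    for step in range(log_p):
        # Rotate the partner dimension: at step t, node i pairs with
        # i XOR 2^t. Each node talks to exactly ONE partner per step,
        # i.e. O(1) communication, versus the O(log p) rounds of an
        # allreduce-based SGD step.
        new_models = models.copy()
        for i in range(p):
            j = i ^ (1 << step)                  # this step's gossip partner
            new_models[i] = 0.5 * (models[i] + models[j])  # pairwise average
        models = new_models

    # After log2(p) steps every node holds the uniform average of all p
    # initial vectors, i.e. every update has (indirectly) reached every node.
    assert np.allclose(models, np.full((p, p), 1.0 / p))
    print(models[0])   # [0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125]

Without the rotation, nodes would average only within fixed disjoint pairs
and updates would never diffuse beyond them; the changing partner schedule is
what bounds full indirect diffusion at log2(p) steps while keeping per-step
communication constant.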
Description
GossipGraD: Scalable Deep Learning using Gossip Communication based
Asynchronous Gradient Descent
@misc{daily2018gossipgrad,
author = {Daily, Jeff and Vishnu, Abhinav and Siegel, Charles and Warfel, Thomas and Amatya, Vinay},
note = {arXiv:1803.05880; 13 pages, 17 figures},
title = {GossipGraD: Scalable Deep Learning using Gossip Communication based Asynchronous Gradient Descent},
url = {http://arxiv.org/abs/1803.05880},
year = 2018
}