In this paper, we present a co-designed petascale high-density GPU cluster to
expedite distributed deep learning training with synchronous Stochastic
Gradient Descent~(SSGD). The architecture of our heterogeneous cluster is
inspired by the Harvard architecture. According to their different roles in the
system, nodes are configured with different specifications. Based on the
topology of the whole system's network and the properties of the different node
types, we develop and implement a novel job server parallel software framework,
named "MiMatrix", for distributed deep learning training. In contrast to the
parameter server framework, in which the parameter server is a bottleneck for
data transfer in the AllReduce step of SSGD, the job server undertakes all
controlling, scheduling and monitoring tasks without transferring model data.
In MiMatrix, we propose a novel GPUDirect Remote Direct Memory
Access~(RDMA)-aware parallel AllReduce algorithm executed by the computing
servers, in which both the computation and the handshake messages are $O(1)$ at
each epoch.
@misc{chen2018mimatrix,
abstract = {In this paper, we present a co-designed petascale high-density GPU cluster to
expedite distributed deep learning training with synchronous Stochastic
Gradient Descent~(SSGD). The architecture of our heterogeneous cluster is
inspired by the Harvard architecture. According to their different roles in the
system, nodes are configured with different specifications. Based on the
topology of the whole system's network and the properties of the different node
types, we develop and implement a novel job server parallel software framework,
named "\textit{MiMatrix}", for distributed deep learning training. In contrast
to the parameter server framework, in which the parameter server is a
bottleneck for data transfer in the AllReduce step of SSGD, the job server
undertakes all controlling, scheduling and monitoring tasks without
transferring model data. In MiMatrix, we propose a novel GPUDirect Remote
Direct Memory Access~(RDMA)-aware parallel AllReduce algorithm executed by the
computing servers, in which both the computation and the handshake messages are
$O(1)$ at each epoch.},
added-at = {2018-02-13T09:25:43.000+0100},
author = {Chen, Xin and Zhou, Hua and Gao, Yuxiang and Zhu, Yu and Wang, Dongyan},
biburl = {https://www.bibsonomy.org/bibtex/2aa8136df09e81c27c891b60a1742c9d8/jk_itwm},
description = {1802.02326.pdf},
interhash = {ec496122f3a8f416dd5a3454f671dd27},
intrahash = {aa8136df09e81c27c891b60a1742c9d8},
keywords = {distributed},
note = {cite arxiv:1802.02326. Comment: 13 pages, 5 figures},
timestamp = {2018-02-13T09:25:43.000+0100},
title = {MiMatrix: A Massively Distributed Deep Learning Framework on a Petascale
High-density Heterogeneous Cluster},
url = {http://arxiv.org/abs/1802.02326},
year = 2018
}