Although deep learning has produced dazzling successes in image, speech, and
video processing over the past few years, most training is performed with
suboptimal hyper-parameters, requiring unnecessarily long training times.
Setting the hyper-parameters remains a black art that takes years of
experience to acquire. This report proposes several efficient ways to set the
hyper-parameters that significantly reduce training time and improve
performance. Specifically, it shows how to examine the training and
validation/test loss curves for subtle clues of underfitting and overfitting,
and it suggests guidelines for moving toward the optimal balance point. It
then discusses how to increase or decrease the learning rate and momentum to
speed up training. Our experiments show that it is crucial to balance every
manner of regularization for each dataset and architecture. Weight decay is
used as a sample regularizer to show how its optimal value is tightly coupled
with the learning rate and momentum.
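
One concrete form of the "increase/decrease the learning rate and momentum"
advice that this report popularized is the 1cycle schedule: the learning rate
ramps up to a maximum and back down while momentum is cycled in the opposite
direction. Below is a minimal sketch in plain Python. The helper one_cycle is
hypothetical, and the specific values (div_factor=10, momentum cycled between
0.95 and 0.85, linear ramps over two equal halves) are illustrative
assumptions for demonstration, not the report's exact recipe.

    def one_cycle(step, total_steps, max_lr, div_factor=10.0,
                  max_momentum=0.95, min_momentum=0.85):
        # Linear 1cycle sketch: LR rises from max_lr/div_factor to max_lr
        # over the first half of training, then falls back; momentum moves
        # in the opposite direction. All constants here are illustrative.
        base_lr = max_lr / div_factor
        half = total_steps / 2.0
        if step <= half:
            frac = step / half                    # 0 -> 1 (warmup)
        else:
            frac = (total_steps - step) / half    # 1 -> 0 (cooldown)
        lr = base_lr + frac * (max_lr - base_lr)
        momentum = max_momentum - frac * (max_momentum - min_momentum)
        return lr, momentum

    # Example: sample the schedule at a few points of a 1000-step run.
    for s in (0, 250, 500, 750, 1000):
        print(s, one_cycle(s, 1000, max_lr=0.1))

PyTorch's built-in torch.optim.lr_scheduler.OneCycleLR implements the same
idea, including the inverse cycling of momentum alongside the learning rate.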
Description
A disciplined approach to neural network hyper-parameters: Part 1 --
learning rate, batch size, momentum, and weight decay
@misc{smith2018disciplined,
author = {Smith, Leslie N.},
keywords = {optimization seminar},
note = {cite arxiv:1803.09820},
title = {A disciplined approach to neural network hyper-parameters: Part 1 --
learning rate, batch size, momentum, and weight decay},
url = {http://arxiv.org/abs/1803.09820},
year = 2018
}