Computational results demonstrate that posterior sampling for reinforcement
learning (PSRL) dramatically outperforms algorithms driven by optimism, such as
UCRL2. We provide insight into the extent of this performance boost and the
phenomenon that drives it. We leverage this insight to establish an
$\tilde{O}(H\sqrt{SAT})$ Bayesian expected regret bound for PSRL in
finite-horizon episodic Markov decision processes, where $H$ is the horizon,
$S$ is the number of states, $A$ is the number of actions and $T$ is the time
elapsed. This improves upon the best previous bound of $\tilde{O}(H S
\sqrt{AT})$ for any reinforcement learning algorithm.
%0 Journal Article
%1 osband2016posterior
%A Osband, Ian
%A Van Roy, Benjamin
%D 2016
%K approximate bayesian reinforcement-learning sampling uncertainty
%T Why is Posterior Sampling Better than Optimism for Reinforcement
Learning?
%U http://arxiv.org/abs/1607.00215
%X Computational results demonstrate that posterior sampling for reinforcement
learning (PSRL) dramatically outperforms algorithms driven by optimism, such as
UCRL2. We provide insight into the extent of this performance boost and the
phenomenon that drives it. We leverage this insight to establish an
$\tilde{O}(H\sqrt{SAT})$ Bayesian expected regret bound for PSRL in
finite-horizon episodic Markov decision processes, where $H$ is the horizon,
$S$ is the number of states, $A$ is the number of actions and $T$ is the time
elapsed. This improves upon the best previous bound of $\tilde{O}(H S
\sqrt{AT})$ for any reinforcement learning algorithm.
@article{osband2016posterior,
abstract = {Computational results demonstrate that posterior sampling for reinforcement
learning (PSRL) dramatically outperforms algorithms driven by optimism, such as
UCRL2. We provide insight into the extent of this performance boost and the
phenomenon that drives it. We leverage this insight to establish an
$\tilde{O}(H\sqrt{SAT})$ Bayesian expected regret bound for PSRL in
finite-horizon episodic Markov decision processes, where $H$ is the horizon,
$S$ is the number of states, $A$ is the number of actions and $T$ is the time
elapsed. This improves upon the best previous bound of $\tilde{O}(H S
\sqrt{AT})$ for any reinforcement learning algorithm.},
added-at = {2020-01-13T12:33:25.000+0100},
author = {Osband, Ian and Van Roy, Benjamin},
biburl = {https://www.bibsonomy.org/bibtex/2142d9e8332d897184103e9ee8fa0aa21/kirk86},
description = {[1607.00215] Why is Posterior Sampling Better than Optimism for Reinforcement Learning?},
interhash = {b4f3d6f682b4f4ea0d17bbc777c61fac},
intrahash = {142d9e8332d897184103e9ee8fa0aa21},
keywords = {approximate bayesian reinforcement-learning sampling uncertainty},
note = {cite arxiv:1607.00215},
timestamp = {2020-01-13T12:33:25.000+0100},
title = {Why is Posterior Sampling Better than Optimism for Reinforcement
Learning?},
url = {http://arxiv.org/abs/1607.00215},
year = 2016
}