In this paper we consider online learning in finite Markov decision processes (MDPs) with changing cost sequences under full and bandit-information. We propose to view this problem as an instance of online linear optimization. We propose two methods for this problem: MD^2 (mirror descent with approximate projections) and the continuous exponential weights algorithm with Dikin walks. We provide a rigorous complexity analysis of these techniques, while providing near-optimal regret-bounds (in particular, we take into account the computational costs of performing approximate projections in MD^2). In the case of full-information feedback, our results complement existing ones. In the case of bandit-information feedback we consider the online stochastic shortest path problem, a special case of the above MDP problems, and manage to improve the existing results by removing the previous restrictive assumption that the state-visitation probabilities are uniformly bounded away from zero under all policies.
%0 Conference Paper
%1 DiGyoSze14
%A Dick, T.
%A György, A.
%A Szepesvári, Cs.
%B ICML
%D 2014
%K MDPs, adversarial finite learning, online path problem, recurrent setting, shortest theory
%P 512--520
%T Online Learning in Markov Decision Processes with Changing Cost Sequences
%X In this paper we consider online learning in finite Markov decision processes (MDPs) with changing cost sequences under full and bandit-information. We propose to view this problem as an instance of online linear optimization. We propose two methods for this problem: MD^2 (mirror descent with approximate projections) and the continuous exponential weights algorithm with Dikin walks. We provide a rigorous complexity analysis of these techniques, while providing near-optimal regret-bounds (in particular, we take into account the computational costs of performing approximate projections in MD^2). In the case of full-information feedback, our results complement existing ones. In the case of bandit-information feedback we consider the online stochastic shortest path problem, a special case of the above MDP problems, and manage to improve the existing results by removing the previous restrictive assumption that the state-visitation probabilities are uniformly bounded away from zero under all policies.
@inproceedings{DiGyoSze14,
  abstract      = {In this paper we consider online learning in finite Markov decision processes (MDPs) with changing cost sequences under full and bandit-information. We propose to view this problem as an instance of online linear optimization. We propose two methods for this problem: MD$^2$ (mirror descent with approximate projections) and the continuous exponential weights algorithm with Dikin walks. We provide a rigorous complexity analysis of these techniques, while providing near-optimal regret-bounds (in particular, we take into account the computational costs of performing approximate projections in MD$^2$). In the case of full-information feedback, our results complement existing ones. In the case of bandit-information feedback we consider the online stochastic shortest path problem, a special case of the above MDP problems, and manage to improve the existing results by removing the previous restrictive assumption that the state-visitation probabilities are uniformly bounded away from zero under all policies.},
  added-at      = {2020-03-17T03:03:01.000+0100},
  author        = {Dick, T. and Gy{\"o}rgy, A. and Szepesv{\'a}ri, {Cs}.},
  biburl        = {https://www.bibsonomy.org/bibtex/24e5be2dd7313e150e041a79591ac417f/csaba},
  booktitle     = {ICML},
  date-added    = {2014-01-13 10:52:36 -0700},
  date-modified = {2014-07-19 17:47:40 -0600},
  interhash     = {fa185285a825b3dcbd5b7e40de1926a7},
  internal-note = {NOTE(review): month=January kept from the original record, but ICML 2014 took place in June -- verify},
  intrahash     = {4e5be2dd7313e150e041a79591ac417f},
  keywords      = {MDPs, adversarial finite learning, online path problem, recurrent setting, shortest theory},
  month         = jan,
  pages         = {512--520},
  pdf           = {papers/omdp_icml.pdf},
  timestamp     = {2020-03-17T03:03:01.000+0100},
  title         = {Online Learning in {Markov} Decision Processes with Changing Cost Sequences},
  year          = {2014},
}