(This version is identical to the MLJ version except that in the proof of Theorem 2 a minor issue in the proof is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) reinforcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidate functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bellman error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem when the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.
%0 Journal Article
%1 farahmand2010
%A Farahmand, A.m.
%A Szepesvári, Cs.
%D 2011
%J Machine Learning
%K Bellman residuals, Markov Decision Processes, adaptivity, model selection, nonparametrics, reinforcement learning, theory
%N 3
%P 299--332
%R 10.1007/s10994-011-5254-7
%T Model Selection in Reinforcement Learning
%V 85
%X (This version is identical to the MLJ version except that in the proof of Theorem 2 a minor issue in the proof is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) reinforcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidate functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bellman error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem when the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.
@article{farahmand2010,
  abstract      = {(This version is identical to the MLJ version except that in the proof of Theorem 2 a minor issue in the proof is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) reinforcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidate functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bellman error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem when the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.},
  added-at      = {2020-03-17T03:03:01.000+0100},
  author        = {Farahmand, Amir-massoud and Szepesv{\'a}ri, Csaba},
  bdsk-url-1    = {http://dx.doi.org/10.1007/s10994-011-5254-7},
  biburl        = {https://www.bibsonomy.org/bibtex/29a6dd0e11dcdd11f23b95204cacdbd93/csaba},
  date-added    = {2011-07-03 21:08:30 -0600},
  date-modified = {2012-06-03 14:11:12 -0600},
  doi           = {10.1007/s10994-011-5254-7},
  interhash     = {e80696229afd9024a49a521d3c5a7f1a},
  intrahash     = {9a6dd0e11dcdd11f23b95204cacdbd93},
  journal       = {Machine Learning},
  keywords      = {Bellman residuals, Markov Decision Processes, adaptivity, model selection, nonparametrics, reinforcement learning, theory},
  month         = dec,
  number        = {3},
  pages         = {299--332},
  pdf           = {papers/RLModelSelect.pdf},
  timestamp     = {2020-03-17T03:03:01.000+0100},
  title         = {Model Selection in Reinforcement Learning},
  volume        = {85},
  year          = {2011},
}