In this paper, we introduce a novel form of value function, $Q(s, s')$, that
expresses the utility of transitioning from a state $s$ to a neighboring state
$s'$ and then acting optimally thereafter. In order to derive an optimal
policy, we develop a forward dynamics model that learns to make next-state
predictions that maximize this value. This formulation decouples actions from
values while still learning off-policy. We highlight the benefits of this
approach in terms of value function transfer, learning within redundant action
spaces, and learning off-policy from state observations generated by
sub-optimal or completely random policies. Code and videos are available at
sites.google.com/view/qss-paper.
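
Concretely, the recurrence behind $Q(s, s')$ can be written as a Bellman equation over state transitions rather than actions. The following is a minimal sketch consistent with the abstract's description; the notation ($r$, $\gamma$, $\tau$, $I$) is assumed here rather than quoted from the paper:

$$Q(s, s') = r(s, s') + \gamma \max_{s''} Q(s', s''),$$

where $r(s, s')$ is the reward for the transition and the maximum ranges over states $s''$ reachable from $s'$. Acting then plausibly factors into two learned pieces: a forward model $\tau(s) \approx \arg\max_{s'} Q(s, s')$ that proposes the value-maximizing next state, and an inverse dynamics model $a = I(s, s')$ that recovers an action realizing that transition, which is how the formulation can decouple actions from values.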
@article{edwards2020estimating,
  author   = {Edwards, Ashley D. and Sahni, Himanshu and Liu, Rosanne and Hung, Jane and Jain, Ankit and Wang, Rui and Ecoffet, Adrien and Miconi, Thomas and Isbell, Charles and Yosinski, Jason},
  title    = {Estimating Q(s,s') with Deep Deterministic Dynamics Gradients},
  journal  = {arXiv preprint arXiv:2002.09505},
  keywords = {reinforcement-learning},
  url      = {http://arxiv.org/abs/2002.09505},
  year     = {2020}
}