Distributed applications are hard to debug because timing-dependent network communication is a source of non-deterministic behavior. Current approaches to debug non-deterministic failures include post-mortem debugging as well as record and replay. However, the first impairs system performance to gather data, whereas the latter requires developers to understand the timing-dependent communication at a lower level of abstraction than they develop at. Furthermore, both approaches require intrusive core library modifications to gather data from live systems. In this paper, we present the Peek-At-Talk debugger for investigating non-deterministic failures with low overhead in a systematic, top-down method, with a particular focus on tool-building issues in the following areas: First, we show how our debugging framework Path Tools guides developers from failures to their root causes and gathers run-time data with low overhead. Second, we present Peek-At-Talk, an extension to our Path Tools framework to record non-deterministic communication and refine behavioral data that connects source code with network events. Finally, we scope changes to the core library to record network communication without impacting other network applications.
Description
Implementing record and refinement for debugging timing-dependent communication - ScienceDirect
%0 Journal Article
%1 Felgentreff:2017
%A Felgentreff, Tim
%A Perscheid, Michael
%A Hirschfeld, Robert
%D 2017
%J Science of Computer Programming
%K Debugging Distribution Optimization Record Refine Replay
%P 4--18
%R 10.1016/j.scico.2015.11.006
%T Implementing record and refinement for debugging timing-dependent communication
%U http://www.sciencedirect.com/science/article/pii/S0167642315003585
%V 134
%X Distributed applications are hard to debug because timing-dependent network communication is a source of non-deterministic behavior. Current approaches to debug non-deterministic failures include post-mortem debugging as well as record and replay. However, the first impairs system performance to gather data, whereas the latter requires developers to understand the timing-dependent communication at a lower level of abstraction than they develop at. Furthermore, both approaches require intrusive core library modifications to gather data from live systems. In this paper, we present the Peek-At-Talk debugger for investigating non-deterministic failures with low overhead in a systematic, top-down method, with a particular focus on tool-building issues in the following areas: First, we show how our debugging framework Path Tools guides developers from failures to their root causes and gathers run-time data with low overhead. Second, we present Peek-At-Talk, an extension to our Path Tools framework to record non-deterministic communication and refine behavioral data that connects source code with network events. Finally, we scope changes to the core library to record network communication without impacting other network applications.
@comment{Journal article exported from BibSonomy. The title is stored without whole-title brace protection so that bibliography styles can apply their own casing. The added-at, biburl, interhash, intrahash and timestamp fields are BibSonomy bookkeeping and are ignored by standard styles.}
@article{Felgentreff:2017,
  author      = {Felgentreff, Tim and Perscheid, Michael and Hirschfeld, Robert},
  title       = {Implementing record and refinement for debugging timing-dependent communication},
  journal     = {Science of Computer Programming},
  volume      = {134},
  pages       = {4--18},
  year        = {2017},
  note        = {6th issue of Experimental Software and Toolkits (EST-6)},
  issn        = {0167-6423},
  doi         = {10.1016/j.scico.2015.11.006},
  url         = {http://www.sciencedirect.com/science/article/pii/S0167642315003585},
  keywords    = {Debugging Distribution Optimization Record Refine Replay},
  abstract    = {Distributed applications are hard to debug because timing-dependent network communication is a source of non-deterministic behavior. Current approaches to debug non-deterministic failures include post-mortem debugging as well as record and replay. However, the first impairs system performance to gather data, whereas the latter requires developers to understand the timing-dependent communication at a lower level of abstraction than they develop at. Furthermore, both approaches require intrusive core library modifications to gather data from live systems. In this paper, we present the Peek-At-Talk debugger for investigating non-deterministic failures with low overhead in a systematic, top-down method, with a particular focus on tool-building issues in the following areas: First, we show how our debugging framework Path Tools guides developers from failures to their root causes and gathers run-time data with low overhead. Second, we present Peek-At-Talk, an extension to our Path Tools framework to record non-deterministic communication and refine behavioral data that connects source code with network events. Finally, we scope changes to the core library to record network communication without impacting other network applications.},
  description = {Implementing record and refinement for debugging timing-dependent communication - ScienceDirect},
  added-at    = {2018-05-03T16:23:53.000+0200},
  timestamp   = {2018-05-03T16:23:53.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/2f057cf5105c82b8baa5b6498cdd2646b/gron},
  interhash   = {e786122d50460812ece19ba2a39eabaf},
  intrahash   = {f057cf5105c82b8baa5b6498cdd2646b},
}