We introduce fast algorithms for selecting a random sample of n records without replacement from a pool of N records, where the value of N is unknown beforehand. The main result of the paper is the design and analysis of Algorithm Z; it does the sampling in one pass using constant space and in O(n(1 + log(N/n))) expected time, which is optimum, up to a constant factor. Several optimizations are studied that collectively improve the speed of the naive version of the algorithm by an order of magnitude. We give an efficient Pascal-like implementation that incorporates these modifications and that is suitable for general use. Theoretical and empirical results indicate that Algorithm Z outperforms current methods by a significant margin.
%0 Journal Article
%1 vitter85
%A Vitter, Jeffrey S.
%C New York, NY, USA
%D 1985
%I ACM
%J ACM Trans. Math. Softw.
%K algorithm probability randomized reservoir.sampling sampling streaming_model
%N 1
%P 37--57
%R 10.1145/3147.3165
%T Random Sampling with a Reservoir
%V 11
%X We introduce fast algorithms for selecting a random sample of n records without replacement from a pool of N records, where the value of N is unknown beforehand. The main result of the paper is the design and analysis of Algorithm Z; it does the sampling in one pass using constant space and in O(n(1 + log(N/n))) expected time, which is optimum, up to a constant factor. Several optimizations are studied that collectively improve the speed of the naive version of the algorithm by an order of magnitude. We give an efficient Pascal-like implementation that incorporates these modifications and that is suitable for general use. Theoretical and empirical results indicate that Algorithm Z outperforms current methods by a significant margin.
@article{vitter85,
abstract = {We introduce fast algorithms for selecting a random sample of n records without replacement from a pool of N records, where the value of N is unknown beforehand. The main result of the paper is the design and analysis of Algorithm Z; it does the sampling in one pass using constant space and in O(n(1 + log(N/n))) expected time, which is optimum, up to a constant factor. Several optimizations are studied that collectively improve the speed of the naive version of the algorithm by an order of magnitude. We give an efficient Pascal-like implementation that incorporates these modifications and that is suitable for general use. Theoretical and empirical results indicate that Algorithm Z outperforms current methods by a significant margin.},
acmid = {3165},
added-at = {2013-10-06T01:37:34.000+0200},
address = {New York, NY, USA},
author = {Vitter, Jeffrey S.},
biburl = {https://www.bibsonomy.org/bibtex/2f84bfb9fa8054f24e45dc2d4eb06cb70/ytyoun},
doi = {10.1145/3147.3165},
interhash = {e8302dda6029e965ccd221fe4a47b79b},
intrahash = {f84bfb9fa8054f24e45dc2d4eb06cb70},
issn = {0098-3500},
issue_date = {March 1985},
journal = {ACM Trans. Math. Softw.},
keywords = {algorithm probability randomized reservoir.sampling sampling streaming_model},
month = mar,
number = 1,
numpages = {21},
pages = {37--57},
publisher = {ACM},
timestamp = {2016-05-20T08:21:51.000+0200},
title = {Random Sampling with a Reservoir},
volume = 11,
year = 1985
}