The statistical problem of using an initial sample to estimate the number of unique species in a
larger sample has found important applications in fields far removed from ecology. Here we address
the general problem of estimating the number of species that will be represented at least r times, for
any r ≥ 1, in a future sample. We derive a procedure to construct estimators that apply universally for
a given population: once constructed, they can be evaluated as a simple function of r. Our approach
is based on a relationship between the number of species represented at least r times and the higher
derivatives of the number of unique species seen per unit of sampling. We further show the estimators
retain asymptotic behaviors that are essential for applications on large-scale data sets and for large r.
We validate practical performance of this approach by applying it to analyze Dickens’ vocabulary, the
topology of a Twitter social network, and DNA sequencing data.
%0 Journal Article
%1 deng2016estimating
%A Deng, Chao
%A Smith, Andrew D.
%D 2016
%J ArXiv e-prints
%K Pade_approximation diversity_estimation methods number_of_unseen_species species_richness
%T Estimating the number of species to attain sufficient representation in a random sample
%X The statistical problem of using an initial sample to estimate the number of unique species in a
larger sample has found important applications in fields far removed from ecology. Here we address
the general problem of estimating the number of species that will be represented at least r times, for
any r ≥ 1, in a future sample. We derive a procedure to construct estimators that apply universally for
a given population: once constructed, they can be evaluated as a simple function of r. Our approach
is based on a relationship between the number of species represented at least r times and the higher
derivatives of the number of unique species seen per unit of sampling. We further show the estimators
retain asymptotic behaviors that are essential for applications on large-scale data sets and for large r.
We validate practical performance of this approach by applying it to analyze Dickens’ vocabulary, the
topology of a Twitter social network, and DNA sequencing data.
@comment{
  deng2016estimating: arXiv e-print 1607.02804 (primary class stat.ME).
  Record exported from BibSonomy; the ads* fields come from the SAO/NASA ADS
  record and added-at/biburl/interhash/intrahash/timestamp are BibSonomy
  bookkeeping fields that bibliography styles ignore.
  Non-ASCII characters in the abstract are written as LaTeX (math-mode geq,
  ASCII apostrophe) so the entry is safe for 8-bit BibTeX as well as Biber.
}
@article{deng2016estimating,
  author        = {Deng, Chao and Smith, Andrew D.},
  title         = {Estimating the number of species to attain sufficient representation in a random sample},
  journal       = {ArXiv e-prints},
  year          = {2016},
  month         = jul,
  archiveprefix = {arXiv},
  eprint        = {1607.02804},
  primaryclass  = {stat.ME},
  abstract      = {The statistical problem of using an initial sample to estimate the number of unique species in a
    larger sample has found important applications in fields far removed from ecology. Here we address
    the general problem of estimating the number of species that will be represented at least r times, for
    any r $\geq$ 1, in a future sample. We derive a procedure to construct estimators that apply universally for
    a given population: once constructed, they can be evaluated as a simple function of r. Our approach
    is based on a relationship between the number of species represented at least r times and the higher
    derivatives of the number of unique species seen per unit of sampling. We further show the estimators
    retain asymptotic behaviors that are essential for applications on large-scale data sets and for large r.
    We validate practical performance of this approach by applying it to analyze Dickens' vocabulary, the
    topology of a Twitter social network, and DNA sequencing data.},
  keywords      = {Pade_approximation diversity_estimation methods number_of_unseen_species species_richness},
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System},
  adsurl        = {http://adsabs.harvard.edu/abs/2016arXiv160702804D},
  added-at      = {2016-07-13T02:31:04.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2004dec2ac276f39872e645a9eaa064f5/peter.ralph},
  interhash     = {055963340468daa570993fb1701201bb},
  intrahash     = {004dec2ac276f39872e645a9eaa064f5},
  timestamp     = {2016-07-13T02:31:04.000+0200},
}