In online social networks, it is common to use predictions of node categories
to estimate measures of homophily and other relational properties. However,
online social network data often lacks basic demographic information about the
nodes. Researchers must rely on predicted node attributes to estimate measures
of homophily, but little is known about the validity of these measures. We show
that estimating homophily in a network can be viewed as a dyadic prediction
problem, and that homophily estimates are unbiased when dyad-level residuals
sum to zero in the network. Node-level prediction models, such as the use of
names to classify ethnicity or gender, do not generally have this property and
can introduce large biases into homophily estimates. Bias occurs due to error
autocorrelation along dyads. Importantly, node-level classification performance
is not a reliable indicator of estimation accuracy for homophily. We compare
estimation strategies that make predictions at the node and dyad levels,
evaluating performance in different settings. We propose a novel "ego-alter"
modeling approach that outperforms standard node and dyad classification
strategies. While this paper focuses on homophily, results generalize to other
relational measures which aggregate predictions along the dyads in a network.
We conclude with suggestions for research designs to study homophily in online
networks. Code for this paper is available at
https://github.com/georgeberry/autocorr.
%0 Generic
%1 berry2020going
%A Berry, George
%A Sirianni, Antonio
%A Weber, Ingmar
%A An, Jisun
%A Macy, Michael
%D 2020
%K homophily uncertainty
%T Going beyond accuracy: estimating homophily in social networks using
predictions
%U http://arxiv.org/abs/2001.11171
%X In online social networks, it is common to use predictions of node categories
to estimate measures of homophily and other relational properties. However,
online social network data often lacks basic demographic information about the
nodes. Researchers must rely on predicted node attributes to estimate measures
of homophily, but little is known about the validity of these measures. We show
that estimating homophily in a network can be viewed as a dyadic prediction
problem, and that homophily estimates are unbiased when dyad-level residuals
sum to zero in the network. Node-level prediction models, such as the use of
names to classify ethnicity or gender, do not generally have this property and
can introduce large biases into homophily estimates. Bias occurs due to error
autocorrelation along dyads. Importantly, node-level classification performance
is not a reliable indicator of estimation accuracy for homophily. We compare
estimation strategies that make predictions at the node and dyad levels,
evaluating performance in different settings. We propose a novel "ego-alter"
modeling approach that outperforms standard node and dyad classification
strategies. While this paper focuses on homophily, results generalize to other
relational measures which aggregate predictions along the dyads in a network.
We conclude with suggestions for research designs to study homophily in online
networks. Code for this paper is available at
https://github.com/georgeberry/autocorr.
@comment{berry2020going: arXiv preprint. The arXiv identifier is stored in
the eprint/archiveprefix fields; the note field holds only the arXiv comment
text (page, figure and table counts). The added-at, biburl, interhash,
intrahash and timestamp fields are BibSonomy export bookkeeping.}
@misc{berry2020going,
  author        = {Berry, George and Sirianni, Antonio and Weber, Ingmar and An, Jisun and Macy, Michael},
  title         = {Going beyond accuracy: estimating homophily in social networks using predictions},
  year          = {2020},
  eprint        = {2001.11171},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/2001.11171},
  note          = {19 pages, 4 figures, 2 tables},
  keywords      = {homophily uncertainty},
  abstract      = {In online social networks, it is common to use predictions of node categories
to estimate measures of homophily and other relational properties. However,
online social network data often lacks basic demographic information about the
nodes. Researchers must rely on predicted node attributes to estimate measures
of homophily, but little is known about the validity of these measures. We show
that estimating homophily in a network can be viewed as a dyadic prediction
problem, and that homophily estimates are unbiased when dyad-level residuals
sum to zero in the network. Node-level prediction models, such as the use of
names to classify ethnicity or gender, do not generally have this property and
can introduce large biases into homophily estimates. Bias occurs due to error
autocorrelation along dyads. Importantly, node-level classification performance
is not a reliable indicator of estimation accuracy for homophily. We compare
estimation strategies that make predictions at the node and dyad levels,
evaluating performance in different settings. We propose a novel "ego-alter"
modeling approach that outperforms standard node and dyad classification
strategies. While this paper focuses on homophily, results generalize to other
relational measures which aggregate predictions along the dyads in a network.
We conclude with suggestions for research designs to study homophily in online
networks. Code for this paper is available at
https://github.com/georgeberry/autocorr.},
  added-at      = {2020-01-31T10:27:11.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2eb9ba12b192ca61bc8bbe3764b310cf0/mstrohm},
  interhash     = {5af436288197a771e1c418438b85cd1f},
  intrahash     = {eb9ba12b192ca61bc8bbe3764b310cf0},
  timestamp     = {2020-01-31T10:27:11.000+0100},
}