Estimating the genomic location and length of identical-by-descent (IBD) segments among individuals is a crucial step in many genetic analyses. However, the exponential growth in the size of biobank and direct-to-consumer (DTC) genetic data sets makes accurate IBD inference a significant computational challenge. Here we present the templated positional Burrows-Wheeler transform (TPBWT) to make fast IBD estimates robust to haplotype and phasing errors. Using haplotype data simulated over pedigrees with realistic genotyping and phasing errors we show that the TPBWT outperforms other state-of-the-art IBD inference algorithms in terms of speed and accuracy. For each phase-aware method, we explore the false positive and false negative rates of inferring IBD by segment length and characterize the types of error commonly found. Additionally we compare the performance of the TPBWT against a widely used phase-free IBD inference approach that is robust to phasing errors. We introduce both in-sample and out-of-sample TPBWT-based IBD inference algorithms and demonstrate their computational efficiency on massive-scale datasets with millions of samples. Furthermore we describe a binary file format for TPBWT-compressed haplotypes that results in fast and efficient out-of-sample IBD computes against very large cohort panels. Finally, we demonstrate the utility of the TPBWT in a brief empirical analysis exploring geographic patterns of haplotype sharing within Mexico. Hierarchical clustering of IBD shared across regions within Mexico reveals the geographic structure of Mexico’s rich genetic diversity. Our software implementation of the TPBWT is freely available in the code repository https://github.com/23andMe/phasedibd. Competing Interest Statement: W.A.F., K.F.M., S.S.S., E.M.J., K.B., and A.A. are employed by 23andMe, Inc.
%0 Journal Article
%1 2020robust
%A Freyman, William A.
%A McManus, Kimberly F.
%A Shringarpure, Suyash S.
%A Jewett, Ethan M.
%A Bryc, Katarzyna
%A the 23andMe Research Team
%A Auton, Adam
%D 2020
%I Cold Spring Harbor Laboratory
%J bioRxiv
%K IBD PBWT methods
%R 10.1101/2020.09.14.296939
%T Fast and robust identity-by-descent inference with the templated positional Burrows-Wheeler transform
%U https://www.biorxiv.org/content/early/2020/09/15/2020.09.14.296939
%X Estimating the genomic location and length of identical-by-descent (IBD) segments among individuals is a crucial step in many genetic analyses. However, the exponential growth in the size of biobank and direct-to-consumer (DTC) genetic data sets makes accurate IBD inference a significant computational challenge. Here we present the templated positional Burrows-Wheeler transform (TPBWT) to make fast IBD estimates robust to haplotype and phasing errors. Using haplotype data simulated over pedigrees with realistic genotyping and phasing errors we show that the TPBWT outperforms other state-of-the-art IBD inference algorithms in terms of speed and accuracy. For each phase-aware method, we explore the false positive and false negative rates of inferring IBD by segment length and characterize the types of error commonly found. Additionally we compare the performance of the TPBWT against a widely used phase-free IBD inference approach that is robust to phasing errors. We introduce both in-sample and out-of-sample TPBWT-based IBD inference algorithms and demonstrate their computational efficiency on massive-scale datasets with millions of samples. Furthermore we describe a binary file format for TPBWT-compressed haplotypes that results in fast and efficient out-of-sample IBD computes against very large cohort panels. Finally, we demonstrate the utility of the TPBWT in a brief empirical analysis exploring geographic patterns of haplotype sharing within Mexico. Hierarchical clustering of IBD shared across regions within Mexico reveals the geographic structure of Mexico’s rich genetic diversity. Our software implementation of the TPBWT is freely available in the code repository https://github.com/23andMe/phasedibd. Competing Interest Statement: W.A.F., K.F.M., S.S.S., E.M.J., K.B., and A.A. are employed by 23andMe, Inc.
% bioRxiv preprint record. Names in the author field are separated by a single " and "; the consortium author is braced so it is parsed as one indivisible name. Proper nouns in the title are braced to survive sentence-casing styles.
@article{2020robust,
  abstract     = {Estimating the genomic location and length of identical-by-descent (IBD) segments among individuals is a crucial step in many genetic analyses. However, the exponential growth in the size of biobank and direct-to-consumer (DTC) genetic data sets makes accurate IBD inference a significant computational challenge. Here we present the templated positional Burrows-Wheeler transform (TPBWT) to make fast IBD estimates robust to haplotype and phasing errors. Using haplotype data simulated over pedigrees with realistic genotyping and phasing errors we show that the TPBWT outperforms other state-of-the-art IBD inference algorithms in terms of speed and accuracy. For each phase-aware method, we explore the false positive and false negative rates of inferring IBD by segment length and characterize the types of error commonly found. Additionally we compare the performance of the TPBWT against a widely used phase-free IBD inference approach that is robust to phasing errors. We introduce both in-sample and out-of-sample TPBWT-based IBD inference algorithms and demonstrate their computational efficiency on massive-scale datasets with millions of samples. Furthermore we describe a binary file format for TPBWT-compressed haplotypes that results in fast and efficient out-of-sample IBD computes against very large cohort panels. Finally, we demonstrate the utility of the TPBWT in a brief empirical analysis exploring geographic patterns of haplotype sharing within Mexico. Hierarchical clustering of IBD shared across regions within Mexico reveals the geographic structure of Mexico{\textquoteright}s rich genetic diversity. Our software implementation of the TPBWT is freely available in the code repository https://github.com/23andMe/phasedibd. Competing Interest Statement: W.A.F., K.F.M., S.S.S., E.M.J., K.B., and A.A. are employed by 23andMe, Inc.},
  added-at     = {2020-10-04T06:01:14.000+0200},
  author       = {Freyman, William A. and McManus, Kimberly F. and Shringarpure, Suyash S. and Jewett, Ethan M. and Bryc, Katarzyna and {the 23andMe Research Team} and Auton, Adam},
  biburl       = {https://www.bibsonomy.org/bibtex/2bec4fa75b6bdbeebc77e2289954144f3/peter.ralph},
  doi          = {10.1101/2020.09.14.296939},
  elocation-id = {2020.09.14.296939},
  eprint       = {https://www.biorxiv.org/content/early/2020/09/15/2020.09.14.296939.full.pdf},
  interhash    = {52424d461c27c7c99f57a4cb401857cf},
  intrahash    = {bec4fa75b6bdbeebc77e2289954144f3},
  journal      = {bioRxiv},
  keywords     = {IBD PBWT methods},
  publisher    = {Cold Spring Harbor Laboratory},
  timestamp    = {2020-10-04T06:01:14.000+0200},
  title        = {Fast and robust identity-by-descent inference with the templated positional {Burrows-Wheeler} transform},
  url          = {https://www.biorxiv.org/content/early/2020/09/15/2020.09.14.296939},
  year         = {2020},
}