The UK Biobank project is a large prospective cohort study of ~500,000 individuals from across the United Kingdom, aged between 40-69 at recruitment. A rich variety of phenotypic and health-related information is available on each participant, making the resource unprecedented in its size and scope. Here we describe the genome-wide genotype data (~805,000 markers) collected on all individuals in the cohort and its quality control procedures. Genotype data on this scale offers novel opportunities for assessing quality issues, although the wide range of ancestries of the individuals in the cohort also creates particular challenges. We also conducted a set of analyses that reveal properties of the genetic data (such as population structure and relatedness) that can be important for downstream analyses. In addition, we phased and imputed genotypes into the dataset, using computationally efficient methods combined with the Haplotype Reference Consortium (HRC) and UK10K haplotype resource. This increases the number of testable variants by over 100-fold to ~96 million variants. We also imputed classical allelic variation at 11 human leukocyte antigen (HLA) genes, and as a quality control check of this imputation, we replicate signals of known associations between HLA alleles and many common diseases. We describe tools that allow efficient genome-wide association studies (GWAS) of multiple traits and fast phenome-wide association studies (PheWAS), which work together with a new compressed file format that has been used to distribute the dataset. As a further check of the genotyped and imputed datasets, we performed a test-case genome-wide association scan on a well-studied human trait, standing height.
%0 Journal Article
%1 bycroft2017genomewide
%A Bycroft, Clare
%A Freeman, Colin
%A Petkova, Desislava
%A Band, Gavin
%A Elliott, Lloyd T
%A Sharp, Kevin
%A Motyer, Allan
%A Vukcevic, Damjan
%A Delaneau, Olivier
%A O’Connell, Jared
%A Cortes, Adrian
%A Welsh, Samantha
%A McVean, Gil
%A Leslie, Stephen
%A Donnelly, Peter
%A Marchini, Jonathan
%D 2017
%I Cold Spring Harbor Laboratory
%J bioRxiv
%K UK_biobank data_source human_genome
%R 10.1101/166298
%T Genome-wide genetic data on ~500,000 UK Biobank participants
%U https://www.biorxiv.org/content/early/2017/07/20/166298
%X The UK Biobank project is a large prospective cohort study of ~500,000 individuals from across the United Kingdom, aged between 40-69 at recruitment. A rich variety of phenotypic and health-related information is available on each participant, making the resource unprecedented in its size and scope. Here we describe the genome-wide genotype data (~805,000 markers) collected on all individuals in the cohort and its quality control procedures. Genotype data on this scale offers novel opportunities for assessing quality issues, although the wide range of ancestries of the individuals in the cohort also creates particular challenges. We also conducted a set of analyses that reveal properties of the genetic data (such as population structure and relatedness) that can be important for downstream analyses. In addition, we phased and imputed genotypes into the dataset, using computationally efficient methods combined with the Haplotype Reference Consortium (HRC) and UK10K haplotype resource. This increases the number of testable variants by over 100-fold to ~96 million variants. We also imputed classical allelic variation at 11 human leukocyte antigen (HLA) genes, and as a quality control check of this imputation, we replicate signals of known associations between HLA alleles and many common diseases. We describe tools that allow efficient genome-wide association studies (GWAS) of multiple traits and fast phenome-wide association studies (PheWAS), which work together with a new compressed file format that has been used to distribute the dataset. As a further check of the genotyped and imputed datasets, we performed a test-case genome-wide association scan on a well-studied human trait, standing height.
@comment{bioRxiv preprint (publisher: Cold Spring Harbor Laboratory), 2017. The eprint field holds the bare bioRxiv identifier (the DOI suffix); the full-text PDF link is kept in the non-standard pdf field. The fields added-at, biburl, interhash, intrahash and timestamp appear to be BibSonomy export metadata and are ignored by standard styles.}
@article{bycroft2017genomewide,
  author     = {Bycroft, Clare and Freeman, Colin and Petkova, Desislava and Band, Gavin and Elliott, Lloyd T. and Sharp, Kevin and Motyer, Allan and Vukcevic, Damjan and Delaneau, Olivier and O'Connell, Jared and Cortes, Adrian and Welsh, Samantha and McVean, Gil and Leslie, Stephen and Donnelly, Peter and Marchini, Jonathan},
  title      = {Genome-wide genetic data on {$\sim$}500,000 {UK} {Biobank} participants},
  journal    = {bioRxiv},
  publisher  = {Cold Spring Harbor Laboratory},
  year       = {2017},
  doi        = {10.1101/166298},
  eprint     = {166298},
  eprinttype = {bioRxiv},
  url        = {https://www.biorxiv.org/content/early/2017/07/20/166298},
  pdf        = {https://www.biorxiv.org/content/early/2017/07/20/166298.full.pdf},
  abstract   = {The UK Biobank project is a large prospective cohort study of $\sim$500,000 individuals from across the United Kingdom, aged between 40--69 at recruitment. A rich variety of phenotypic and health-related information is available on each participant, making the resource unprecedented in its size and scope. Here we describe the genome-wide genotype data ($\sim$805,000 markers) collected on all individuals in the cohort and its quality control procedures. Genotype data on this scale offers novel opportunities for assessing quality issues, although the wide range of ancestries of the individuals in the cohort also creates particular challenges. We also conducted a set of analyses that reveal properties of the genetic data (such as population structure and relatedness) that can be important for downstream analyses. In addition, we phased and imputed genotypes into the dataset, using computationally efficient methods combined with the Haplotype Reference Consortium (HRC) and UK10K haplotype resource. This increases the number of testable variants by over 100-fold to $\sim$96 million variants. We also imputed classical allelic variation at 11 human leukocyte antigen (HLA) genes, and as a quality control check of this imputation, we replicate signals of known associations between HLA alleles and many common diseases. We describe tools that allow efficient genome-wide association studies (GWAS) of multiple traits and fast phenome-wide association studies (PheWAS), which work together with a new compressed file format that has been used to distribute the dataset. As a further check of the genotyped and imputed datasets, we performed a test-case genome-wide association scan on a well-studied human trait, standing height.},
  keywords   = {UK_biobank data_source human_genome},
  added-at   = {2018-07-11T20:56:10.000+0200},
  timestamp  = {2018-07-11T20:56:10.000+0200},
  biburl     = {https://www.bibsonomy.org/bibtex/2d88d6242b199b6a6762cb906effb3dd6/peter.ralph},
  interhash  = {23f373ce90bb4dea6844b0011640df03},
  intrahash  = {d88d6242b199b6a6762cb906effb3dd6},
}