RNA-Seq is a promising new technology for accurately measuring gene expression levels. Expression estimation with RNA-Seq requires the mapping of relatively short sequencing reads to a reference genome or transcript set. Because reads are generally shorter than transcripts from which they are derived, a single read may map to multiple genes and isoforms, complicating expression analyses. Previous computational methods either discard reads that map to multiple locations or allocate them to genes heuristically. We present a generative statistical model and associated inference methods that handle read mapping uncertainty in a principled manner. Through simulations parameterized by real RNA-Seq data, we show that our method is more accurate than previous methods. Our improved accuracy is the result of handling read mapping uncertainty with a statistical model and the estimation of gene expression levels as the sum of isoform expression levels. Unlike previous methods, our method is capable of modeling non-uniform read distributions. Simulations with our method indicate that a read length of 20-25 bases is optimal for gene-level expression estimation from mouse and maize RNA-Seq data when sequencing throughput is fixed.
%0 Journal Article
%1 Li:2010:Bioinformatics:20022975
%A Li, B
%A Ruotti, V
%A Stewart, R M
%A Thomson, J A
%A Dewey, C N
%D 2010
%J Bioinformatics
%K SHOULDREAD fulltext rna-seq rsem
%N 4
%P 493-500
%R 10.1093/bioinformatics/btp692
%T RNA-Seq gene expression estimation with read mapping uncertainty
%U https://www.ncbi.nlm.nih.gov/pubmed?cmd=Search&doptcmdl=Citation&defaultField=Title%20Word&term=Li%5Bauthor%5D%20AND%20RNA-Seq%20gene%20expression%20estimation%20with%20read%20mapping%20uncertainty
%V 26
%X RNA-Seq is a promising new technology for accurately measuring gene expression levels. Expression estimation with RNA-Seq requires the mapping of relatively short sequencing reads to a reference genome or transcript set. Because reads are generally shorter than transcripts from which they are derived, a single read may map to multiple genes and isoforms, complicating expression analyses. Previous computational methods either discard reads that map to multiple locations or allocate them to genes heuristically. We present a generative statistical model and associated inference methods that handle read mapping uncertainty in a principled manner. Through simulations parameterized by real RNA-Seq data, we show that our method is more accurate than previous methods. Our improved accuracy is the result of handling read mapping uncertainty with a statistical model and the estimation of gene expression levels as the sum of isoform expression levels. Unlike previous methods, our method is capable of modeling non-uniform read distributions. Simulations with our method indicate that a read length of 20-25 bases is optimal for gene-level expression estimation from mouse and maize RNA-Seq data when sequencing throughput is fixed.
% Journal article record; the biburl field points at its BibSonomy page.
% The citation key is unchanged so existing citations of this entry keep resolving.
@article{Li:2010:Bioinformatics:20022975,
  abstract    = {RNA-Seq is a promising new technology for accurately measuring gene expression levels. Expression estimation with RNA-Seq requires the mapping of relatively short sequencing reads to a reference genome or transcript set. Because reads are generally shorter than transcripts from which they are derived, a single read may map to multiple genes and isoforms, complicating expression analyses. Previous computational methods either discard reads that map to multiple locations or allocate them to genes heuristically. We present a generative statistical model and associated inference methods that handle read mapping uncertainty in a principled manner. Through simulations parameterized by real RNA-Seq data, we show that our method is more accurate than previous methods. Our improved accuracy is the result of handling read mapping uncertainty with a statistical model and the estimation of gene expression levels as the sum of isoform expression levels. Unlike previous methods, our method is capable of modeling non-uniform read distributions. Simulations with our method indicate that a read length of 20-25 bases is optimal for gene-level expression estimation from mouse and maize RNA-Seq data when sequencing throughput is fixed.},
  added-at    = {2017-07-23T22:12:47.000+0200},
  author      = {Li, B and Ruotti, V and Stewart, R M and Thomson, J A and Dewey, C N},
  biburl      = {https://www.bibsonomy.org/bibtex/2b83d933bad6cb63cacff706b54b3f3cb/marcsaric},
  description = {RNA-Seq gene expression estimation with read mapping uncertainty. - PubMed - NCBI},
  doi         = {10.1093/bioinformatics/btp692},
  interhash   = {ddf660ca09a5784ccd6039a1dd3d6473},
  intrahash   = {b83d933bad6cb63cacff706b54b3f3cb},
  journal     = {Bioinformatics},
  keywords    = {SHOULDREAD fulltext rna-seq rsem},
  month       = feb,
  number      = 4,
  pages       = {493--500},
  pmid        = {20022975},
  timestamp   = {2017-07-23T22:12:47.000+0200},
  title       = {{RNA-Seq} gene expression estimation with read mapping uncertainty},
  url         = {https://www.ncbi.nlm.nih.gov/pubmed?cmd=Search&doptcmdl=Citation&defaultField=Title%20Word&term=Li%5Bauthor%5D%20AND%20RNA-Seq%20gene%20expression%20estimation%20with%20read%20mapping%20uncertainty},
  volume      = 26,
  year        = 2010,
}