The Cancer Genome Atlas (TCGA) RNA-Sequencing data are used widely for research. TCGA provides 'Level 3' data, which have been processed using a pipeline specific to that resource. However, we have found using experimentally derived data that this pipeline produces gene-expression values that vary considerably across biological replicates. In addition, some RNA-Sequencing analysis tools require integer-based read counts, which are not provided with the Level 3 data. As an alternative, we have reprocessed the data for 9264 tumor and 741 normal samples across 24 cancer types using the Rsubread package. We have also collated corresponding clinical data for these samples. We provide these data as a community resource. We compared TCGA samples processed using either pipeline and found that the Rsubread pipeline produced fewer zero-expression genes and more consistent expression levels across replicate samples than the TCGA pipeline. Additionally, we used a genomic-signature approach to estimate HER2 (ERBB2) activation status for 662 breast-tumor samples and found that the Rsubread data resulted in stronger predictions of HER2 pathway activity. Finally, we used data from both pipelines to classify 575 lung cancer samples based on histological type. This analysis identified various non-coding RNA that may influence lung-cancer histology. The RNA-Sequencing and clinical data can be downloaded from Gene Expression Omnibus (accession number GSE62944). Scripts and code that were used to process and analyze the data are available from https://github.com/srp33/TCGA\_RNASeq\_Clinical. Contact: stephen\_piccolo@byu.edu or andreab@genetics.utah.edu. Supplementary material is available at Bioinformatics online.
%0 Journal Article
%1 Rahman:2015:Bioinformatics:26209429
%A Rahman, M
%A Jackson, L K
%A Johnson, W E
%A Li, D Y
%A Bild, A H
%A Piccolo, S R
%D 2015
%J Bioinformatics
%K SHOULDREAD cancer-research fpkm fulltext pipeline r-language rna-seq rpkm software statistics tcga tpm
%N 22
%P 3666-3672
%R 10.1093/bioinformatics/btv377
%T Alternative preprocessing of RNA-Sequencing data in The Cancer Genome Atlas leads to improved analysis results
%U https://www.ncbi.nlm.nih.gov/pubmed/26209429
%V 31
%X The Cancer Genome Atlas (TCGA) RNA-Sequencing data are used widely for research. TCGA provides 'Level 3' data, which have been processed using a pipeline specific to that resource. However, we have found using experimentally derived data that this pipeline produces gene-expression values that vary considerably across biological replicates. In addition, some RNA-Sequencing analysis tools require integer-based read counts, which are not provided with the Level 3 data. As an alternative, we have reprocessed the data for 9264 tumor and 741 normal samples across 24 cancer types using the Rsubread package. We have also collated corresponding clinical data for these samples. We provide these data as a community resource. We compared TCGA samples processed using either pipeline and found that the Rsubread pipeline produced fewer zero-expression genes and more consistent expression levels across replicate samples than the TCGA pipeline. Additionally, we used a genomic-signature approach to estimate HER2 (ERBB2) activation status for 662 breast-tumor samples and found that the Rsubread data resulted in stronger predictions of HER2 pathway activity. Finally, we used data from both pipelines to classify 575 lung cancer samples based on histological type. This analysis identified various non-coding RNA that may influence lung-cancer histology. The RNA-Sequencing and clinical data can be downloaded from Gene Expression Omnibus (accession number GSE62944). Scripts and code that were used to process and analyze the data are available from https://github.com/srp33/TCGA\_RNASeq\_Clinical. Contact: stephen\_piccolo@byu.edu or andreab@genetics.utah.edu. Supplementary material is available at Bioinformatics online.
@article{Rahman:2015:Bioinformatics:26209429,
abstract = {The Cancer Genome Atlas (TCGA) RNA-Sequencing data are used widely for research. TCGA provides 'Level 3' data, which have been processed using a pipeline specific to that resource. However, we have found using experimentally derived data that this pipeline produces gene-expression values that vary considerably across biological replicates. In addition, some RNA-Sequencing analysis tools require integer-based read counts, which are not provided with the Level 3 data. As an alternative, we have reprocessed the data for 9264 tumor and 741 normal samples across 24 cancer types using the Rsubread package. We have also collated corresponding clinical data for these samples. We provide these data as a community resource. We compared TCGA samples processed using either pipeline and found that the Rsubread pipeline produced fewer zero-expression genes and more consistent expression levels across replicate samples than the TCGA pipeline. Additionally, we used a genomic-signature approach to estimate HER2 (ERBB2) activation status for 662 breast-tumor samples and found that the Rsubread data resulted in stronger predictions of HER2 pathway activity. Finally, we used data from both pipelines to classify 575 lung cancer samples based on histological type. This analysis identified various non-coding RNA that may influence lung-cancer histology. The RNA-Sequencing and clinical data can be downloaded from Gene Expression Omnibus (accession number GSE62944). Scripts and code that were used to process and analyze the data are available from https://github.com/srp33/TCGA\_RNASeq\_Clinical. Contact: stephen\_piccolo@byu.edu or andreab@genetics.utah.edu. Supplementary material is available at Bioinformatics online.},
added-at = {2017-11-06T14:04:20.000+0100},
author = {Rahman, M and Jackson, L K and Johnson, W E and Li, D Y and Bild, A H and Piccolo, S R},
biburl = {https://www.bibsonomy.org/bibtex/2ccbb8b8e325302af5344a7912f47bf84/marcsaric},
doi = {10.1093/bioinformatics/btv377},
interhash = {da318ea07cb1ecea05bbb3f264f4e958},
intrahash = {ccbb8b8e325302af5344a7912f47bf84},
journal = {Bioinformatics},
keywords = {SHOULDREAD cancer-research fpkm fulltext pipeline r-language rna-seq rpkm software statistics tcga tpm},
month = nov,
number = 22,
pages = {3666--3672},
pmid = {26209429},
timestamp = {2019-07-07T10:08:56.000+0200},
title = {Alternative preprocessing of {RNA-Sequencing} data in {The Cancer Genome Atlas} leads to improved analysis results},
url = {https://www.ncbi.nlm.nih.gov/pubmed/26209429},
volume = 31,
year = 2015
}