@comment{
  Genome-assembly references (twelve @article entries), originally exported
  from a Papers library via BibSonomy (see the uri / biburl fields).

  Conventions used below:
    - citation keys are kept exactly as exported (Name:YYYYaa);
    - authors are stored as "Last, First" so name parsing is unambiguous;
    - proper nouns / acronyms in titles are brace-protected against
      sentence-casing styles;
    - LaTeX specials inside abstracts are escaped, because an unescaped
      percent sign would otherwise start a comment when the text is typeset;
    - the local PDF location lives in the non-printing "file" field, not "url".

  NOTE(review): the keywords fields are carried over from the export with
  only trailing commas removed. The terms look like MeSH headings that were
  split and re-joined incorrectly -- TODO regenerate from the PubMed records.
}

@string{genomeres      = {Genome Research}}
@string{bioinformatics = {Bioinformatics}}

@article{Mullikin:2003aa,
  author      = {Mullikin, James C. and Ning, Zemin},
  title       = {The {Phusion} assembler},
  journal     = genomeres,
  year        = 2003,
  month       = jan,
  volume      = 13,
  number      = 1,
  pages       = {81--90},
  doi         = {10.1101/gr.731003},
  pmid        = {12529309},
  language    = {English},
  affiliation = {Informatics Department, The Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SA, UK. jcm@sanger.ac.uk},
  abstract    = {The Phusion assembler has assembled the mouse genome from the whole-genome shotgun (WGS) dataset collected by the Mouse Genome Sequencing Consortium, at $\sim$7.5x sequence coverage, producing a high-quality draft assembly 2.6 gigabases in size, of which 90\% of these bases are in 479 scaffolds. For the mouse genome, which is a large and repeat-rich genome, the input dataset was designed to include a high proportion of paired end sequences of various size selected inserts, from 2--200 kbp lengths, into various host vector templates. Phusion uses sequence data, called reads, and information about reads that share common templates, called read pairs, to drive the assembly of this large genome to highly accurate results. The preassembly stage, which clusters the reads into sensible groups, is a key element of the entire assembler, because it permits a simple approach to parallelization of the assembly stage, as each cluster can be treated independent of the others. In addition to the application of Phusion to the mouse genome, we will also present results from the WGS assembly of Caenorhabditis briggsae sequenced to about 11x coverage. The C. briggsae assembly was accessioned through EMBL, http://www.ebi.ac.uk/services/index.html, using the series CAAC01000001-CAAC01000578, however, the Phusion mouse assembly described here was not accessioned. The mouse data was generated by the Mouse Genome Sequencing Consortium. The C. briggsae sequence was generated at The Wellcome Trust Sanger Institute and the Genome Sequencing Center, Washington University School of Medicine.},
  keywords    = {elegans, Animals, Caenorhabditis Genome, Software, Mapping, Computational Mice Contig Biology},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p16},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2003/Mullikin/Genome%20Res%202003%20Mullikin.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/27d0e6e0eb472378e68c35c905e7afebb/dzerbino},
}

@article{Batzoglou:2002aa,
  author      = {Batzoglou, Serafim and Jaffe, David B. and Stanley, Ken and Butler, Jonathan and Gnerre, Sante and Mauceli, Evan and Berger, Bonnie and Mesirov, Jill P. and Lander, Eric S.},
  title       = {{ARACHNE}: a whole-genome shotgun assembler},
  journal     = genomeres,
  year        = 2002,
  month       = jan,
  volume      = 12,
  number      = 1,
  pages       = {177--189},
  doi         = {10.1101/gr.208902},
  pmid        = {11779843},
  language    = {English},
  affiliation = {Laboratory for Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts 02139, USA.},
  abstract    = {We describe a new computer system, called ARACHNE, for assembling genome sequence using paired-end whole-genome shotgun reads. ARACHNE has several key features, including an efficient and sensitive procedure for finding read overlaps, a procedure for scoring overlaps that achieves high accuracy by correcting errors before assembly, read merger based on forward-reverse links, and detection of repeat contigs by forward-reverse link inconsistency. To test ARACHNE, we created simulated reads providing approximately 10-fold coverage of the genomes of H. influenzae, S. cerevisiae, and D. melanogaster, as well as human chromosomes 21 and 22. The assemblies of these simulated reads yielded nearly complete coverage of the respective genomes, with a small number of contigs joined into a smaller number of supercontigs (or scaffolds). For example, analysis of the D. melanogaster genome yielded approximately 98\% coverage with an N50 contig length of 324 kb and an N50 supercontig length of 5143 kb. The assembly accuracy was high, although not perfect: small errors occurred at a frequency of roughly 1 per 1 Mb (typically, deletion of approximately 1 kb in size), with a very small number of other misassemblies. The assembly was rapid: the Drosophila assembly required only 21 hours on a single 667 MHz processor and used 8.4 Gb of memory.},
  keywords    = {Fungal, Humans Human, melanogaster, influenzae, Consensus Haemophilus Algorithms, Alignment, Contig Animals, Genome, cerevisiae, Software, Saccharomyces Sequence, Mapping, Sequence Bacterial, Drosophila},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p14},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2002/Batzoglou/Genome%20Res%202002%20Batzoglou.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/217ec918bb871c3d8239479bd26364a71/dzerbino},
}

@article{Sommer:2007aa,
  author      = {Sommer, Daniel D. and Delcher, Arthur L. and Salzberg, Steven L. and Pop, Mihai},
  title       = {{Minimus}: a fast, lightweight genome assembler},
  journal     = {BMC Bioinformatics},
  year        = 2007,
  month       = feb,
  volume      = 8,
  pages       = {64},
  doi         = {10.1186/1471-2105-8-64},
  pmid        = {17324286},
  pii         = {1471-2105-8-64},
  language    = {English},
  affiliation = {Center for Bioinformatics and Computational Biology, University of Maryland, College Park, MD 20742, USA. dsommer@umiacs.umd.edu},
  abstract    = {BACKGROUND: Genome assemblers have grown very large and complex in response to the need for algorithms to handle the challenges of large whole-genome sequencing projects. Many of the most common uses of assemblers, however, are best served by a simpler type of assembler that requires fewer software components, uses less memory, and is far easier to install and run. RESULTS: We have developed the Minimus assembler to address these issues, and tested it on a range of assembly problems. We show that Minimus performs well on several small assembly tasks, including the assembly of viral genomes, individual genes, and BAC clones. In addition, we evaluate Minimus' performance in assembling bacterial genomes in order to assess its suitability as a component of a larger assembly pipeline. We show that, unlike other software currently used for these tasks, Minimus produces significantly fewer assembly errors, at the cost of generating a more fragmented assembly. CONCLUSION: We find that for small genomes and other small assembly tasks, Minimus is faster and far more flexible than existing tools. Due to its small size and modular design Minimus is perfectly suited to be a component of complex assembly pipelines. Minimus is released as an open-source software project and the code is available as part of the AMOS project at Sourceforge.},
  keywords    = {Software Base Chromosome User-Computer Algorithms, Alignment, Analysis, Interface Software, Mapping, Sequence, Sequence Design, Data, Molecular DNA},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p22},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2007/Sommer/BMC%20Bioinformatics%202007%20Sommer.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2150cd10c40aace2c238aaa20c8480e08/dzerbino},
}

@article{Huang:2003aa,
  author      = {Huang, Xiaoqiu and Wang, Jianmin and Aluru, Srinivas and Yang, Shiaw-Pyng and Hillier, LaDeana},
  title       = {{PCAP}: a whole-genome assembly program},
  journal     = genomeres,
  year        = 2003,
  month       = sep,
  volume      = 13,
  number      = 9,
  pages       = {2164--2170},
  doi         = {10.1101/gr.1390403},
  pmid        = {12952883},
  pii         = {13/9/2164},
  language    = {English},
  affiliation = {Department of Computer Science, Iowa State University, Ames, Iowa 50011-1040, USA. xqhuang@cs.iastate.edu},
  abstract    = {We describe a whole-genome assembly program named PCAP for processing tens of millions of reads. The PCAP program has several features to address efficiency and accuracy issues in assembly. Multiple processors are used to perform most time-consuming computations in assembly. A more sensitive method is used to avoid missing overlaps caused by sequencing errors. Repetitive regions of reads are detected on the basis of many overlaps with other reads, instead of many shorter word matches with other reads. Contaminated end regions of reads are identified and removed. Generation of a consensus sequence for a contig is based on an alignment of reads in the contig, in which both base quality values and coverage information are used to determine every consensus base. The PCAP program was tested on a mouse whole-genome data set of 30 million reads and a human Chromosome 20 data set of 1.7 million reads. The program is freely available for academic use.},
  keywords    = {Animals, Genome, Software, Humans Mapping, Computational Sequence Alignment, Algorithms, Mice, Contig Biology},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p27},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2003/Huang/Genome%20Res%202003%20Huang.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2e2924e542de58d497e99b128d8659faa/dzerbino},
}

@article{Pevzner:2004aa,
  author      = {Pevzner, Pavel A. and Tang, Haixu and Tesler, Glenn},
  title       = {De novo repeat classification and fragment assembly},
  journal     = genomeres,
  year        = 2004,
  month       = sep,
  volume      = 14,
  number      = 9,
  pages       = {1786--1796},
  doi         = {10.1101/gr.2395204},
  pmid        = {15342561},
  pii         = {14/9/1786},
  language    = {English},
  affiliation = {Department of Computer Science and Engineering, University of California, San Diego, La Jolla, California 92093, USA.},
  abstract    = {Repetitive sequences make up a significant fraction of almost any genome, and an important and still open question in bioinformatics is how to represent all repeats in DNA sequences. We propose a new approach to repeat classification that represents all repeats in a genome as a mosaic of sub-repeats. Our key algorithmic idea also leads to new approaches to multiple alignment and fragment assembly. In particular, we show that our FragmentGluer assembler improves on Phrap and ARACHNE in assembly of BACs and bacterial genomes.},
  keywords    = {Humans, Human, Multigene Linkage (Genetics), Algorithms, Alignment, Cluster Sequences, Nucleic Analysis, Biology, Contig Acid, Genome, Mapping, Computational Family Artificial, Chromosomes, Sequence Repetitive Bacterial},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p26},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2004/Pevzner/Genome%20Res%202004%20Pevzner.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/251d90db8174d7d1da830b654951e5ea0/dzerbino},
}

@article{Chaisson:2004aa,
  author      = {Chaisson, Mark and Pevzner, Pavel and Tang, Haixu},
  title       = {Fragment assembly with short reads},
  journal     = bioinformatics,
  year        = 2004,
  month       = sep,
  volume      = 20,
  number      = 13,
  pages       = {2067--2074},
  doi         = {10.1093/bioinformatics/bth205},
  pmid        = {15059830},
  pii         = {bth205},
  language    = {English},
  affiliation = {Bioinformatics Program, University of California San Diego, La Jolla, CA 92093, USA. mchaisso@bioinf.ucsd.edu},
  abstract    = {MOTIVATION: Current DNA sequencing technology produces reads of about 500--750 bp, with typical coverage under 10x. New sequencing technologies are emerging that produce shorter reads (length 80--200 bp) but allow one to generate significantly higher coverage (30x and higher) at low cost. Modern assembly programs and error correction routines have been tuned to work well with current read technology but were not designed for assembly of short reads. RESULTS: We analyze the limitations of assembling reads generated by these new technologies and present a routine for base-calling in reads prior to their assembly. We demonstrate that while it is feasible to assemble such short reads, the resulting contigs will require significant (if not prohibitive) finishing efforts. AVAILABILITY: Available from the web at http://www.cse.ucsd.edu/groups/bioinformatics/software.html},
  keywords    = {Base Feasibility Gene Profiling, Studies, Algorithms, Alignment, Analysis, Contig Mapping, Expression Sequence Data, Molecular DNA},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p25},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2004/Chaisson/Bioinformatics%202004%20Chaisson.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2187f0308ae9e5fa2f47476b9ab180a20/dzerbino},
}

@article{Warren:2007aa,
  author      = {Warren, Ren{\'e} L. and Sutton, Granger G. and Jones, Steven J. M. and Holt, Robert A.},
  title       = {Assembling millions of short {DNA} sequences using {SSAKE}},
  journal     = bioinformatics,
  year        = 2007,
  month       = feb,
  volume      = 23,
  number      = 4,
  pages       = {500--501},
  doi         = {10.1093/bioinformatics/btl629},
  pmid        = {17158514},
  pii         = {btl629},
  language    = {English},
  affiliation = {British Columbia Cancer Agency, Genome Sciences Centre, 675 West 10th Avenue, Vancouver, BC V5Z 1L3, Canada. rwarren@bcgsc.ca},
  abstract    = {Novel DNA sequencing technologies with the potential for up to three orders magnitude more sequence throughput than conventional Sanger sequencing are emerging. The instrument now available from Solexa Ltd, produces millions of short DNA sequences of 25 nt each. Due to ubiquitous repeats in large genomes and the inability of short sequences to uniquely and unambiguously characterize them, the short read length limits applicability for de novo sequencing. However, given the sequencing depth and the throughput of this instrument, stringent assembly of highly identical sequences can be achieved. We describe SSAKE, a tool for aggressively assembling millions of short nucleotide sequences by progressively searching through a prefix tree for the longest possible overlap between any two sequences. SSAKE is designed to help leverage the information from short sequence reads by stringently assembling them into contiguous sequences that can be used to characterize novel sequencing targets. Availability: http://www.bcgsc.ca/bioinfo/software/ssake.},
  keywords    = {Base Chromosome Algorithms, Analysis, Contig Software, Mapping, Sequence, Mapping Sequence Data, Molecular DNA},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p21},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2007/Warren/Bioinformatics%202007%20Warren.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/292574a7dfe4924f76351f0ac33eae32d/dzerbino},
}

@article{Myers:2000aa,
  author      = {Myers, E. W. and Sutton, G. G. and Delcher, A. L. and Dew, I. M. and Fasulo, D. P. and Flanigan, M. J. and Kravitz, S. A. and Mobarry, C. M. and Reinert, K. H. and Remington, K. A. and Anson, E. L. and Bolanos, R. A. and Chou, H. H. and Jordan, C. M. and Halpern, A. L. and Lonardi, S. and Beasley, E. M. and Brandon, R. C. and Chen, L. and Dunn, P. J. and Lai, Z. and Liang, Y. and Nusskern, D. R. and Zhan, M. and Zhang, Q. and Zheng, X. and Rubin, G. M. and Adams, M. D. and Venter, J. C.},
  title       = {A whole-genome assembly of {Drosophila}},
  journal     = {Science},
  year        = 2000,
  month       = mar,
  volume      = 287,
  number      = 5461,
  pages       = {2196--2204},
  doi         = {10.1126/science.287.5461.2196},
  pmid        = {10731133},
  pii         = {8395},
  language    = {English},
  affiliation = {Celera Genomics, Inc., 45 West Gude Drive, Rockville, MD 20850, USA. Gene.Myers@celera.com},
  abstract    = {We report on the quality of a whole-genome assembly of Drosophila melanogaster and the nature of the computer algorithms that accomplished it. Three independent external data sources essentially agree with and support the assembly's sequence and ordering of contigs across the euchromatic portion of the genome. In addition, there are isolated contigs that we believe represent nonrepetitive pockets within the heterochromatin of the centromeres. Comparison with a previously sequenced 2.9-megabase region indicates that sequencing accuracy within nonrepetitive segments is greater than 99.99\% without manual curation. As such, this initial reconstruction of the Drosophila sequence should be of substantial value to the scientific community.},
  keywords    = {Genes, Tagged melanogaster, Chromosome Algorithms, Contig Analysis, Nucleic Sites, Computational Mapping, Sequence Repetitive Molecular Chromatin, Heterochromatin, Insect, Euchromatin, Sequences, Animals, Physical Acid, Genome, Drosophila Data, DNA, Biology},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p20},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2000/Myers/Science%202000%20Myers.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/211f0d08db68b4f73ce33ffab95ffef98/dzerbino},
}

@article{Jaffe:2003aa,
  author      = {Jaffe, David B. and Butler, Jonathan and Gnerre, Sante and Mauceli, Evan and Lindblad-Toh, Kerstin and Mesirov, Jill P. and Zody, Michael C. and Lander, Eric S.},
  title       = {Whole-genome sequence assembly for mammalian genomes: {Arachne} 2},
  journal     = genomeres,
  year        = 2003,
  month       = jan,
  volume      = 13,
  number      = 1,
  pages       = {91--96},
  doi         = {10.1101/gr.828403},
  pmid        = {12529310},
  language    = {English},
  affiliation = {Whitehead Institute/MIT Center for Genome Research, Cambridge, Massachusetts 02141, USA. jaffe@genome.wi.mit.edu},
  abstract    = {We previously described the whole-genome assembly program Arachne, presenting assemblies of simulated data for small to mid-sized genomes. Here we describe algorithmic adaptations to the program, allowing for assembly of mammalian-size genomes, and also improving the assembly of smaller genomes. Three principal changes were simultaneously made and applied to the assembly of the mouse genome, during a six-month period of development: (1) Supercontigs (scaffolds) were iteratively broken and rejoined using several criteria, yielding a 64-fold increase in length (N50), and apparent elimination of all global misjoins; (2) gaps between contigs in supercontigs were filled (partially or completely) by insertion of reads, as suggested by pairing within the supercontig, increasing the N50 contig length by 50\%; (3) memory usage was reduced fourfold. The outcome of this mouse assembly and its analysis are described in (Mouse Genome Sequencing Consortium 2002).},
  keywords    = {Animals, Genome, Software, Humans Mapping, Computational Mice, Contig Biology},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p12},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2003/Jaffe/Genome%20Res%202003%20Jaffe.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2c9787f374c7933c38f7ab5214fb93862/dzerbino},
}

@article{Havlak:2004aa,
  author      = {Havlak, Paul and Chen, Rui and Durbin, K. James and Egan, Amy and Ren, Yanru and Song, Xing-Zhi and Weinstock, George M. and Gibbs, Richard A.},
  title       = {The {Atlas} genome assembly system},
  journal     = genomeres,
  year        = 2004,
  month       = apr,
  volume      = 14,
  number      = 4,
  pages       = {721--732},
  doi         = {10.1101/gr.2264004},
  pmid        = {15060016},
  pii         = {14/4/721},
  language    = {English},
  affiliation = {Human Genome Sequencing Center, Baylor College of Medicine, Houston, Texas 77030, USA.},
  abstract    = {Atlas is a suite of programs developed for assembly of genomes by a ``combined approach'' that uses DNA sequence reads from both BACs and whole-genome shotgun (WGS) libraries. The BAC clones afford advantages of localized assembly with reduced computational load, and provide a robust method for dealing with repeated sequences. Inclusion of WGS sequences facilitates use of different clone insert sizes and reduces data production costs. A core function of Atlas software is recruitment of WGS sequences into appropriate BACs based on sequence overlaps. Because construction of consensus sequences is from local assembly of these reads, only small ($<$0.1\%) units of the genome are assembled at a time. Once assembled, each BAC is used to derive a genomic layout. This ``sequence-based'' growth of the genome map has greater precision than with non-sequence-based methods. Use of BACs allows correction of artifacts due to repeats at each stage of the process. This is aided by ancillary data such as BAC fingerprint, other genomic maps, and syntenic relations with other genomes. Atlas was used to assemble a draft DNA sequence of the rat genome; its major components including overlapper and split-scaffold are also being used in pure WGS projects.},
  keywords    = {Animals, Genome, Software, Mapping, Chromosomes, Artificial, Rats, Bacterial Contig},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p23},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2004/Havlak/Genome%20Res%202004%20Havlak.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2a675b55e8044d62ac5f2515e3b8961be/dzerbino},
}

@article{Pevzner:2001aa,
  author      = {Pevzner, P. A. and Tang, H. and Waterman, M. S.},
  title       = {An {Eulerian} path approach to {DNA} fragment assembly},
  journal     = {Proceedings of the National Academy of Sciences of the United States of America},
  year        = 2001,
  month       = aug,
  volume      = 98,
  number      = 17,
  pages       = {9748--9753},
  doi         = {10.1073/pnas.171285098},
  pmid        = {11504945},
  pii         = {98/17/9748},
  language    = {English},
  affiliation = {Department of Computer Science and Engineering, University of California, San Diego, La Jolla, USA.},
  abstract    = {For the last 20 years, fragment assembly in DNA sequencing followed the ``overlap-layout-consensus'' paradigm that is used in all currently available assembly tools. Although this approach proved useful in assembling clones, it faces difficulties in genomic shotgun assembly. We abandon the classical ``overlap-layout-consensus'' approach in favor of a new EULER algorithm that, for the first time, resolves the 20-year-old ``repeat problem'' in fragment assembly. Our main result is the reduction of the fragment assembly to a variation of the classical Eulerian path problem that allows one to generate accurate solutions of large-scale sequencing problems. EULER, in contrast to the Celera assembler, does not mask such repeats but uses them instead as a powerful fragment assembly tool.},
  keywords    = {meningitidis, lactis Neisseria Campylobacter Theoretical, Algorithms, Alignment, Analysis, Contig jejuni, Genome, Models, Software, Mapping, Lactococcus Sequence Bacterial, DNA},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p15},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2001/Pevzner/Proc%20Natl%20Acad%20Sci%20USA%202001%20Pevzner.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/285853a4fe7db3494508d6631d10f55ca/dzerbino},
}

@article{Pop:2004aa,
  author      = {Pop, Mihai and Kosack, Daniel S. and Salzberg, Steven L.},
  title       = {Hierarchical scaffolding with {Bambus}},
  journal     = genomeres,
  year        = 2004,
  month       = jan,
  volume      = 14,
  number      = 1,
  pages       = {149--159},
  doi         = {10.1101/gr.1536204},
  pmid        = {14707177},
  pii         = {14/1/149},
  language    = {English},
  affiliation = {The Institute for Genomic Research (TIGR), Rockville, Maryland 20850, USA. mpop@tigr.org},
  abstract    = {The output of a genome assembler generally comprises a collection of contiguous DNA sequences (contigs) whose relative placement along the genome is not defined. A procedure called scaffolding is commonly used to order and orient these contigs using paired read information. This ordering of contigs is an essential step when finishing and analyzing the data from a whole-genome shotgun project. Most recent assemblers include a scaffolding module; however, users have little control over the scaffolding algorithm or the information produced. We thus developed a general-purpose scaffolder, called Bambus, which affords users significant flexibility in controlling the scaffolding parameters. Bambus was used recently to scaffold the low-coverage draft dog genome data. Most significantly, Bambus enables the use of linking data other than that inferred from mate-pair information. For example, the sequence of a completed genome can be used to guide the scaffolding of a related organism. We present several applications of Bambus: support for finishing, comparative genomics, analysis of the haplotype structure of genomes, and scaffolding of a mammalian genome at low coverage. Bambus is available as an open-source package from our Web site.},
  keywords    = {anthracis Software Haplotypes, Databases, Benchmarking, Wolbachia, Dogs, Shewanella, Algorithms, Staphylococcus, Biology, Contig Animals, Genome, Software, Mapping, Computational Validation, Genetic, Bacillus Bacterial, Brucella, Genomics},
  uri         = {papers://055852FE-1648-42FE-91D0-8CA474D2B905/Paper/p18},
  file        = {file://localhost/Users/danielzerbino/Documents/Papers/2004/Pop/Genome%20Res%202004%20Pop.pdf},
  biburl      = {http://www.bibsonomy.org/bibtex/2a94162d90a98876b571cb8460decb1c0/dzerbino},
}