% Journal article (ENTCS volume 141, issue 4) from the LDTA 2005 workshop proceedings.
% The month uses the standard three-letter macro; the day of publication is kept in
% a separate "day" field, which classic BibTeX styles ignore as an unknown field.
% The DOI is stored bare, without a "doi:" or resolver prefix.
% The keywords were regrouped into phrases from a word-scrambled tag list.
@article{Crepinsek:2006:ENTCS,
  author   = {Crepinsek, Matej and Mernik, Marjan and Bryant, Barrett R. and Javed, Faizan and Sprague, Alan},
  title    = {Inferring Context-Free Grammars for Domain-Specific Languages},
  journal  = {Electronic Notes in Theoretical Computer Science},
  volume   = 141,
  number   = 4,
  pages    = {99--116},
  year     = 2005,
  month    = dec,
  day      = 12,
  note     = {Proceedings of the Fifth Workshop on Language Descriptions, Tools, and Applications (LDTA 2005)},
  issn     = {1571-0661},
  doi      = {10.1016/j.entcs.2005.02.055},
  abstract = {In the area of programming languages, context-free grammars (CFGs) are of special importance since almost all programming languages employ CFG's in their design. Recent approaches to CFG induction are not able to infer context-free grammars for general-purpose programming languages. In this paper it is shown that syntax of a small domain-specific language can be inferred from positive and negative programs provided by domain experts. In our work we are using the genetic programming approach in grammatical inference. Grammar-specific heuristic operators and nonrandom construction of the initial population are proposed to achieve this task. Suitability of the approach is shown by examples where underlying context-free grammars are successfully inferred.},
  keywords = {genetic algorithms, genetic programming, Grammar induction, Grammar inference, Learning from positive and negative examples, Exhaustive search},
  biburl   = {http://www.bibsonomy.org/bibtex/2b359032acefed37074158ec1c5cb61bc/brazovayeye},
}

% Master's thesis. The proper noun in the title is brace-protected so that
% sentence-casing styles do not lowercase it. Keywords are space-separated tags
% and are kept exactly as found.
@mastersthesis{rennie2001naive,
  author   = {Rennie, Jason D. M.},
  title    = {Improving Multi-class Text Classification with Naive {Bayes}},
  school   = {Massachusetts Institute of Technology},
  year     = 2001,
  url      = {http://people.csail.mit.edu/~jrennie/papers/sm-thesis.pdf},
  abstract = {There are numerous text documents available in electronic form. More and more are becoming available every day. Such documents represent a massive amount of information that is easily accessible. Seeking value in this huge collection requires organization; much of the work of organizing documents can be automated through text classification. The accuracy and our understanding of such systems greatly influences their usefulness. In this paper, we seek 1) to advance the understanding of commonly used text classification techniques, and 2) through that understanding, improve the tools that are available for text classification. We begin by clarifying the assumptions made in the derivation of Naive Bayes, noting basic properties and proposing ways for its extension and improvement. Next, we investigate the quality of Naive Bayes parameter estimates and their impact on classification. Our analysis leads to a theorem which gives an explanation for the improvements that can be found in multiclass classification with Naive Bayes using Error-Correcting Output Codes. We use experimental evidence on two commonly-used data sets to exhibit an application of the theorem. Finally, we show fundamental flaws in a commonly-used feature selection algorithm and develop a statistics-based framework for text feature selection. Greater understanding of Naive Bayes and the properties of text allows us to make better use of it in text classification.},
  keywords = {herleitung multinomial estimation bayes exhaustive likelihood thesis prior naive komplett maximum map deduction mle},
  biburl   = {http://www.bibsonomy.org/bibtex/22896eb9538a6ee34f8e6c6757bdcf99e/jil},
}