| Authors: |
Frances V. Buontempo
and Xue Zhong Wang
and Mulaisho Mwense
and Nigel Horan
and Anita Young
and Daniel Osborn
|
| Tags: |
8.0,
C5.0
EPTree,
QSAR
S-Plus,
SIMCA-P
See5,
algorithms,
decision
ecotoxicity,
genetic
model
partitioning,
programming,
recursive
trees,
|
| Abstract: |
Automatic induction of decision trees and production
rules from data to develop structure-activity models
for toxicity prediction has recently received much
attention, and the majority of methodologies reported
in the literature are based upon recursive partitioning
employing greedy searches to choose the best splitting
attribute and value at each node. These approaches can
be successful; however, the greedy search will
necessarily miss regions of the search space. Recent
literature has demonstrated the applicability of
genetic programming to decision tree induction to
overcome this problem. This paper presents a variant of
this novel approach, using fewer mutation options and a
simpler fitness function, demonstrating its utility in
inducing decision trees for ecotoxicity data, via a
case study of two data sets giving improved accuracy
and generalization ability over a popular decision tree
inducer. |
@article{buontempo:2005:CIM,
  author   = {Buontempo, Frances V. and Wang, Xue Zhong and Mwense, Mulaisho and Horan, Nigel and Young, Anita and Osborn, Daniel},
  title    = {Genetic Programming for the Induction of Decision
              Trees to Model Ecotoxicity Data},
  journal  = {Journal of Chemical Information and Modeling},
  volume   = {45},
  year     = {2005},
  note     = {ASAP article. Web Release Date: May 12, 2005},
  doi      = {10.1021/ci049652n},
  size     = {9 pages},
  keywords = {8.0, C5.0 EPTree, QSAR S-Plus, SIMCA-P See5, algorithms, decision ecotoxicity, genetic model partitioning, programming, recursive trees},
  abstract = {Automatic induction of decision trees and production
              rules from data to develop structure-activity models
              for toxicity prediction has recently received much
              attention, and the majority of methodologies reported
              in the literature are based upon recursive partitioning
              employing greedy searches to choose the best splitting
              attribute and value at each node. These approaches can
              be successful; however, the greedy search will
              necessarily miss regions of the search space. Recent
              literature has demonstrated the applicability of
              genetic programming to decision tree induction to
              overcome this problem. This paper presents a variant of
              this novel approach, using fewer mutation options and a
              simpler fitness function, demonstrating its utility in
              inducing decision trees for ecotoxicity data, via a
              case study of two data sets giving improved accuracy
              and generalization ability over a popular decision tree
              inducer.},
  notes    = {
              http://pubs.acs.org/journals/jcisd8/index.html
              S1549-9596(04)09652-4 ACS Publications Division
              cites EPtree \cite{delisle:2004:CIM} y-scrambling. at
              least 10\% data coverage required of decision trees.
              Tournament size 16. No parsimony fitness pressure.
              Trees regrown. Lots of mutation if pop stagnated.
              Elitist but gives no improvement. -Log(LC50) vibrio
              fischeri. 1093 features. 60 training compounds. 100
              generations. Pop 600. 1 second per
              generation.
              Department of Chemical Engineering and School of Civil
              Engineering, University of Leeds, Leeds LS2 9JT, U.K.,
              AstraZeneca UK Ltd., Brixham Environmental Laboratory,
              Freshwater Quarry, Brixham, Devon TQ5 8BA, U.K., and
              Centre of Ecology and Hydrology, Monks Wood, Huntingdon
              PE28 2LS, U.K.},
}