| Authors: |
Margaret King
|
| Editors: |
Laila Dybkjær
and Holmer Hemsen
and Wolfgang Minker
|
| URL: |
http://dx.doi.org/10.1007/978-1-4020-5817-2_5 |
| Tags: |
interaction
interface
language
paper
processing
springer
test
user
v0805
|
| Abstract: |
This chapter is concerned with a particular perspective on the problem
of evaluation design. User-oriented evaluation takes as primary some
user or set of users who need to accomplish some task, and sets out
to discover through evaluation whether a given software system will
help them to do so effectively, productively, safely, and with a
sense of satisfaction. (Note that, following ISO, user here is used
in a very wide sense and encompasses much more than what has conventionally
been called end-user.) There is a clear tension between taking specific
user needs as primary and seeking common principles for the evaluation
of particular software applications. The chapter suggests that this
tension may be resolved by using an ISO standard for the evaluation
of software as an appropriate level of generalization (ISO 9126).
Quality models reflecting the characteristics of specific software
applications (machine translation, document retrieval, information
extraction systems, etc.) are then built on the skeleton set out
in the ISO standard. Particular user needs are taken into account
by picking out those parts of the appropriate quality model which
reflect the needs, where necessary imposing a relative order of importance
on the parts picked out. Execution of the evaluation then concentrates
on the parts of the quality model chosen as pertinent to the user
and the context of work. The focus of the chapter is on general design
questions rather than on the strengths and weaknesses of specific
metrics. However, there is some discussion of what it means for a
metric to be valid and reliable, and of the difficulty of finding
good metrics for those cases where system performance and human performance
in interaction with the system are inextricably linked. A suggestion
is made that it might be possible to automate an important part of
the process of evaluation design, and an attempt to do this for the
case of machine translation evaluations is briefly sketched. |
% Chapter by Margaret King (pp. 125--161) in the edited volume
% "Evaluation of Text and Speech Systems" (Springer, 2007).
% The editor name uses the {\ae} escape so that 8-bit BibTeX sorts and prints it correctly.
@incollection{King07p125,
  author    = {King, Margaret},
  title     = {General Principles of User-Oriented Evaluation},
  editor    = {Dybkj{\ae}r, Laila and Hemsen, Holmer and Minker, Wolfgang},
  booktitle = {Evaluation of Text and Speech Systems},
  series    = {Text, Speech and Language Technology},
  volume    = {37},
  publisher = {Springer},
  address   = {Dordrecht},
  year      = {2007},
  pages     = {125--161},
  isbn      = {978-1-4020-5815-8},
  doi       = {10.1007/978-1-4020-5817-2_5},
  url       = {http://dx.doi.org/10.1007/978-1-4020-5817-2_5},
  keywords  = {interaction interface language paper processing springer test user v0805},
  abstract  = {This chapter is concerned with a particular perspective on the problem
of evaluation design. User-oriented evaluation takes as primary some
user or set of users who need to accomplish some task, and sets out
to discover through evaluation whether a given software system will
help them to do so effectively, productively, safely, and with a
sense of satisfaction. (Note that, following ISO, user here is used
in a very wide sense and encompasses much more than what has conventionally
been called end-user.) There is a clear tension between taking specific
user needs as primary and seeking common principles for the evaluation
of particular software applications. The chapter suggests that this
tension may be resolved by using an ISO standard for the evaluation
of software as an appropriate level of generalization (ISO 9126).
Quality models reflecting the characteristics of specific software
applications (machine translation, document retrieval, information
extraction systems, etc.) are then built on the skeleton set out
in the ISO standard. Particular user needs are taken into account
by picking out those parts of the appropriate quality model which
reflect the needs, where necessary imposing a relative order of importance
on the parts picked out. Execution of the evaluation then concentrates
on the parts of the quality model chosen as pertinent to the user
and the context of work. The focus of the chapter is on general design
questions rather than on the strengths and weaknesses of specific
metrics. However, there is some discussion of what it means for a
metric to be valid and reliable, and of the difficulty of finding
good metrics for those cases where system performance and human performance
in interaction with the system are inextricably linked. A suggestion
is made that it might be possible to automate an important part of
the process of evaluation design, and an attempt to do this for the
case of machine translation evaluations is briefly sketched.},
  timestamp = {2008.05.01},
  owner     = {flint},
}