Glyph Miner: A System for Efficiently Extracting Glyphs from Early Prints in the Context of OCR.
B. Budig, T. van Dijk, and F. Kirchner. Proceedings of the 16th ACM/IEEE-CS on Joint Conference on Digital Libraries, pages 31--34. ACM, (2016)
Abstract
While off-the-shelf OCR systems work well on many modern documents, the heterogeneity of early prints provides a significant challenge. To achieve good recognition quality, existing software must be "trained" specifically to each particular corpus. This is a tedious process that involves significant user effort. In this paper we demonstrate a system that generically replaces a common part of the training pipeline with a more efficient workflow: Given a set of scanned pages of a historical document, our system uses an efficient user interaction to semi-automatically extract large numbers of occurrences of glyphs indicated by the user. In a preliminary case study, we evaluate the effectiveness of our approach by embedding our system into the workflow at the University Library Würzburg.
%0 Conference Paper
%1 conf/jcdl/BudigDK16
%A Budig, Benedikt
%A van Dijk, Thomas C.
%A Kirchner, Felix
%B Proceedings of the 16th ACM/IEEE-CS on Joint Conference on Digital Libraries
%D 2016
%E Adam, Nabil R.
%E Cassel, Lillian (Boots)
%E Yesha, Yelena
%E Furuta, Richard
%E Weigle, Michele C.
%I ACM
%K myown
%P 31--34
%T Glyph Miner: A System for Efficiently Extracting Glyphs from Early Prints in the Context of OCR
%U https://doi.org/10.1145/2910896.2910915
%X While off-the-shelf OCR systems work well on many modern documents, the heterogeneity of early prints provides a significant challenge. To achieve good recognition quality, existing software must be "trained" specifically to each particular corpus. This is a tedious process that involves significant user effort. In this paper we demonstrate a system that generically replaces a common part of the training pipeline with a more efficient workflow: Given a set of scanned pages of a historical document, our system uses an efficient user interaction to semi-automatically extract large numbers of occurrences of glyphs indicated by the user. In a preliminary case study, we evaluate the effectiveness of our approach by embedding our system into the workflow at the University Library Würzburg.
%@ 978-1-4503-4229-2
% Conference paper entry exported from BibSonomy (see biburl). Proceedings-level
% fields may be inherited via crossref; the parent entry conf/jcdl/2016 is not
% part of this record and is expected to be defined elsewhere.
@inproceedings{conf/jcdl/BudigDK16,
  abstract   = {While off-the-shelf OCR systems work well on many modern documents, the heterogeneity of early prints provides a significant challenge. To achieve good recognition quality, existing software must be ``trained'' specifically to each particular corpus. This is a tedious process that involves significant user effort. In this paper we demonstrate a system that generically replaces a common part of the training pipeline with a more efficient workflow: Given a set of scanned pages of a historical document, our system uses an efficient user interaction to semi-automatically extract large numbers of occurrences of glyphs indicated by the user. In a preliminary case study, we evaluate the effectiveness of our approach by embedding our system into the workflow at the University Library W{\"u}rzburg.},
  added-at   = {2016-09-05T02:41:29.000+0200},
  author     = {Budig, Benedikt and van Dijk, Thomas C. and Kirchner, Felix},
  biburl     = {https://www.bibsonomy.org/bibtex/254d49e6bcc9d837e9010148781fc3620/thomasd},
  booktitle  = {Proceedings of the 16th {ACM/IEEE-CS} on Joint Conference on Digital Libraries},
  crossref   = {conf/jcdl/2016},
  doi        = {10.1145/2910896.2910915},
  editor     = {Adam, Nabil R. and Cassel, Lillian (Boots) and Yesha, Yelena and Furuta, Richard and Weigle, Michele C.},
  ee         = {http://doi.acm.org/10.1145/2910896.2910915},
  interhash  = {83f00d88841b6b9c664cc265cefcffd1},
  intrahash  = {54d49e6bcc9d837e9010148781fc3620},
  isbn       = {978-1-4503-4229-2},
  keywords   = {myown},
  pages      = {31--34},
  publisher  = {ACM},
  series     = {JCDL '16},
  timestamp  = {2019-03-11T10:19:45.000+0100},
  title      = {{Glyph Miner}: A System for Efficiently Extracting Glyphs from Early Prints in the Context of {OCR}},
  url        = {https://doi.org/10.1145/2910896.2910915},
  year       = {2016},
}