Historical archival records present many challenges for OCR
systems to correctly encode their content, due to visual complexity,
e.g. mixed printed text and handwritten annotations, paper
degradation and faded ink. This paper addresses the problem of
automatic identification and separation of handwritten and printed
text in historical archival documents, including the creation of an
artificial pixel-level annotated dataset and the presentation of a
new FCN-based model trained on historical data. Initial test
results indicate 18% IoU performance improvement on recognition
of printed pixels and 10% IoU performance improvement on
recognition of handwritten pixels in synthesised data when
compared to the state-of-the-art trained on modern documents.
Furthermore, an extrinsic OCR-based evaluation on the printed
layer extracted from real historical documents shows 26%
performance increase.
%0 Conference Paper
%1 vafaie2022handwritten
%A Vafaie, Mahsa
%A Bruns, Oleksandra
%A Pilz, Nastasja
%A Waitelonis, Jörg
%A Sack, Harald
%D 2022
%B Archiving Conference
%K archive fiziseown handwritten historical ise text_identification
%P 15--20
%R 10.2352/issn.2168-3204.2022.19.1.04
%T Handwritten and Printed Text Identification in Historical Archival Documents
%U https://library.imaging.org/admin/apis/public/api/ist/website/downloadArticle/archiving/19/1/4
%V 19
%X Historical archival records present many challenges for OCR
systems to correctly encode their content, due to visual complexity,
e.g. mixed printed text and handwritten annotations, paper
degradation and faded ink. This paper addresses the problem of
automatic identification and separation of handwritten and printed
text in historical archival documents, including the creation of an
artificial pixel-level annotated dataset and the presentation of a
new FCN-based model trained on historical data. Initial test
results indicate 18% IoU performance improvement on recognition
of printed pixels and 10% IoU performance improvement on
recognition of handwritten pixels in synthesised data when
compared to the state-of-the-art trained on modern documents.
Furthermore, an extrinsic OCR-based evaluation on the printed
layer extracted from real historical documents shows 26%
performance increase.
Conference paper (IS&T Archiving Conference 2022). Venue moved from the
ignored-for-@inproceedings `journal` field into the required `booktitle`;
DOI stored bare (no resolver prefix); author accent written as a BibTeX
special character for classic-BibTeX sorting compatibility.
@inproceedings{vafaie2022handwritten,
  abstract  = {Historical archival records present many challenges for OCR
systems to correctly encode their content, due to visual complexity,
e.g. mixed printed text and handwritten annotations, paper
degradation and faded ink. This paper addresses the problem of
automatic identification and separation of handwritten and printed
text in historical archival documents, including the creation of an
artificial pixel-level annotated dataset and the presentation of a
new FCN-based model trained on historical data. Initial test
results indicate 18% IoU performance improvement on recognition
of printed pixels and 10% IoU performance improvement on
recognition of handwritten pixels in synthesised data when
compared to the state-of-the-art trained on modern documents.
Furthermore, an extrinsic OCR-based evaluation on the printed
layer extracted from real historical documents shows 26%
performance increase.},
  added-at  = {2022-09-08T11:26:59.000+0200},
  author    = {Vafaie, Mahsa and Bruns, Oleksandra and Pilz, Nastasja and Waitelonis, J{\"o}rg and Sack, Harald},
  biburl    = {https://www.bibsonomy.org/bibtex/254a5e95821085e85ed33ccbf6bde0ecf/vivienvetter},
  booktitle = {Archiving Conference},
  doi       = {10.2352/issn.2168-3204.2022.19.1.04},
  interhash = {48544809ccb52d8509a757115c76b4ad},
  intrahash = {54a5e95821085e85ed33ccbf6bde0ecf},
  keywords  = {archive fiziseown handwritten historical ise text_identification},
  language  = {English},
  pages     = {15--20},
  timestamp = {2022-09-08T12:10:02.000+0200},
  title     = {Handwritten and Printed Text Identification in Historical Archival Documents},
  url       = {https://library.imaging.org/admin/apis/public/api/ist/website/downloadArticle/archiving/19/1/4},
  volume    = {19},
  year      = {2022}
}