After decades of massive digitisation, an unprecedented amount of historical
documents is available in digital format, along with their machine-readable
texts. While this represents a major step forward with respect to preservation
and accessibility, it also opens up new opportunities in terms of content
mining and the next fundamental challenge is to develop appropriate
technologies to efficiently search, retrieve and explore information from this
'big data of the past'. Among semantic indexing opportunities, the recognition
and classification of named entities are in great demand among humanities
scholars. Yet, named entity recognition (NER) systems are heavily challenged
with diverse, historical and noisy inputs. In this survey, we present the array
of challenges posed by historical documents to NER, inventory existing
resources, describe the main approaches deployed so far, and identify key
priorities for future developments.
Description
Named Entity Recognition and Classification on Historical Documents: A Survey - 2109.11406.pdf
%0 Generic
%1 ehrmann2021named
%A Ehrmann, Maud
%A Hamdi, Ahmed
%A Pontes, Elvys Linhares
%A Romanello, Matteo
%A Doucet, Antoine
%D 2021
%K classification dh digital-humanities entity-linking hipe historical ner ner+l nlp text-mining
%T Named Entity Recognition and Classification on Historical Documents: A
Survey
%U http://arxiv.org/abs/2109.11406
%X After decades of massive digitisation, an unprecedented amount of historical
documents is available in digital format, along with their machine-readable
texts. While this represents a major step forward with respect to preservation
and accessibility, it also opens up new opportunities in terms of content
mining and the next fundamental challenge is to develop appropriate
technologies to efficiently search, retrieve and explore information from this
'big data of the past'. Among semantic indexing opportunities, the recognition
and classification of named entities are in great demand among humanities
scholars. Yet, named entity recognition (NER) systems are heavily challenged
with diverse, historical and noisy inputs. In this survey, we present the array
of challenges posed by historical documents to NER, inventory existing
resources, describe the main approaches deployed so far, and identify key
priorities for future developments.
@comment{
  ehrmann2021named: arXiv preprint, exported from BibSonomy.
  The arXiv identifier is stored in eprint/archiveprefix so that styles can
  format and link it; the note field carries only the arXiv page-count comment.
  added-at, biburl, interhash, intrahash, keywords, timestamp and description
  are BibSonomy bookkeeping fields that standard styles ignore.
  NOTE(review): the third author is entered with surname Pontes; the surname
  may be the compound Linhares Pontes. Confirm against the paper.
}
@misc{ehrmann2021named,
  author        = {Ehrmann, Maud and Hamdi, Ahmed and Pontes, Elvys Linhares and Romanello, Matteo and Doucet, Antoine},
  title         = {Named Entity Recognition and Classification on Historical Documents: A Survey},
  year          = {2021},
  eprint        = {2109.11406},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/2109.11406},
  note          = {39 pages},
  abstract      = {After decades of massive digitisation, an unprecedented amount of historical
documents is available in digital format, along with their machine-readable
texts. While this represents a major step forward with respect to preservation
and accessibility, it also opens up new opportunities in terms of content
mining and the next fundamental challenge is to develop appropriate
technologies to efficiently search, retrieve and explore information from this
'big data of the past'. Among semantic indexing opportunities, the recognition
and classification of named entities are in great demand among humanities
scholars. Yet, named entity recognition (NER) systems are heavily challenged
with diverse, historical and noisy inputs. In this survey, we present the array
of challenges posed by historical documents to NER, inventory existing
resources, describe the main approaches deployed so far, and identify key
priorities for future developments.},
  keywords      = {classification dh digital-humanities entity-linking hipe historical ner ner+l nlp text-mining},
  description   = {Named Entity Recognition and Classification on Historical Documents: A Survey - 2109.11406.pdf},
  added-at      = {2022-09-27T14:55:06.000+0200},
  timestamp     = {2022-09-27T14:55:06.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/206d76b05ecda4fbe13990cc4c00e499f/hangdong},
  interhash     = {ca453a2be2f9921ea2951c86f7f57f0f},
  intrahash     = {06d76b05ecda4fbe13990cc4c00e499f},
}