The dark data extraction or knowledge base construction (KBC) problem is to populate a relational database with information from unstructured data sources, such as emails, webpages, and PDFs. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help to develop KBC systems. The key idea in DeepDive is to frame traditional extract-transform-load (ETL) style data management problems as a single large statistical inference task that is declaratively defined by the user. DeepDive leverages the effectiveness and efficiency of statistical inference and machine learning for difficult extraction tasks, whereas not requiring users to directly write any probabilistic inference algorithms. Instead, domain experts interact with DeepDive by defining features or rules about the domain. DeepDive has been successfully applied to domains such as pharmacogenomics, paleobiology, and antihuman trafficking enforcement, achieving human-caliber quality at machine-caliber scale. We present the applications, abstractions, and techniques used in DeepDive to accelerate the construction of such dark data extraction systems.
%0 Journal Article
%1 ZhangReEtAl17cacm
%A Zhang, Ce
%A Ré, Christopher
%A Cafarella, Michael
%A De Sa, Christopher
%A Ratner, Alex
%A Shin, Jaeho
%A Wang, Feiran
%A Wu, Sen
%D 2017
%J Communications of the ACM
%K 01801 acm numerical ai knowledge processing data analysis database engineering learn tool
%N 5
%P 93-102
%R 10.1145/3060586
%T DeepDive: Declarative Knowledge Base Construction
%V 60
%X The dark data extraction or knowledge base construction (KBC) problem is to populate a relational database with information from unstructured data sources, such as emails, webpages, and PDFs. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help to develop KBC systems. The key idea in DeepDive is to frame traditional extract-transform-load (ETL) style data management problems as a single large statistical inference task that is declaratively defined by the user. DeepDive leverages the effectiveness and efficiency of statistical inference and machine learning for difficult extraction tasks, whereas not requiring users to directly write any probabilistic inference algorithms. Instead, domain experts interact with DeepDive by defining features or rules about the domain. DeepDive has been successfully applied to domains such as pharmacogenomics, paleobiology, and antihuman trafficking enforcement, achieving human-caliber quality at machine-caliber scale. We present the applications, abstractions, and techniques used in DeepDive to accelerate the construction of such dark data extraction systems.
% DeepDive article, Communications of the ACM 60(5), 2017. The month field uses the bare predefined macro so each style can expand it.
@article{ZhangReEtAl17cacm,
  abstract   = {The dark data extraction or knowledge base construction (KBC) problem is to populate a relational database with information from unstructured data sources, such as emails, webpages, and PDFs. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help to develop KBC systems. The key idea in DeepDive is to frame traditional extract-transform-load (ETL) style data management problems as a single large statistical inference task that is declaratively defined by the user. DeepDive leverages the effectiveness and efficiency of statistical inference and machine learning for difficult extraction tasks, whereas not requiring users to directly write any probabilistic inference algorithms. Instead, domain experts interact with DeepDive by defining features or rules about the domain. DeepDive has been successfully applied to domains such as pharmacogenomics, paleobiology, and antihuman trafficking enforcement, achieving human-caliber quality at machine-caliber scale. We present the applications, abstractions, and techniques used in DeepDive to accelerate the construction of such dark data extraction systems.},
  added-at   = {2017-05-14T09:24:39.000+0200},
  author     = {Zhang, Ce and R{\'e}, Christopher and Cafarella, Michael and De Sa, Christopher and Ratner, Alex and Shin, Jaeho and Wang, Feiran and Wu, Sen},
  biburl     = {https://www.bibsonomy.org/bibtex/29da545a7a254e5f5536475127c8bce27/flint63},
  doi        = {10.1145/3060586},
  file       = {ACM Digital Library:2017/ZhangReEtAl17cacm.pdf:PDF},
  groups     = {public},
  interhash  = {e7dc7d2575a1cbc02af49dcde31d59f8},
  intrahash  = {9da545a7a254e5f5536475127c8bce27},
  issn       = {0001-0782},
  journal    = {Communications of the ACM},
  keywords   = {01801 acm numerical ai knowledge processing data analysis database engineering learn tool},
  month      = may,
  number     = {5},
  pages      = {93--102},
  timestamp  = {2018-04-16T12:34:34.000+0200},
  title      = {{DeepDive}: Declarative Knowledge Base Construction},
  username   = {flint63},
  volume     = {60},
  year       = {2017}
}