Common Deep Metric Learning (DML) datasets specify only one notion of
similarity, e.g., two images in the Cars196 dataset are deemed similar if they
show the same car model. We argue that depending on the application, users of
image retrieval systems have different and changing similarity notions that
should be incorporated as easily as possible. Therefore, we present
Language-Guided Zero-Shot Deep Metric Learning (LanZ-DML) as a new DML setting
in which users control the properties that should be important for image
representations without training data by only using natural language. To this
end, we propose InDiReCT (Image representations using Dimensionality Reduction
on CLIP embedded Texts), a model for LanZ-DML on images that exclusively uses a
few text prompts for training. InDiReCT utilizes CLIP as a fixed feature
extractor for images and texts and transfers the variation in text prompt
embeddings to the image embedding space. Extensive experiments on five datasets
and overall thirteen similarity notions show that, despite not seeing any
images during training, InDiReCT performs better than strong baselines and
approaches the performance of fully-supervised models. An analysis reveals that
InDiReCT learns to focus on regions of the image that correlate with the
desired similarity notion, which makes it a fast-to-train and easy-to-use
method to create custom embedding spaces only using natural language.
Description
InDiReCT: Language-Guided Zero-Shot Deep Metric Learning for Images
%0 Generic
%1 kobs2022indirect
%A Kobs, Konstantin
%A Steininger, Michael
%A Hotho, Andreas
%D 2022
%K 2022 CLIP deep from:hotho image language learning metric myown selected shot zero
%T InDiReCT: Language-Guided Zero-Shot Deep Metric Learning for Images
%U http://arxiv.org/abs/2211.12760
%X Common Deep Metric Learning (DML) datasets specify only one notion of
similarity, e.g., two images in the Cars196 dataset are deemed similar if they
show the same car model. We argue that depending on the application, users of
image retrieval systems have different and changing similarity notions that
should be incorporated as easily as possible. Therefore, we present
Language-Guided Zero-Shot Deep Metric Learning (LanZ-DML) as a new DML setting
in which users control the properties that should be important for image
representations without training data by only using natural language. To this
end, we propose InDiReCT (Image representations using Dimensionality Reduction
on CLIP embedded Texts), a model for LanZ-DML on images that exclusively uses a
few text prompts for training. InDiReCT utilizes CLIP as a fixed feature
extractor for images and texts and transfers the variation in text prompt
embeddings to the image embedding space. Extensive experiments on five datasets
and overall thirteen similarity notions show that, despite not seeing any
images during training, InDiReCT performs better than strong baselines and
approaches the performance of fully-supervised models. An analysis reveals that
InDiReCT learns to focus on regions of the image that correlate with the
desired similarity notion, which makes it a fast-to-train and easy-to-use
method to create custom embedding spaces only using natural language.
Review notes (text outside entries is ignored by BibTeX): the original auto-exported
note field fused the arXiv id with the comment ("cite arxiv:2211.12760Comment: ...");
the identifier now lives in eprint/archiveprefix and the note keeps only the comment.
The acronym InDiReCT in the title is braced so sentence-casing styles preserve it.
@misc{kobs2022indirect,
  abstract      = {Common Deep Metric Learning (DML) datasets specify only one notion of
similarity, e.g., two images in the Cars196 dataset are deemed similar if they
show the same car model. We argue that depending on the application, users of
image retrieval systems have different and changing similarity notions that
should be incorporated as easily as possible. Therefore, we present
Language-Guided Zero-Shot Deep Metric Learning (LanZ-DML) as a new DML setting
in which users control the properties that should be important for image
representations without training data by only using natural language. To this
end, we propose InDiReCT (Image representations using Dimensionality Reduction
on CLIP embedded Texts), a model for LanZ-DML on images that exclusively uses a
few text prompts for training. InDiReCT utilizes CLIP as a fixed feature
extractor for images and texts and transfers the variation in text prompt
embeddings to the image embedding space. Extensive experiments on five datasets
and overall thirteen similarity notions show that, despite not seeing any
images during training, InDiReCT performs better than strong baselines and
approaches the performance of fully-supervised models. An analysis reveals that
InDiReCT learns to focus on regions of the image that correlate with the
desired similarity notion, which makes it a fast to train and easy to use
method to create custom embedding spaces only using natural language.},
  added-at      = {2022-11-25T03:11:01.000+0100},
  archiveprefix = {arXiv},
  author        = {Kobs, Konstantin and Steininger, Michael and Hotho, Andreas},
  biburl        = {https://www.bibsonomy.org/bibtex/2e992d907a0cfdca8abf42e1322ee65e8/dmir},
  description   = {InDiReCT: Language-Guided Zero-Shot Deep Metric Learning for Images},
  eprint        = {2211.12760},
  interhash     = {6727d2689db8d4a69af21a39dbc03898},
  intrahash     = {e992d907a0cfdca8abf42e1322ee65e8},
  keywords      = {2022 CLIP deep from:hotho image language learning metric myown selected shot zero},
  note          = {Accepted to WACV 2023},
  timestamp     = {2024-01-18T10:31:52.000+0100},
  title         = {{InDiReCT}: Language-Guided Zero-Shot Deep Metric Learning for Images},
  url           = {http://arxiv.org/abs/2211.12760},
  year          = {2022},
}