Abstract: Cover song identification (CSI) focuses on finding the same music
with different versions in reference anchors given a query track. In this
paper, we propose a novel system named CoverHunter that overcomes the
shortcomings of existing detection schemes by exploring richer features with
refined attention and alignments. CoverHunter contains three key modules: 1) A
convolution-augmented transformer (i.e., Conformer) structure that captures
both local and global feature interactions in contrast to previous methods
mainly relying on convolutional neural networks; 2) An attention-based time
pooling module that further exploits the attention in the time dimension; 3) A
novel coarse-to-fine training scheme that first trains a network to roughly
align the song chunks and then refines the network by training on the aligned
chunks. At the same time, we also summarize some important training tricks used
in our system that help achieve better results. Experiments on several standard
CSI datasets show that our method significantly improves over state-of-the-art
methods with an embedding size of 128 (2.3% on SHS100K-TEST and 17.7% on
DaTacos).
Description
[2306.09025] CoverHunter: Cover Song Identification with Refined Attention and Alignments
%0 Generic
%1 liu2023coverhunter
%A Liu, Feng
%A Tuo, Deyi
%A Xu, Yinan
%A Han, Xintong
%D 2023
%K cover cqt csi identification information mir music plk retrieval song
%T CoverHunter: Cover Song Identification with Refined Attention and
Alignments
%U http://arxiv.org/abs/2306.09025
%X Cover song identification (CSI) focuses on finding the same music
with different versions in reference anchors given a query track. In this
paper, we propose a novel system named CoverHunter that overcomes the
shortcomings of existing detection schemes by exploring richer features with
refined attention and alignments. CoverHunter contains three key modules: 1) A
convolution-augmented transformer (i.e., Conformer) structure that captures
both local and global feature interactions in contrast to previous methods
mainly relying on convolutional neural networks; 2) An attention-based time
pooling module that further exploits the attention in the time dimension; 3) A
novel coarse-to-fine training scheme that first trains a network to roughly
align the song chunks and then refines the network by training on the aligned
chunks. At the same time, we also summarize some important training tricks used
in our system that help achieve better results. Experiments on several standard
CSI datasets show that our method significantly improves over state-of-the-art
methods with an embedding size of 128 (2.3% on SHS100K-TEST and 17.7% on
DaTacos).
% arXiv preprint. The arXiv identifier is stored in eprint/archiveprefix; note keeps only the free-form comment.
@misc{liu2023coverhunter,
  author        = {Liu, Feng and Tuo, Deyi and Xu, Yinan and Han, Xintong},
  title         = {{CoverHunter}: Cover Song Identification with Refined Attention and Alignments},
  year          = {2023},
  eprint        = {2306.09025},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/2306.09025},
  note          = {6 pages, 3 figures},
  abstract      = {Cover song identification (CSI) focuses on finding the same music
with different versions in reference anchors given a query track. In this
paper, we propose a novel system named CoverHunter that overcomes the
shortcomings of existing detection schemes by exploring richer features with
refined attention and alignments. CoverHunter contains three key modules: 1) A
convolution-augmented transformer (i.e., Conformer) structure that captures
both local and global feature interactions in contrast to previous methods
mainly relying on convolutional neural networks; 2) An attention-based time
pooling module that further exploits the attention in the time dimension; 3) A
novel coarse-to-fine training scheme that first trains a network to roughly
align the song chunks and then refines the network by training on the aligned
chunks. At the same time, we also summarize some important training tricks used
in our system that help achieve better results. Experiments on several standard
CSI datasets show that our method significantly improves over state-of-the-art
methods with an embedding size of 128 (2.3% on SHS100K-TEST and 17.7% on
DaTacos).},
  keywords      = {cover cqt csi identification information mir music plk retrieval song},
  description   = {[2306.09025] CoverHunter: Cover Song Identification with Refined Attention and Alignments},
  added-at      = {2023-11-28T13:54:47.000+0100},
  timestamp     = {2023-11-28T13:56:09.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2bba842f6740cd2912290bb92d73a8edf/simonha94},
  interhash     = {22603909664f201ab9ab088b1aa35e78},
  intrahash     = {bba842f6740cd2912290bb92d73a8edf},
}