We present CECA, a configurable framework for embodied conversational AI agents in Unity-based extended reality (XR) applications. CECA employs a client–server architecture to decouple agent logic from game engine–based embodiment. Built on LiveKit Agents, our approach integrates speech-to-text (STT), large language models (LLMs), and text-to-speech (TTS) into a unified, streaming voice-to-voice pipeline configured via metadata rather than code changes. We outline how this architecture flexibly integrates local and cloud AI providers while mitigating limited provider SDK support in Unity. Finally, we highlight opportunities for future work, including multi-agent scenarios, higher-level templates for XR research, and systematic user studies.
%0 Conference Paper
%1 kern2026configurable
%A Kern, Florian
%A Polifke, Lukas
%A Friedrich, Paula
%A Latoschik, Marc Erich
%A Wienrich, Carolin
%A Obremski, David
%B 2026 IEEE Conference on Virtual Reality and 3D User Interfaces Abstracts and Workshops (VRW)
%D 2026
%K c.wienrich latoschik myown obremski piis xrhub
%T CECA - A Configurable Framework for Embodied Conversational AI Agents in Extended Reality
%X We present CECA, a configurable framework for embodied conversational AI agents in Unity-based extended reality (XR) applications. CECA employs a client–server architecture to decouple agent logic from game engine–based embodiment. Built on LiveKit Agents, our approach integrates speech-to-text (STT), large language models (LLMs), and text-to-speech (TTS) into a unified, streaming voice-to-voice pipeline configured via metadata rather than code changes. We outline how this architecture flexibly integrates local and cloud AI providers while mitigating limited provider SDK support in Unity. Finally, we highlight opportunities for future work, including multi-agent scenarios, higher-level templates for XR research, and systematic user studies.
@comment{kern2026configurable: acronyms in title and booktitle are brace-protected so that sentence-casing bibliography styles keep their capitalisation; en dashes in the abstract are written as the ASCII ligature "--" so the entry stays 7-bit clean for classic BibTeX. The publication state is recorded in the note field.}
@inproceedings{kern2026configurable,
  author    = {Kern, Florian and Polifke, Lukas and Friedrich, Paula and Latoschik, Marc Erich and Wienrich, Carolin and Obremski, David},
  title     = {{CECA} - A Configurable Framework for Embodied Conversational {AI} Agents in Extended Reality},
  booktitle = {2026 {IEEE} Conference on Virtual Reality and {3D} User Interfaces Abstracts and Workshops ({VRW})},
  year      = {2026},
  note      = {To be published},
  abstract  = {We present CECA, a configurable framework for embodied conversational AI agents in Unity-based extended reality (XR) applications. CECA employs a client--server architecture to decouple agent logic from game engine--based embodiment. Built on LiveKit Agents, our approach integrates speech-to-text (STT), large language models (LLMs), and text-to-speech (TTS) into a unified, streaming voice-to-voice pipeline configured via metadata rather than code changes. We outline how this architecture flexibly integrates local and cloud AI providers while mitigating limited provider SDK support in Unity. Finally, we highlight opportunities for future work, including multi-agent scenarios, higher-level templates for XR research, and systematic user studies.},
  keywords  = {c.wienrich latoschik myown obremski piis xrhub},
  added-at  = {2026-02-09T10:56:24.000+0100},
  timestamp = {2026-03-09T13:56:43.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2002eac7a70138491bd2e9d575b7b5556/hci-uwb},
  interhash = {562b4dde335e21c23b330f959d0b3371},
  intrahash = {002eac7a70138491bd2e9d575b7b5556},
}