@article{18179,
  author    = {Pegah Salehi and Sajad Sheshkal and Vajira Thambawita and Sushant Gautam and Saeed Sabet and Dag Johansen and Michael Riegler and P{\r a}l Halvorsen},
  title     = {Comparative Analysis of Audio Feature Extraction for Real-Time Talking Portrait Synthesis},
  abstract  = {This paper explores advancements in real-time talking-head generation, focusing on overcoming challenges in Audio Feature Extraction (AFE), which often introduces latency and limits responsiveness in real-time applications. To address these issues, we propose and implement a fully integrated system that replaces conventional AFE models with OpenAI{\textquoteright}s Whisper, leveraging its encoder to optimize processing and improve overall system efficiency. Our evaluation of two open-source real-time models across three different datasets shows that Whisper not only accelerates processing but also improves specific aspects of rendering quality, resulting in more realistic and responsive talking-head interactions. Although interviewer training systems are considered a potential application, the primary contribution of this work is the improvement of the technical foundations necessary for creating responsive AI avatars. These advancements enable more immersive interactions and expand the scope of AI-driven applications, including educational tools and simulated training environments.},
  year      = {2025},
  journal   = {Big Data and Cognitive Computing},
  volume    = {9},
  month     = {03/2025},
  publisher = {MDPI},
  url       = {https://www.mdpi.com/2504-2289/9/3/59},
  doi       = {10.3390/bdcc9030059},
}