% Conference paper (IEEE CBMS, June 2025). The venue is stored in booktitle so
% that standard styles print it; the citation key is unchanged.
@inproceedings{18205,
  author    = {Borgli, Hanna and Riegler, Michael and Stensland, H{\r a}kon and Halvorsen, P{\r a}l},
  title     = {Automatic Prompt Generation for Zero-Shot Single Object Frame Segmentation in Videos Using Classification Models: A Polyp Case Study},
  booktitle = {{IEEE} International Symposium on Computer-Based Medical Systems ({CBMS})},
  publisher = {IEEE},
  year      = {2025},
  month     = jun,
  abstract  = {Video object segmentation is vital for applications like medical diagnostics, but acquiring dense pixel-level annotations, especially for specialized domains like polyp segmentation, remains a major bottleneck. Foundational models offer zero-shot segmentation but typically require manual prompting, which is impractical for long videos. We propose Map2VidSeg, a novel pipeline that automatically generates prompts from image-level classification labels. It leverages localization cues (attention maps/CAMs) from a trained image classifier (ViT/CNN) to create bounding box prompts. These guide an efficient model (YOLOE) with tracking (BOT-SORT) and bidirectional propagation for initial segmentation. Optionally, a high-fidelity model (SAM-2) refines these masks using temporal memory and fusion. Demonstrated on the challenging SUN-SEG benchmark, fine-tuned DINOv2 (ViT) prompts significantly outperform DenseNet-121 (CNN). Our best configuration (DINOv2+YOLOE+SAM-2 Bidirectional) achieves Dice/mIoU 0.76/0.70 (Easy Unseen) and 0.66/0.60 (Hard Unseen), showcasing the viability of robust video segmentation without segmentation training data.},
}