2026
Campos, Ron; Vayani, Ashmal; Kulkarni, Parth Parag; Gupta, Rohit; Zafar, Aizan; Dutta, Aritra; Shah, Mubarak
GAEA: A Geolocation Aware Conversational Model Conference
The IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), 2026.
Abstract | Tags: WACV | Links:
@conference{nokey,
  title     = {GAEA: A Geolocation Aware Conversational Model},
  author    = {Ron Campos and Ashmal Vayani and Parth Parag Kulkarni and Rohit Gupta and Aizan Zafar and Aritra Dutta and Mubarak Shah},
  url       = {https://ucf-crcv.github.io/GAEA/, https://arxiv.org/abs/2503.16423, https://huggingface.co/collections/ucf-crcv/gaea-67d514a61d48eb1708b13a08, https://github.com/UCF-CRCV/GAEA},
  year      = {2026},
  date      = {2026-03-06},
  booktitle = {The IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
  abstract  = {Image geolocalization, in which an AI model traditionally predicts the precise GPS coordinates of an image, is a challenging task with many downstream applications. However, the user cannot utilize the model to further their knowledge beyond the GPS coordinates; the model lacks an understanding of the location and the conversational ability to communicate with the user. In recent days, with the tremendous progress of large multimodal models (LMMs)---proprietary and open-source---researchers have attempted to geolocalize images via LMMs. However, the issues remain unaddressed; beyond general tasks, for more specialized downstream tasks, such as geolocalization, LMMs struggle. In this work, we propose solving this problem by introducing a conversational model, GAEA, that provides information regarding the location of an image as the user requires. No large-scale dataset enabling the training of such a model exists. Thus, we propose GAEA-1.4M, a comprehensive dataset comprising over 800k images and approximately 1.4M question-answer pairs, constructed by leveraging OpenStreetMap (OSM) attributes and geographical context clues. For quantitative evaluation, we propose a diverse benchmark, GAEA-Bench, comprising 3.5k image-text pairs to evaluate conversational capabilities equipped with diverse question types. We consider 11 state-of-the-art open-source and proprietary LMMs and demonstrate that GAEA significantly outperforms the best open-source model, LLaVA-OneVision, by 18.2% and the best proprietary model, GPT-4o, by 7.2%.},
  keywords  = {WACV},
  pubstate  = {published},
  tppubtype = {conference}
}
Shen, Xu; Wang, Song; Tan, Zhen; Yao, Laura; Zhao, Xinyu; Xu, Kaidi; Wang, Xin; Chen, Tianlong
FaithCoT-Bench: Benchmarking Instance-Level Faithfulness of Chain-of-Thought Reasoning Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyi,
  title     = {FaithCoT-Bench: Benchmarking Instance-Level Faithfulness of Chain-of-Thought Reasoning},
  author    = {Xu Shen and Song Wang and Zhen Tan and Laura Yao and Xinyu Zhao and Kaidi Xu and Xin Wang and Tianlong Chen},
  url       = {https://arxiv.org/abs/2510.04040},
  year      = {2026},
  date      = {2026-01-29},
  urldate   = {2026-01-29},
  booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Large language models (LLMs) increasingly rely on Chain-of-Thought (CoT) prompting to improve problem-solving and provide seemingly transparent explanations. However, growing evidence shows that CoT often fail to faithfully represent the underlying reasoning process, raising concerns about their reliability in high-risk applications. Although prior studies have focused on mechanism-level analyses showing that CoTs can be unfaithful, they leave open the practical challenge of deciding whether a specific trajectory is faithful to the internal reasoning of the model. To address this gap, we introduce FaithCoT-Bench, a unified benchmark for instance-level CoT unfaithfulness detection. Our framework establishes a rigorous task formulation that formulates unfaithfulness detection as a discriminative decision problem, and provides FINE-CoT (Faithfulness instance evaluation for Chain-of-Thought), an expert-annotated collection of over 1,000 trajectories generated by four representative LLMs across four domains, including more than 300 unfaithful instances with fine-grained causes and step-level evidence. We further conduct a systematic evaluation of eleven representative detection methods spanning counterfactual, logit-based, and LLM-as-judge paradigms, deriving empirical insights that clarify the strengths and weaknesses of existing approaches and reveal the increased challenges of detection in knowledge-intensive domains and with more advanced models. To the best of our knowledge, FaithCoT-Bench establishes the first comprehensive benchmark for instance-level CoT faithfulness,},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Feng, Sicheng; TUO, Kaiwen; Wang, Song; Kong, Lingdong; Zhu, Jianke; Wang, Huan
RewardMap: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyj,
  title         = {RewardMap: Tackling Sparse Rewards in Fine-grained Visual Reasoning via Multi-Stage Reinforcement Learning},
  author        = {Sicheng Feng and Kaiwen TUO and Song Wang and Lingdong Kong and Jianke Zhu and Huan Wang},
  url           = {https://arxiv.org/abs/2510.02240, https://fscdc.github.io/RewardMap/},
  year          = {2026},
  date          = {2026-01-26},
  urldate       = {2026-01-29},
  booktitle     = {Fourteenth International Conference on Learning Representations (ICLR)},
  abstract      = {Distilling robust reasoning capabilities from large language models (LLMs) into smaller, computationally efficient student models remains an unresolved challenge. Despite recent advances, distilled models frequently suffer from superficial pattern memorization and subpar generalization. To overcome these limitations, we introduce a novel distillation framework that moves beyond simple mimicry to instill a deeper conceptual understanding. Our framework features two key innovations. underline{textit{First}}, to address pattern memorization, Explanatory Inversion (EI) generates targeted ``explanatory probes'' that compel the student to articulate the underlying logic behind an answer, rather than just memorizing it. underline{textit{Second}}, to improve generalization, Explanatory GRPO (texttt{EXGRPO}) uses a reinforcement learning algorithm with a novel Dialogue Structure Utility Bonus, which explicitly rewards the student for maintaining a coherent reasoning process across these probes. Extensive evaluations on 12 datasets demonstrate significant improvements. Using Gemma-7b as the student model, our method yields an average textbf{20.39%} increase over zero-shot performance and a textbf{6.02%} improvement over the state-of-the-art distillation baselines. Moreover, models distilled with our method show remarkable training efficiency (e.g., surpassing vanilla fine-tuning with textbf{10-25%} training data) and strong generalization to out-of-distribution tasks.},
  keywords      = {ICLR},
  pubstate      = {published},
  tppubtype     = {conference},
  internal-note = {NOTE(review): this abstract is byte-identical to entry nokeyk (Probing to Refine / Explanatory Inversion) and does not match this paper's title; replace with the actual abstract of arXiv:2510.02240 -- TODO confirm against the arXiv page.}
}
Tan, Zhen; Zhao, Chengshuai; Wang, Song; Li, Jundong; Chen, Tianlong; Liu, Huan
Probing to Refine: Reinforcement Distillation of LLM Reasoners via Explanatory Inversion Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyk,
  title     = {Probing to Refine: Reinforcement Distillation of LLM Reasoners via Explanatory Inversion},
  author    = {Zhen Tan and Chengshuai Zhao and Song Wang and Jundong Li and Tianlong Chen and Huan Liu},
  url       = {https://openreview.net/forum?id=rkIw2GqYEt},
  year      = {2026},
  date      = {2026-01-26},
  urldate   = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Distilling robust reasoning capabilities from large language models (LLMs) into smaller, computationally efficient student models remains an unresolved challenge. Despite recent advances, distilled models frequently suffer from superficial pattern memorization and subpar generalization. To overcome these limitations, we introduce a novel distillation framework that moves beyond simple mimicry to instill a deeper conceptual understanding. Our framework features two key innovations. underline{textit{First}}, to address pattern memorization, Explanatory Inversion (EI) generates targeted ``explanatory probes'' that compel the student to articulate the underlying logic behind an answer, rather than just memorizing it. underline{textit{Second}}, to improve generalization, Explanatory GRPO (texttt{EXGRPO}) uses a reinforcement learning algorithm with a novel Dialogue Structure Utility Bonus, which explicitly rewards the student for maintaining a coherent reasoning process across these probes. Extensive evaluations on 12 datasets demonstrate significant improvements. Using Gemma-7b as the student model, our method yields an average textbf{20.39%} increase over zero-shot performance and a textbf{6.02%} improvement over the state-of-the-art distillation baselines. Moreover, models distilled with our method show remarkable training efficiency (e.g., surpassing vanilla fine-tuning with textbf{10-25%} training data) and strong generalization to out-of-distribution tasks.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Yu, Hanxun; Li, Wentong; Qu, Xuan; Wang, Song; Chen, Junbo; Zhu, Jianke
VisionTrim: Unified Vision Token Compression for Training-Free MLLM Acceleration Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyl,
  title     = {VisionTrim: Unified Vision Token Compression for Training-Free MLLM Acceleration},
  author    = {Hanxun Yu and Wentong Li and Xuan Qu and Song Wang and Junbo Chen and Jianke Zhu},
  url       = {https://arxiv.org/abs/2601.22674},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
  abstract  = {Multimodal large language models (MLLMs) suffer from high computational costs due to excessive visual tokens, particularly in high-resolution and video-based scenarios. Existing token reduction methods typically focus on isolated pipeline components and often neglect textual alignment, leading to performance degradation. In this paper, we propose VisionTrim, a unified framework for training-free MLLM acceleration, integrating two effective plug-and-play modules: 1) the Dominant Vision Token Selection (DVTS) module, which preserves essential visual tokens via global-local view, and 2) the Text-Guided Vision Complement (TGVC) module, which facilitates context-aware token merging guided by textual cues. Extensive experiments across diverse image and video multimodal benchmarks demonstrate the performance superiority of our VisionTrim, advancing practical MLLM deployment in real-world applications. Our full implementation will be publicly available.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Zou, Yuanhao; Jin, Shengji; Deng, Andong; Zhao, Youpeng; Wang, Jun; Chen, Chen
A.I.R.: Enabling Adaptive, Iterative, and Reasoning-based Frame Selection For Video Question Answering Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeym,
  title     = {A.I.R.: Enabling Adaptive, Iterative, and Reasoning-based Frame Selection For Video Question Answering},
  author    = {Yuanhao Zou and Shengji Jin and Andong Deng and Youpeng Zhao and Jun Wang and Chen Chen},
  url       = {https://ucf-air.github.io/},
  year      = {2026},
  date      = {2026-01-26},
  urldate   = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Effectively applying Vision-Language Models (VLMs) to Video Question Answering (VideoQA) hinges on selecting a concise yet comprehensive set of frames, as processing entire videos is computationally infeasible. However, current frame selection methods face a critical trade-off: approaches relying on lightweight similarity models, such as CLIP, often fail to capture the nuances of complex queries, resulting in inaccurate similarity scores that cannot reflect the authentic query-frame relevance, which further undermines frame selection. Meanwhile, methods that leverage a VLM for deeper analysis achieve higher accuracy but incur prohibitive computational costs. To address these limitations, we propose A.I.R., a training-free approach for Adaptive, Iterative, and Reasoning-based frame selection. We leverage a powerful VLM to perform deep, semantic analysis on complex queries, and this analysis is deployed within a cost-effective iterative loop that processes only a small batch of the most high-potential frames at a time. Extensive experiments on various VideoQA benchmarks demonstrate that our approach outperforms existing frame selection methods, significantly boosts the performance of the foundation VLM, and achieves substantial gains in computational efficiency over other VLM-based techniques.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Kowsher, Md; Prottasha, Nusrat; Xu, Shiyun; Mohanto, Shetu; Garibay, Ozlem; Yousefi, Niloofar; Chen, Chen
FlowNIB: An Information Bottleneck Analysis of Bidirectional vs. Unidirectional Language Models Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyn,
  title     = {FlowNIB: An Information Bottleneck Analysis of Bidirectional vs. Unidirectional Language Models},
  author    = {Md Kowsher and Nusrat Prottasha and Shiyun Xu and Shetu Mohanto and Ozlem Garibay and Niloofar Yousefi and Chen Chen},
  url       = {https://github.com/Kowsher/BidiVsUniLM},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Bidirectional language models (LMs) consistently show stronger context understanding than unidirectional models, yet the theoretical reason remains unclear. We present a simple information bottleneck (IB) perspective: bidirectional representations preserve more mutual information (MI) about both the input and the target, yielding richer features for downstream tasks. We adopt a layer–wise view and hypothesize that, at comparable capacity, bidirectional layers retain more useful signal than unidirectional ones. To test this claim empirically, we present Flow Neural Information Bottleneck (FlowNIB), a lightweight, post-hoc framework capable of estimating comparable mutual information values for individual layers in LMs, quantifying how much mutual information each layer carries for a dataset. FlowNIB takes three inputs—(i) the original LM’s inputs/dataset, (ii) ground–truth labels, and (iii) layer activations—simultaneously estimates the mutual information for both the input–layer and layer–label pairs. Empirically, bidirectional LM layers exhibit higher mutual information than similar—and even larger—unidirectional LMs. As a result, bidirectional LMs outperform unidirectional LMs across extensive experiments on NLU benchmarks (e.g., GLUE), commonsense reasoning, and regression tasks, demonstrating superior context understanding.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Liu, Xinxin; Li, Ming; Lyu, Zonglin; Shang, Yuzhang; Chen, Chen
Learning from Noisy Preferences: A Semi-Supervised Learning Approach to Direct Preference Optimization Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyo,
  title     = {Learning from Noisy Preferences: A Semi-Supervised Learning Approach to Direct Preference Optimization},
  author    = {Xinxin Liu and Ming Li and Zonglin Lyu and Yuzhang Shang and Chen Chen},
  url       = {https://openreview.net/forum?id=rRc04jyoAk},
  year      = {2026},
  date      = {2026-01-26},
  urldate   = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Human visual preferences are inherently multi-dimensional, encompassing aspects of aesthetics, detail fidelity, and semantic alignment. However, existing open-source preference datasets provide only single, holistic annotations, resulting in severe label noise—images that excel in some dimensions (e.g., compositional) but are deficient in others (e.g., details) are simply marked as ``winner'' or ``loser''. We theoretically demonstrate that this compression of multi-dimensional preferences into binary labels generates conflicting gradient signals that misguide the optimization process in Diffusion Direct Preference Optimization (DPO). To address this label noise from conflicting multi-dimensional preferences, we propose Semi-DPO, a semi-supervised learning approach. We treat pairs with consistent preferences across all dimensions as clean labeled data, while those with conflicting signals are considered noisy unlabeled data. Our method first trains a model on a clean, consensus-filtered data subset. This model then acts as its own implicit classifier to generate pseudo-labels for the larger, noisy set, which are used to iteratively refine the model's alignment. This approach effectively mitigates label noise and enhances image generation quality, achieving better alignment with multi-dimensional human preferences. Experimental results demonstrate that Semi-DPO significantly improves alignment with multi-dimensional human preferences, achieving state-of-the-art performance without requiring additional human annotation or the need to train a dedicated reward models.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Li, Ming; Wu, Jie; Cui, Jiaxing; Li, Xiaojie; Wang, Rui; Chen, Chen
ViPO: Visual Preference Optimization at Scale Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyp,
  title     = {ViPO: Visual Preference Optimization at Scale},
  author    = {Ming Li and Jie Wu and Jiaxing Cui and Xiaojie Li and Rui Wang and Chen Chen},
  url       = {https://openreview.net/pdf?id=x5zP3k64Nl},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {While preference optimization is crucial for improving visual generative models, how to effectively scale this paradigm for visual generation remains largely unexplored. Current open-source preference datasets typically contain substantial conflicting preference patterns, where winners excel in some dimensions but underperform in others. Naively optimizing on such noisy datasets fails to learn meaningful preferences, fundamentally hindering effective scaling. To enhance the robustness of preference algorithms against noise, we propose Poly-DPO, which extends the DPO objective with an additional polynomial term that dynamically adjusts model confidence during training based on dataset characteristics, enabling effective learning across diverse data distributions from noisy to trivially simple patterns. Beyond biased patterns, existing datasets suffer from low resolution, limited prompt diversity, and imbalanced distributions. To facilitate large-scale visual preference optimization by tackling key data bottlenecks, we construct ViPO, a massive-scale preference dataset with 1M image pairs (1024px) across five categories and 300K video pairs (720p+) across three categories. Leveraging state-of-the-art generative models and diverse prompts ensures consistent, reliable preference signals with balanced distributions. Remarkably, when applying Poly-DPO to our high-quality dataset, the optimal configuration converges to standard DPO. This convergence validates both our dataset quality and Poly-DPO’s adaptive nature: sophisticated optimization becomes unnecessary with sufficient data quality, yet remains valuable for imperfect datasets. We comprehensively validate our approach across various visual generation models. On noisy datasets like Pick-a-Pic V2, Poly-DPO achieves 6.87 and 2.32 gains over Diffusion-DPO on GenEval for SD1.5 and SDXL, respectively. For our high-quality ViPO dataset, models achieve performance far exceeding those trained on existing open-source preference datasets. These results confirm that addressing both algorithmic adaptability and data quality is essential for scaling visual preference optimization. All models and datasets will be released.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
@comment{NOTE(review): the text below is a stray duplicate of the abstract of entry nokeyp (ViPO), pasted outside any entry by the exporter. Wrapped in @comment so it is explicitly marked as non-entry text; safe to delete once confirmed.
to effectively scale this paradigm for visual generation remains largely unexplored.
Current open-source preference datasets typically contain substantial conflicting
preference patterns, where winners excel in some dimensions but underperform in
others. Naively optimizing on such noisy datasets fails to learn meaningful preferences,
fundamentally hindering effective scaling. To enhance the robustness of preference
algorithms against noise, we propose Poly-DPO, which extends the DPO objective
with an additional polynomial term that dynamically adjusts model confidence during
training based on dataset characteristics, enabling effective learning across diverse
data distributions from noisy to trivially simple patterns. Beyond biased patterns,
existing datasets suffer from low resolution, limited prompt diversity, and imbalanced
distributions. To facilitate large-scale visual preference optimization by tackling key
data bottlenecks, we construct ViPO, a massive-scale preference dataset with 1M
image pairs (1024px) across five categories and 300K video pairs (720p+) across three
categories. Leveraging state-of-the-art generative models and diverse prompts ensures
consistent, reliable preference signals with balanced distributions. Remarkably, when
applying Poly-DPO to our high-quality dataset, the optimal configuration converges
to standard DPO. This convergence validates both our dataset quality and Poly-DPO’s
adaptive nature: sophisticated optimization becomes unnecessary with sufficient data
quality, yet remains valuable for imperfect datasets. We comprehensively validate our
approach across various visual generation models. On noisy datasets like Pick-a-Pic V2,
Poly-DPO achieves 6.87 and 2.32 gains over Diffusion-DPO on GenEval for SD1.5 and
SDXL, respectively. For our high-quality ViPO dataset, models achieve performance
far exceeding those trained on existing open-source preference datasets. These results
confirm that addressing both algorithmic adaptability and data quality is essential for
scaling visual preference optimization. All models and datasets will be released.
}
Liu, Jialin; Ding, Lisang; Yin, Wotao; Osher, Stanley J
Implicit Models: Expressive Power Scales with Test-Time Compute Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyq,
  title     = {Implicit Models: Expressive Power Scales with Test-Time Compute},
  author    = {Jialin Liu and Lisang Ding and Wotao Yin and Stanley J Osher},
  url       = {https://arxiv.org/abs/2510.03638},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Implicit models, an emerging model class, compute outputs by iterating a single parameter block to a fixed point. This architecture realizes an infinite-depth, weight-tied network that trains with constant memory, significantly reducing memory needs for the same level of performance compared to explicit models. While it is empirically known that these compact models can often match or even exceed the accuracy of larger explicit networks by allocating more test-time compute, the underlying reasons are not yet well understood. We study this gap through a non-parametric analysis of expressive power. We provide a strict mathematical characterization, showing that a simple and regular implicit operator can, through iteration, progressively express more complex mappings. We prove that for a broad class of implicit models, this process allows the model's expressive power to grow with test-time compute, ultimately matching a much richer function class. The theory is validated across four domains: imaging, scientific computing, operations research, and LLM reasoning, demonstrating that as test-time iterations increase, the complexity of the learned mapping rises, while the solution quality simultaneously improves and stabilizes.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Lin, Wu; Lowe, Scott C; Dangel, Felix; Eschenhagen, Runa; Xu, Zikun; Grosse, Roger
Understanding and improving Shampoo and SOAP via Kullback-Leibler Minimization Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyr,
  title     = {Understanding and improving Shampoo and SOAP via Kullback-Leibler Minimization},
  author    = {Wu Lin and Scott C Lowe and Felix Dangel and Runa Eschenhagen and Zikun Xu and Roger Grosse},
  url       = {https://arxiv.org/abs/2509.03378},
  year      = {2026},
  date      = {2026-01-26},
  urldate   = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Shampoo and its efficient, Adam-stabilized variant SOAP, employ structured second-moment estimation and have received growing attention for their effectiveness. In practice, Shampoo requires step-size grafting with Adam to achieve competitive performance. SOAP mitigates this by applying Adam in Shampoo's eigenbasis and further reducing per-iteration runtime. However, reliance on Adam introduces additional memory overhead in both methods. Prior theoretical interpretations have primarily examined their estimation schemes using the Frobenius norm. Motivated by the natural correspondence between the second moment and a covariance matrix, we reinterpret the estimation procedures in Shampoo and SOAP as instances of covariance estimation through the lens of Kullback–Leibler (KL) divergence minimization. This perspective reveals a previously overlooked theoretical limitation and motivates principled improvements to their design. Building on the KL perspective, we propose practical estimation schemes---KL-Shampoo and KL-SOAP---that match or exceed the performance of Shampoo and SOAP for pre-training a range of neural network models while maintaining SOAP-level per-iteration runtime. Notably, KL-Shampoo does not rely on Adam to achieve superior performance, thereby avoiding the associated memory overhead. Surprisingly, KL-Shampoo consistently outperforms the other methods in our experiments.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Fang, Pengjun; He, Yingqing; Xing, Yazhou; Chen, Qifeng; Lim, Ser-Nam; Yang, Harry
AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeys,
  title         = {AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer},
  author        = {Pengjun Fang and Yingqing He and Yazhou Xing and Qifeng Chen and Ser-Nam Lim and Harry Yang},
  url           = {https://arxiv.org/abs/2509.03378},
  year          = {2026},
  date          = {2026-01-26},
  booktitle     = {The Fourteenth International Conference on Learning Representations},
  abstract      = {Existing video-to-audio (V2A) generation methods predominantly rely on text prompts alongside visual information to synthesize audio. However, two critical bottlenecks persist: semantic granularity gaps in training data (e.g., conflating acoustically distinct sounds like different dog barks under coarse labels), and textual ambiguity in describing microacoustic features (e.g., "metallic clang" failing to distinguish impact transients and resonance decay). These bottlenecks make it difficult to perform fine-grained sound synthesis using text-controlled modes. To address these limitations, we propose AC-Foley, an audio-conditioned V2A model that directly leverages reference audio to achieve precise and fine-grained control over generated sounds. This approach enables: fine-grained sound synthesis (e.g., footsteps with distinct timbres on wood, marble, or gravel), timbre transfer (e.g., transforming a violin’s melody into the bright, piercing tone of a suona), zero-shot generation of sounds (e.g., creating unique weapon sound effects without training on firearm datasets) and better audio quality. By directly conditioning on audio signals, our approach bypasses the semantic ambiguities of text descriptions while enabling precise manipulation of acoustic attributes. Empirically, AC-Foley achieves state-of-the-art performance for Foley generation when conditioned on reference audio, while remaining competitive with SOTA video-to-audio methods even without audio conditioning.},
  keywords      = {ICLR},
  pubstate      = {published},
  tppubtype     = {conference},
  internal-note = {NOTE(review): url is identical to entry nokeyr (arXiv:2509.03378, the Shampoo/SOAP paper) and almost certainly does not belong to this title; locate and substitute the correct AC-Foley link -- TODO confirm.}
}
Cui, Xuanming; Cheng, Jianpeng; Chen, Hong-You; Shukla, Satya Narayan; Awasthi, Abhijeet; Pan, Xichen; Ahuja, Chaitanya; Mishra, Shlok; Tian, Taipeng; Guo, Qi; Lim, Ser-Nam; Singh, Aashu; Fan, Xiangjun
Think Then Embed: Generative Context Improves Multimodal Embedding Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyt,
  title     = {Think Then Embed: Generative Context Improves Multimodal Embedding},
  author    = {Xuanming Cui and Jianpeng Cheng and Hong-You Chen and Satya Narayan Shukla and Abhijeet Awasthi and Xichen Pan and Chaitanya Ahuja and Shlok Mishra and Taipeng Tian and Qi Guo and Ser-Nam Lim and Aashu Singh and Xiangjun Fan},
  url       = {https://arxiv.org/pdf/2510.05014},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {There is a growing interest in Universal Multimodal Embeddings (UME), where models are required to generate task-specific representations. While recent studies show that Multimodal Large Language Models (MLLMs) perform well on such tasks, they treat MLLMs solely as encoders, overlooking their generative capacity. However, such an encoding paradigm becomes less effective as instructions become more complex and require compositional reasoning. Inspired by the proven effectiveness of chain-of-thought reasoning, we propose a general Think-Then-Embed (TTE) framework for UME, composed of a reasoner and an embedder. The reasoner MLLM first generates reasoning traces that explain complex queries, followed by an embedder that produces representations conditioned on both the original query and the intermediate reasoning. This explicit reasoning step enables more nuanced understanding of complex multimodal instructions. Our contributions are threefold. First, by leveraging a powerful MLLM reasoner, we achieve state-of-the-art performance on the MMEB-V2 benchmark, surpassing proprietary models trained on massive in-house datasets. Second, to reduce the dependency on large MLLM reasoners, we finetune a smaller MLLM reasoner using high-quality embedding-centric reasoning traces, achieving the best performance among open-source models with a 7% absolute gain over recently proposed models. Third, we investigate strategies for integrating the reasoner and embedder into a unified model for improved efficiency without sacrificing performance.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Bao, Wenrui; Chen, Zhiben; Xu, Dan; Shang, Yuzhang
Learning to Parallel: Accelerating Diffusion Large Language Models via Adaptive Parallel Decoding Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyu,
  title     = {Learning to Parallel: Accelerating Diffusion Large Language Models via Adaptive Parallel Decoding},
  author    = {Wenrui Bao and Zhiben Chen and Dan Xu and Yuzhang Shang},
  url       = {https://arxiv.org/abs/2509.25188},
  year      = {2026},
  date      = {2026-01-26},
  booktitle = {The Fourteenth International Conference on Learning Representations},
  abstract  = {Autoregressive decoding in large language models (LLMs) requires sequential steps for O(n) tokens, fundamentally limiting inference throughput. Recent diffusion-based LLMs (dLLMs) enable parallel token generation through iterative denoising. However, current parallel decoding strategies rely on fixed, input-agnostic heuristics (e.g., confidence thresholds), which fail to adapt to input-specific characteristics, resulting in suboptimal speed-quality trade-offs across diverse NLP tasks. In this work, we explore a more flexible and dynamic approach to parallel decoding. We propose **Learning to Parallel Decode (Learn2PD)**, a framework that trains a lightweight and adaptive filter model to predict, for each token position, whether the current prediction matches the final output. This learned filter approximates an oracle parallel decoding strategy that unmasks tokens only when correctly predicted. Importantly, the filter model is learned in a post-training manner, requiring only a small amount of computation to optimize it (minute-level GPU time). Additionally, we introduce **End-of-Text Prediction (EoTP)** to detect decoding completion at the end of sequence, avoiding redundant decoding of padding tokens. Experiments on the LLaDA benchmark demonstrate that our method achieves up to **22.58×** speedup without any performance drop, and up to **57.51×** when combined with KV-Cache.},
  keywords  = {ICLR},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Haoxuan; Zhang, Gengyu; Yan, Yan; Shang, Yuzhang; Kompella, Ramana; Liu, Gaowen
Real-Time Robot Execution with Masked Action Chunking Conference
The Fourteenth International Conference on Learning Representations, 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyv,
title = {Real-Time Robot Execution with Masked Action Chunking},
author = {Haoxuan Wang and Gengyu Zhang and Yan Yan and Yuzhang Shang and Ramana Kompella and Gaowen Liu},
url = {https://arxiv.org/abs/2601.20130},
year = {2026},
date = {2026-01-26},
booktitle = {The Fourteenth International Conference on Learning Representations},
abstract = {Real-time execution is essential for cyber-physical systems such as robots. These systems operate in dynamic real-world environments where even small delays can undermine responsiveness and compromise performance. Asynchronous inference has recently emerged as a system-level paradigm for real-time robot manipulation, enabling the next action chunk to be predicted while the current one is being executed. While this approach achieves real-time responsiveness, naive integration often results in execution failure. Previous methods attributed this failure to inter-chunk discontinuity and developed test-time algorithms to smooth chunk boundaries. In contrast, we identify another critical yet overlooked factor: intra-chunk inconsistency, where the robot’s executed action chunk partially misaligns with its current perception. To address this, we propose REMAC, which learns corrective adjustments on the pretrained policy through masked action chunking, enabling the policy to remain resilient under mismatches between intended actions and actual execution during asynchronous inference. In addition, we introduce a prefix-preserved sampling procedure to reinforce inter-chunk continuity. Overall, our method delivers more reliable policies without incurring additional latency. Extensive experiments in both simulation and real-world settings demonstrate that our method enables faster task execution, maintains robustness across varying delays, and consistently achieves higher completion rates.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Essoullami, Mostapha; Bergou, El Houcine; Dutta, Aritra
LEGACY: A Lightweight Dynamic Gradient Compression Strategy for Distributed Deep Learning Conference
The Fourteenth International Conference on Learning Representations, 2026.
@conference{nokeyw,
title = {LEGACY: A Lightweight Dynamic Gradient Compression Strategy for Distributed Deep Learning},
author = {Mostapha Essoullami and El Houcine Bergou and Aritra Dutta},
year = {2026},
date = {2026-01-26},
booktitle = {The Fourteenth International Conference on Learning Representations},
abstract = {Distributed learning has achieved remarkable success in training deep neural networks (DNNs) on large datasets, but the communication bottleneck limits its scalability. Various compression techniques have been proposed to alleviate this limitation; however, they either use fixed parameters throughout training or rely on complex and computationally intensive methods to adapt compression parameters. Instead of the hard-to-tune hyperparameters required by adaptive compressors, this paper investigates the impact of two fundamental factors in DNN training—the layer size of the networks and their training phases—to design a simple yet efficient dynamic scheduler for any compressor, guiding the selection of compression parameters. We present a Lightweight Efficient GrAdient Compression strategyY or LEGACY, which, in theory, can work with any compression technique to produce a simple dynamic counterpart. We benchmark LEGACY on distributed and federated training, involving seven different DNN architectures, ranging from ResNet, Transformer-XL, to GPT-2, across large and challenging datasets, including ImageNet, WikiText-103, and OpenWebText. On ImageNet-1K, with an equivalent average data volume, LEGACY's dynamic compression strategies improve the Top-1 accuracy of ResNet-50 by 7-11% compared to uniform Top-0.1% compression, while on WikiText-103, the layer-based dynamic strategy reduces the perplexity of Transformer-XL by ~26% relative to the same baseline. In addition, we evaluate LEGACY under constrained and federated settings, and demonstrate that it...},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Li, Yan; Wang, Zhenyi; Li, Guanghao; Xue, Wei; Guo, Yike; Luo, Wenhan
Pixel-Perfect Puppetry: Precision-Guided Enhancement for Face Image and Video Editing Conference
The Fourteenth International Conference on Learning Representations, 2026.
@conference{nokeyx,
title = {Pixel-Perfect Puppetry: Precision-Guided Enhancement for Face Image and Video Editing},
author = {Yan Li and Zhenyi Wang and Guanghao Li and Wei Xue and Yike Guo and Wenhan Luo},
year = {2026},
date = {2026-01-26},
booktitle = {The Fourteenth International Conference on Learning Representations},
abstract = {Preserving identity while precisely manipulating attributes is a central challenge in face editing for both images and videos. Existing methods often introduce visual artifacts or fail to maintain temporal consistency. We present FlowGuide, a unified framework that achieves fine-grained control over face editing in diffusion models. Our approach is founded on the local linearity of the UNet bottleneck’s latent space, which allows us to treat semantic attributes as corresponding to specific linear subspaces, providing a mathematically sound basis for disentanglement. FlowGuide first identifies a set of orthogonal basis vectors that span these semantic subspaces for both the original content and the target edit, a representation that efficiently captures the most salient features of each. We then introduce a novel guidance mechanism that quantifies the geometric alignment between these bases to dynamically steer the denoising trajectory at each step. This approach offers superior control by ensuring edits are confined to the desired attribute’s semantic axis while preserving orthogonal components related to identity. Extensive experiments demonstrate that FlowGuide achieves state-of-the-art performance, producing high-quality edits with superior identity preservation and temporal coherence.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Singh, Utsav; Chakraborty, Souradip; Suttle, Wesley; Sadler, Brian; Asher, Derrik; Sahu, Anit Kumar; Shah, Mubarak; Namboodiri, Vinay Purushothaman; Bedi, Amrit
Direct Preference Optimization for Primitive-Enabled Hierarchical RL: A Bilevel Approach Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
Abstract | Tags: ICLR | Links:
@conference{nokeyy,
title = {Direct Preference Optimization for Primitive-Enabled Hierarchical RL: A Bilevel Approach},
author = {Utsav Singh and Souradip Chakraborty and Wesley Suttle and Brian Sadler and Derrik Asher and Anit Kumar Sahu and Mubarak Shah and Vinay Purushothaman Namboodiri and Amrit Bedi},
url = {https://arxiv.org/html/2411.00361v3},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {Hierarchical reinforcement learning (HRL) enables agents to solve complex, long-horizon tasks by decomposing them into manageable sub-tasks. However, HRL methods face two fundamental challenges: (i) non-stationarity caused by the evolving lower-level policy during training, which destabilizes higher-level learning, and (ii) the generation of infeasible subgoals that lower-level policies cannot achieve. To address these challenges, we introduce DIPPER, a novel HRL framework that formulates goal-conditioned HRL as a bi-level optimization problem and leverages direct preference optimization (DPO) to train the higher-level policy. By learning from preference comparisons over subgoal sequences rather than rewards that depend on the evolving lower-level policy, DIPPER mitigates the impact of non-stationarity on higher-level learning. To address infeasible subgoals, DIPPER incorporates lower-level value function regularization that encourages the higher-level policy to propose achievable subgoals. We introduce two novel metrics to quantitatively verify that DIPPER mitigates non-stationarity and infeasible subgoal generation issues in HRL. Empirical evaluation on challenging robotic navigation and manipulation benchmarks shows that DIPPER achieves upto 40% improvements over state-of-the-art baselines on challenging sparse-reward scenarios, highlighting the potential of preference-based learning for addressing longstanding HRL limitations.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Lee, Jihoon; Moon, Hoyeon; Zhai, Kevin; Chithanar, Arun; Sahu, Anit Kumar; Kar, Soummya; Lee, Chul; Chakraborty, Souradip; Bedi, Amrit
TEST-TIME SCALING IN DIFFUSION LLMS VIA HIDDEN SEMI-AUTOREGRESSIVE EXPERTS Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
@conference{nokeyz,
title = {Test-Time Scaling in Diffusion {LLMs} via Hidden Semi-Autoregressive Experts},
author = {Jihoon Lee and Hoyeon Moon and Kevin Zhai and Arun Chithanar and Anit Kumar Sahu and Soummya Kar and Chul Lee and Souradip Chakraborty and Amrit Bedi},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {Diffusion-based large language models (dLLMs) are trained to model extreme flexibility/dependence in the data-distribution; however, how to best utilize this at inference time remains an open problem. In this work, we uncover an interesting property of these models: dLLMs {trained on textual data} implicitly learn a mixture of semi-autoregressive experts, where different generation orders reveal different specialized behaviors. We show that committing to any single, fixed inference time schedule, a common practice, collapses performance by failing to leverage this latent ensemble. To address this, we introduce HEX (Hidden semi-autoregressive EXperts for test-time scaling), a training-free inference method that ensembles across heterogeneous block schedules. By doing a majority vote over diverse block-sized generation paths, HEX robustly avoids failure modes associated with any single fixed schedule. On reasoning benchmarks such as GSM8K, it boosts accuracy by up to 3.56× (from 24.72% to 88.10%), outperforming top-K margin inference and specialized fine-tuned methods like GRPO, without additional training. HEX even yields significant gains on MATH benchmark from 16.40% to 40.00%, scientific reasoning on ARC-C from 54.18% to 87.80%, and TruthfulQA from 28.36% to 57.46%. Our results establish test-time scaling as a powerful principle for dLLMs, showing that the sequence in which masking is done can play a significant role in test-time scaling/inferencing of dLLMs.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Yu; Yang, Bin; KHAN, ARIJIT; Akcora, Cuneyt
ATEX-CF: Attack-Informed Counterfactual Explanations for Graph Neural Networks Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
@conference{nokey_27,
title = {ATEX-CF: Attack-Informed Counterfactual Explanations for Graph Neural Networks},
author = {Yu Zhang and Bin Yang and Arijit Khan and Cuneyt Akcora},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {Counterfactual explanations offer an intuitive way to interpret graph neural networks (GNNs) by identifying minimal changes that alter a model’s prediction, thereby answering “what must differ for a different outcome?”. In this work, we propose a novel framework, ATEX-CF that unifies adversarial attack techniques with counterfactual explanation generation—a connection made feasible by their shared goal of flipping a node’s prediction, yet differing in perturbation strategy: adversarial attacks often rely on edge additions, while counterfactual methods typically use deletions. Unlike traditional approaches that treat explanation and attack separately, our method efficiently integrates both edge additions and deletions, grounded in theory, leveraging adversarial insights to explore impactful counterfactuals. In addition, by jointly optimizing fidelity, sparsity, and plausibility under a constrained perturbation budget, our method produces instance-level explanations that are both informative and realistic. Experiments on synthetic and real-world node classification benchmarks demonstrate that ATEX-CF generates faithful, concise, and plausible explanations, highlighting the effectiveness of integrating adversarial insights into counterfactual reasoning for GNNs.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Uddin, Md Joshem; Tola, Astrit; Akcora, Cuneyt; Coskunuzer, Baris
TopoFormer: Topology Meets Attention for Graph Learning Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
@conference{nokey_28,
title = {TopoFormer: Topology Meets Attention for Graph Learning},
author = {Md Joshem Uddin and Astrit Tola and Cuneyt Akcora and Baris Coskunuzer},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {We introduce TopoFormer, a lightweight and scalable framework for graph representation learning that encodes topological structure into attention-friendly sequences. At the core of our method is Topo-Scan, a novel module that decomposes a graph into a short, ordered sequence of topological tokens by slicing over node or edge filtrations. These sequences capture multi-scale structural patterns, from local motifs to global organization, and are processed by a Transformer to produce expressive graph-level embeddings. Unlike traditional persistent homology pipelines, Topo-Scan is parallelizable, avoids costly diagram computations, and integrates seamlessly with standard deep learning architectures. We provide theoretical guarantees on the stability of our topological encodings and demonstrate state-of-the-art performance across graph classification and molecular property prediction benchmarks. Our results show that TopoFormer matches or exceeds strong GNN and topology-based baselines while offering predictable and efficient compute. This work opens a new path for parallelizable and unifying approaches to graph representation learning that integrate topological inductive biases into attention frameworks.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Fioresi, Joseph; Dave, Ishan Rajendrakumar; Shah, Mubarak
Privacy Beyond Pixels: Latent Anonymization for Privacy-Preserving Video Understanding Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
Abstract | Tags: ICLR | Links:
@conference{nokey_29,
title = {Privacy Beyond Pixels: Latent Anonymization for Privacy-Preserving Video Understanding},
author = {Joseph Fioresi and Ishan Rajendrakumar Dave and Mubarak Shah},
url = {https://arxiv.org/abs/2511.08666},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {We introduce a novel formulation of visual privacy preservation for video foundation models that operates entirely in the latent space. While spatio-temporal features learned by foundation models have deepened general understanding of video content, sharing or storing these extracted visual features for downstream tasks inadvertently reveals sensitive personal information like skin color, gender, or clothing. Current privacy preservation methods focus on input-pixel level anonymization, which requires retraining the entire utility video model and results in task-specific anonymization, making them unsuitable for recent video foundational models. To address these challenges, we introduce a lightweight Anonymizing Adapter Module (AAM) that removes private information from video features while retaining general task utility. AAM can be applied in a plug and play fashion to frozen video encoders, minimizing the computational burden of finetuning and re-extracting features. Our framework employs three newly designed training objectives: (1) a clip-level self-supervised privacy objective to reduce mutual information between static clips, (2) a co-training objective to retain utility across seen tasks, and (3) a latent consistency loss for generalization on unseen tasks. Our extensive evaluations demonstrate a significant 35% reduction in privacy leakage while maintaining near-baseline utility performance across various downstream tasks: Action Recognition (Kinetics400, UCF101, HMDB51), Temporal Action Detection (THUMOS14), and Anomaly Detection (UCF-Crime). We also provide an....},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Ashraf, Tajamul; Saqib, Amal; Gani, Hanan; AlMahri, Muhra; Li, Yuhao; Ahsan, Noor; Nawaz, Umair; Lahoud, Jean; Cholakkal, Hisham; Shah, Mubarak; Torr, Philip; Khan, Fahad; Anwer, Rao; Khan, Salman
Agent-X: Evaluating Deep Multimodal Reasoning in Vision-Centric Agentic Tasks Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
@conference{nokey_30,
title = {Agent-X: Evaluating Deep Multimodal Reasoning in Vision-Centric Agentic Tasks},
author = {Tajamul Ashraf and Amal Saqib and Hanan Gani and Muhra AlMahri and Yuhao Li and Noor Ahsan and Umair Nawaz and Jean Lahoud and Hisham Cholakkal and Mubarak Shah and Philip Torr and Fahad Khan and Rao Anwer and Salman Khan},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {Deep reasoning is fundamental for solving complex tasks, especially in vision-centric scenarios that demand sequential, multimodal understanding. However, existing benchmarks typically evaluate agents with fully synthetic, single-turn queries, limited visual modalities, and lack a framework to assess reasoning quality over multiple steps as required in real-world settings. To address this, we introduce Agent-X, a large-scale benchmark for evaluating vision-centric agents’ multistep and deep reasoning capabilities in real-world, multimodal settings. AgentX features 828 agentic tasks with authentic visual contexts, including images, multi-image comparisons, videos, and instructional text. These tasks span six major agentic environments: general visual reasoning, web browsing, security and surveillance, autonomous driving, sports, and math reasoning. Our benchmark requires agents to integrate tool use with explicit, stepwise decision-making in these diverse settings. In addition, we propose a fine-grained, step-level evaluation framework that assesses the correctness and logical coherence of each reasoning step and the effectiveness of tool usage throughout the task. Our results reveal that even the best-performing models, including GPT, Gemini, and Qwen families, struggle to solve multi-step vision tasks, achieving less than 50% full-chain success. These findings highlight key bottlenecks in current LMM reasoning and tool-use capabilities and identify future research directions in vision-centric agentic reasoning models},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Farhat, Zain Ulabedeen; Ghosh, Debamita; Atia, George; Wang, Yue
Sample-Efficient Distributionally Robust Multi-Agent Reinforcement Learning via Online Interaction Conference
Fourteenth International Conference on Learning Representations (ICLR), 2026.
@conference{nokey_31,
title = {Sample-Efficient Distributionally Robust Multi-Agent Reinforcement Learning via Online Interaction},
author = {Zain Ulabedeen Farhat and Debamita Ghosh and George Atia and Yue Wang},
year = {2026},
date = {2026-01-26},
booktitle = {Fourteenth International Conference on Learning Representations (ICLR)},
abstract = {Well-trained multi-agent systems can fail when deployed in real-world environments due to model mismatches between the training and deployment environments, caused by environment uncertainties including noise or adversarial attacks. Distributionally Robust Markov Games (DRMGs) enhance system resilience by optimizing for worst-case performance over a defined set of environmental uncertainties. However, current methods are limited by their dependence on simulators or large offline datasets, which are often unavailable. This paper pioneers the study of online learning in DRMGs, where agents learn directly from environmental interactions without prior data. We introduce the Multiplayer Optimistic Robust Nash Value Iteration (MORNAVI) algorithm and provide the first provable guarantees for this setting. Our theoretical analysis demonstrates that the algorithm achieves low regret and efficiently finds the optimal robust policy for uncertainty sets measured by Total Variation divergence and Kullback-Leibler divergence. These results establish a new, practical path toward developing truly robust multi-agent systems.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
2025
Hu, Zixuan; Shen, Li; Wang, Zhenyi; Wei, Yongxian; Tao, Dacheng
Adaptive Defense against Harmful Fine-Tuning via Bayesian Data Scheduler Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Hu2025,
title = {Adaptive Defense against Harmful Fine-Tuning via Bayesian Data Scheduler},
author = {Zixuan Hu and Li Shen and Zhenyi Wang and Yongxian Wei and Dacheng Tao},
url = {https://neurips.cc/virtual/2025/poster/115659},
year = {2025},
date = {2025-11-30},
booktitle = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Harmful fine-tuning poses critical safety risks to fine-tuning-as-a-service for large language models. Existing defense strategies preemptively build robustness via attack simulation but suffer from fundamental limitations: (i) the infeasibility of performing attack simulation due to lacking prior knowledge about potential attack data, and (ii) limited adaptability to varying attack settings, as simulation fails to capture their variability and complexity. To address these challenges, we propose Bayesian Data Scheduler (BDS), an adaptive tuning-stage defense strategy with no need for attack simulation. BDS formulates harmful fine-tuning defense as a Bayesian inference problem, learning the posterior distribution of each data point's safety attribute, conditioned on the fine-tuning and alignment datasets. The fine-tuning process is then constrained by weighting data with their safety attributes sampled from the posterior, thus mitigating the influence of harmful data. By leveraging the post hoc nature of Bayesian inference, the posterior is conditioned on the fine-tuning dataset, enabling BDS to tailor its defense to the specific dataset, thereby achieving adaptive defense. Furthermore, we introduce a neural scheduler based on amortized Bayesian learning, enabling efficient transfer to new data without retraining. Comprehensive results across diverse attack and defense settings demonstrate the state-of-the-art performance of our approach.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Tola, Astrit; Taiwo, Funmilola Mary; Akcora, Cuneyt Gurcan; Coskunuzer, Baris
TopER: Topological Embeddings in Graph Representation Learning Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Tola2025,
title = {TopER: Topological Embeddings in Graph Representation Learning},
author = {Astrit Tola and Funmilola Mary Taiwo and Cuneyt Gurcan Akcora and Baris Coskunuzer},
url = {https://arxiv.org/abs/2410.01778},
year = {2025},
date = {2025-11-30},
booktitle = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Graph embeddings play a critical role in graph representation learning, allowing machine learning models to explore and interpret graph-structured data. However, existing methods often rely on opaque, high-dimensional embeddings, limiting interpretability and practical visualization.
In this work, we introduce Topological Evolution Rate (TopER), a novel, low-dimensional embedding approach grounded in topological data analysis. TopER simplifies a key topological approach, Persistent Homology, by calculating the evolution rate of graph substructures, resulting in intuitive and interpretable visualizations of graph data. This approach not only enhances the exploration of graph datasets but also delivers competitive performance in graph clustering and classification tasks. Our TopER-based models achieve or surpass state-of-the-art results across molecular, biological, and social network datasets in tasks such as classification, clustering, and visualization.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
In this work, we introduce Topological Evolution Rate (TopER), a novel, low-dimensional embedding approach grounded in topological data analysis. TopER simplifies a key topological approach, Persistent Homology, by calculating the evolution rate of graph substructures, resulting in intuitive and interpretable visualizations of graph data. This approach not only enhances the exploration of graph datasets but also delivers competitive performance in graph clustering and classification tasks. Our TopER-based models achieve or surpass state-of-the-art results across molecular, biological, and social network datasets in tasks such as classification, clustering, and visualization.
Shamsi, Kiarash; Ngo, Tran Gia Bao; Shirzadkhani, Razieh; Huang, Shenyang; Poursafaei, Farimah; Azad, Poupak; Rabbany, Reihaneh; Coskunuzer, Baris; Rabusseau, Guillaume; Akcora, Cuneyt Gurcan
MiNT: Multi-Network Transfer Benchmark for Temporal Graph Learning Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Shamsi2025,
title = {MiNT: Multi-Network Transfer Benchmark for Temporal Graph Learning},
author = {Kiarash Shamsi and Tran Gia Bao Ngo and Razieh Shirzadkhani and Shenyang Huang and Farimah Poursafaei and Poupak Azad and Reihaneh Rabbany and Baris Coskunuzer and Guillaume Rabusseau and Cuneyt Gurcan Akcora},
url = {https://neurips.cc/virtual/2025/poster/121574
https://github.com/benjaminnNgo/ScalingTGNs},
year = {2025},
date = {2025-11-30},
booktitle = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Temporal Graph Learning (TGL) aims to discover patterns in evolving networks or temporal graphs and leverage these patterns to predict future interactions. However, most existing research focuses on learning from a single network in isolation, leaving the challenges of within-domain and cross-domain generalization largely unaddressed. In this study, we introduce a new benchmark of 84 real-world temporal transaction networks and propose Temporal Multi-network Transfer (MiNT), a pre-training framework designed to capture transferable temporal dynamics across diverse networks. We train MiNT models on up to 64 transaction networks and evaluate their generalization ability on 20 held-out, unseen networks. Our results show that MiNT consistently outperforms individually trained models, revealing a strong relation between the number of pre-training networks and transfer performance. These findings highlight scaling trends in temporal graph learning and underscore the importance of network diversity in improving generalization. This work establishes the first large-scale benchmark for studying transferability in TGL and lays the groundwork for developing Temporal Graph Foundation Models. Our code is available at url{https://github.com/benjaminnNgo/ScalingTGNs}},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Lyu, Zonglin; Li, Ming; Liu, Xinxin; Chen, Chen
CPO: Condition Preference Optimization for Controllable Image Generation Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Lyu2025b,
title = {CPO: Condition Preference Optimization for Controllable Image Generation},
author = {Zonglin Lyu and Ming Li and Xinxin Liu and Chen Chen},
url = {https://neurips.cc/virtual/2025/poster/117815},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {To enhance controllability in text-to-image generation, ControlNet introduces image-based control signals, while ControlNet++ improves pixel-level cycle consistency between generated images and the input control signal. To avoid the prohibitive cost of back-propagating through the sampling process, ControlNet++ optimizes only low-noise timesteps (e.g., $t \leq t_0$) using a single-step approximation, which not only ignores the contribution of high-noise timesteps but also introduces additional approximation errors. A straightforward alternative for optimizing controllability across all timesteps is Direct Preference Optimization (DPO), a fine-tuning method that increases model preference for more controllable images ($x^w$) over less controllable ones ($x^l$). However, due to uncertainty in generative models, it is difficult to ensure that win--lose image pairs differ only in controllability while keeping other factors, such as image quality, fixed. To address this, we propose performing preference learning over control conditions rather than generated images. Specifically, we construct winning and losing control signals, $c^w$ and $c^l$, and train the model to prefer $c^w$ over $c^l$. This method, which we term \textit{Condition Preference Optimization} (CPO), eliminates confounding factors and yields a low-variance training objective. Our approach theoretically exhibits lower contrastive loss variance than DPO and empirically achieves superior results. Moreover, CPO requires less computation and storage for dataset curation. Extensive experiments show that CPO significantly improves controllability over the state-of-the-art ControlNet++ across multiple control types, with substantial error rate reductions in segmentation and human pose, and consistent reductions in edge and depth maps. Here, the error rate is defined as the difference between evaluated controllability and oracle results.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
ControlNet++ optimizes only low-noise timesteps using a single-step approximation, which not only ignores the contribution of high-noise timesteps but also introduces additional approximation errors. A straightforward alternative for optimizing controllability across all timesteps is Direct Preference Optimization (DPO), a fine-tuning method that increases model preference for more controllable images over less controllable ones. However, due to uncertainty in generative models, it is difficult to ensure that win--lose image pairs differ only in controllability while keeping other factors, such as image quality, fixed. To address this, we propose performing preference learning over control conditions rather than generated images. Specifically, we construct winning and losing control signals and train the model to prefer the winning condition. This method, which we term Condition Preference Optimization (CPO), eliminates confounding factors and yields a low-variance training objective. Our approach theoretically exhibits lower contrastive loss variance than DPO and empirically achieves superior results. Moreover, CPO requires less computation and storage for dataset curation. Extensive experiments show that CPO significantly improves controllability over the state-of-the-art ControlNet++ across multiple control types, with substantial error rate reductions in segmentation and human pose, and consistent reductions in edge and depth maps. Here, the error rate is defined as the difference between evaluated controllability and oracle results.
Zhang, Yancheng; Sun, Guangyu; Chen, Chen
EGGS: Exchangeable 2D/3D Gaussian Splatting for Geometry-Appearance Balanced Novel View Synthesis Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Zhang2025d,
title = {EGGS: Exchangeable 2D/3D Gaussian Splatting for Geometry-Appearance Balanced Novel View Synthesis},
author = {Yancheng Zhang and Guangyu Sun and Chen Chen},
url = {https://neurips.cc/virtual/2025/poster/120173},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Novel view synthesis (NVS) is crucial in computer vision and graphics, with wide applications in AR, VR, and autonomous driving. While 3D Gaussian Splatting (3DGS) enables real-time rendering with high appearance fidelity, it suffers from multi-view inconsistencies, limiting geometric accuracy. In contrast, 2D Gaussian Splatting (2DGS) enforces multi-view consistency but compromises texture details. To address these limitations, we propose Exchangeable Gaussian Splatting (EGGS), a hybrid representation that integrates 2D and 3D Gaussians to balance appearance and geometry. To achieve this, we introduce Hybrid Gaussian Rasterization for unified rendering, Adaptive Type Exchange for dynamic adaptation between 2D and 3D Gaussians, and Frequency-Decoupled Optimization that effectively exploits the strengths of each type of Gaussian representation. Our CUDA-accelerated implementation ensures efficient training and inference. Extensive experiments demonstrate that EGGS outperforms existing methods in rendering quality, geometric accuracy, and efficiency, providing a practical solution for high-quality NVS.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Qu, Huaizhi; Choi, Inyoung; Tan, Zhen; Wang, Song; Yun, Sukwon; Long, Qi; Siddiqui, Faizan; Lee, Kwonjoon; Chen, Tianlong
BetaConform: Efficient MAP Estimation of LLM Ensemble Judgment Performance with Prior Transfer Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Qu2025,
title = {BetaConform: Efficient MAP Estimation of LLM Ensemble Judgment Performance with Prior Transfer},
author = {Huaizhi Qu and Inyoung Choi and Zhen Tan and Song Wang and Sukwon Yun and Qi Long and Faizan Siddiqui and Kwonjoon Lee and Tianlong Chen},
url = {https://arxiv.org/abs/2504.12589},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {LLM ensembles are widely used for LLM judges. However, how to estimate their accuracy, especially in an efficient way, is unknown. In this paper, we present a principled maximum a posteriori (MAP) framework for an economical and precise estimation of the performance of LLM ensemble judgment. We first propose a mixture of Beta-Binomial distributions to model the judgment distribution, revising from the vanilla Binomial distribution. Next, we introduce a conformal prediction-driven approach that enables adaptive stopping during iterative sampling to balance accuracy with efficiency. Furthermore, we design a prior transfer mechanism that utilizes learned distributions on open-source datasets to improve estimation on a target dataset when only scarce annotations are available. Finally, we present BetaConform, a framework that integrates our distribution assumption, adaptive stopping, and the prior transfer mechanism to deliver a theoretically guaranteed distribution estimation of LLM ensemble judgment with minimum labeled samples. BetaConform is also validated empirically. For instance, with only 10 samples from the TruthfulQA dataset, for a Llama ensembled judge, BetaConform gauges its performance with error margin as small as 3.37%.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Hu, Tianyu; Tan, Zhen; Wang, Song; Qu, Huaizhi; Chen, Tianlong
Multi-Agent Debate for LLM Judges with Adaptive Stability Detection Conference
2025.
Abstract | Tags: NeurIPS | Links:
@conference{Hu2025b,
title = {Multi-Agent Debate for LLM Judges with Adaptive Stability Detection},
author = {Tianyu Hu and Zhen Tan and Song Wang and Huaizhi Qu and Tianlong Chen},
url = {https://neurips.cc/virtual/2025/poster/117644},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {With advancements in reasoning capabilities, Large Language Models (LLMs) are increasingly employed for automated judgment tasks. While LLMs-as-Judges offer promise in automating evaluations, current approaches often rely on simplistic aggregation methods (e.g., majority voting), which can fail even when individual agents provide correct answers. To address this, we propose a multi-agent debate judge framework where agents collaboratively reason and iteratively refine their responses. We formalize the debate process mathematically, analyzing agent interactions and proving that debate amplifies correctness compared to static ensembles. To enhance efficiency, we introduce a stability detection mechanism that models judge consensus dynamics via a time-varying Beta-Binomial mixture, with adaptive stopping based on distributional similarity (Kolmogorov-Smirnov test). This mechanism models the judges' collective correct rate dynamics using a time-varying mixture of Beta-Binomial distributions and employs an adaptive stopping criterion based on distributional similarity (Kolmogorov-Smirnov statistic). Experiments across multiple benchmarks and models demonstrate that our framework improves judgment accuracy over majority voting while maintaining computational efficiency.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
He, Yinhan; Zheng, Wendy; Wang, Song; Zheng, Zaiyi; Dong, Yushun; Zhu, Yaochen; Li, Jundong
Hierarchical Demonstration Order Optimization for Many-shot In-Context Learning Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{He2025,
title = {Hierarchical Demonstration Order Optimization for Many-shot In-Context Learning},
author = {Yinhan He and Wendy Zheng and Song Wang and Zaiyi Zheng and Yushun Dong and Yaochen Zhu and Jundong Li},
url = {https://neurips.cc/virtual/2025/poster/119561
https://anonymous.4open.science/r/HIDO-B2DE/},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {In-Context Learning (ICL) is a technique where large language models (LLMs) leverage multiple demonstrations (i.e., examples) to perform tasks. With the recent expansion of LLM context windows, many-shot ICL (generally with more than 50 demonstrations) can lead to significant performance improvements on a variety of language tasks such as text classification and question answering.Nevertheless, ICL faces the issue of demonstration order instability (ICL-DOI), which means that performance varies significantly depending on the order of demonstrations. Moreover, ICL-DOI persists in many-shot ICL, validated by our thorough experimental investigation.Current strategies for handling ICL-DOI are not applicable to many-shot ICL due to two critical challenges: (1) Most existing methods assess demonstration order quality by first prompting the LLM, then using heuristic metrics based on the LLM's predictions. In the many-shot scenarios, these metrics without theoretical grounding become unreliable, where the LLMs struggle to effectively utilize information from long input contexts, making order distinctions less clear.(2) The requirement to examine all orders for the large number of demonstrations is computationally infeasible due to the super-exponential complexity of the order space in many-shot ICL. To tackle the first challenge, we design a demonstration order evaluation metric based on information theory for measuring order quality, which effectively quantifies the usable information gain of a given demonstration order.To address the second challenge, we propose a hierarchical demonstration order optimization method named HIDO that enables a more refined exploration of the order space, achieving high ICL performance without the need to evaluate all possible orders. Extensive experiments on multiple LLMs and real-world datasets demonstrate that our HIDO method consistently and efficiently outperforms other baselines. 
Our code project can be found at https://anonymous.4open.science/r/HIDO-B2DE/.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Gupta, Animesh; Parmar, Jay; Dave, Ishan Rajendrakumar; Shah, Mubarak
From Play to Replay: Composed Video Retrieval for Sports Highlights Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Gupta2025,
title = {From Play to Replay: Composed Video Retrieval for Sports Highlights},
author = {Animesh Gupta and Jay Parmar and Ishan Rajendrakumar Dave and Mubarak Shah},
url = {https://neurips.cc/virtual/2025/poster/121717
https://animesh-007.github.io/TF-CoVR-WEBSITE/},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Composed Video Retrieval (CoVR) retrieves a target video given a query video and a modification text describing the intended change. Existing CoVR benchmarks emphasize appearance shifts or coarse event changes and therefore do not test the ability to capture subtle, fast-paced temporal differences. We introduce TF-CoVR, the first large-scale benchmark dedicated to temporally fine-grained CoVR. TF-CoVR focuses on gymnastics and diving and provides 1.8 M triplets drawn from FineGym and FineDiving. Previous CoVR benchmarks focusing on temporal aspect, link each query to a single target segment taken from the same video, limiting practical usefulness. In TF-CoVR, we instead construct each pair by prompting an LLM with the label differences between clips drawn from different videos; every pair is thus associated with multiple valid target videos (3.9 on average), reflecting real-world tasks such as sports-highlight generation. To model these temporal dynamics we propose TF-CoVR-Base, a concise two-stage training framework: (i) pre-train a video encoder on fine-grained action classification to obtain temporally discriminative embeddings; (ii) align the composed query with candidate videos using contrastive learning. We conduct the first comprehensive study of image, video, and general multimodal embedding (GME) models on temporally fine-grained composed retrieval in both zero-shot and fine-tuning regimes. On TF-CoVR, TF-CoVR-Base improves zero-shot mAP@50 from 5.92 (LanguageBind) to 7.51, and after fine-tuning raises the state of the art from 19.83 to 25.82.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Carnemolla, Simone; Pennisi, Matteo; Samarasinghe, Sarinda; Bellitto, Giovanni; Palazzo, Simone; Giordano, Daniela; Shah, Mubarak; Spampinato, Concetto
DEXTER: Diffusion-Guided EXplanations with TExtual Reasoning for Vision Models Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Carnemolla2025,
title = {DEXTER: Diffusion-Guided EXplanations with TExtual Reasoning for Vision Models},
author = {Simone Carnemolla and Matteo Pennisi and Sarinda Samarasinghe and Giovanni Bellitto and Simone Palazzo and Daniela Giordano and Mubarak Shah and Concetto Spampinato},
url = {https://neurips.cc/virtual/2025/poster/117167},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Understanding and explaining the behavior of machine learning models is essential for building transparent and trustworthy AI systems. We introduce DEXTER, a data-free framework that combines diffusion models and large language models to generate global, textual explanations of visual classifiers. DEXTER operates by optimizing text prompts to synthesize class-conditional images that strongly activate a target classifier. These synthetic samples are then used to elicit detailed natural language reports that describe class-specific decision patterns and biases. Unlike prior work, DEXTER enables natural language reasoning about a classifier's decision process without access to training data or ground-truth labels. We demonstrate DEXTER's flexibility across three tasks—activation maximization, slice discovery and debiasing, and bias explanation—each illustrating its ability to uncover the internal mechanisms of visual classifiers. Quantitative and qualitative evaluations, including a user study, show that DEXTER produces accurate, interpretable outputs. Experiments on ImageNet, Waterbirds, CelebA, and FairFaces confirm that DEXTER outperforms existing approaches in global model explanation and class-level bias reporting.
},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Ghosh, Ipsita; Nguyen, Ethan; Kümmerle, Christian
Q3R: Quadratic Reweighted Rank Regularizer for Effective Low-Rank Training Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Ghosh2025,
title = {Q3R: Quadratic Reweighted Rank Regularizer for Effective Low-Rank Training},
author = {Ipsita Ghosh and Ethan Nguyen and Christian Kümmerle},
url = {https://neurips.cc/virtual/2025/poster/117315},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Parameter-efficient training, based on low-rank optimization, has become a highly successful tool for fine-tuning large deep-learning models. However, these methods fail at low-rank pretraining tasks where maintaining the low-rank structure and the objective remains a challenging task. We propose the Quadratic Reweighted Rank Regularizer dubbed Q3R, which leads to a novel low-rank inducing training strategy inspired by the iteratively reweighted least squares (IRLS) framework. Q3R is based on a quadratic regularizer term which majorizes a smoothed log determinant serving as rank surrogate objective. Unlike other low-rank training techniques, Q3R is able to train weight matrices with prescribed, low target ranks of models that achieve comparable predictive performance as dense models, with small computational overhead, while remaining fully compatible with existing architectures. In experiments, we are able to truncate 60% of the parameters of a ViT-Tiny model with marginal loss in CIFAR-10 performance and up to 80% with only 4% accuracy drop. The efficacy of Q3R is confirmed on Transformers across both image and language tasks. To demonstrate Q3R task agnosticism, we fine-tune RoBERTa using Q3R regularized dense layers models on GLUE tasks, achieving performance comparable to state-of-the-art low-rank adapters.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Zhao, Zhenghao; Wang, Haoxuan; Wu, Junyi; Shang, Yuzhang; Liu, Gaowen; Yan, Yan
Efficient Multimodal Dataset Distillation via Generative Models Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Zhao2025,
title = {Efficient Multimodal Dataset Distillation via Generative Models},
author = {Zhenghao Zhao and Haoxuan Wang and Junyi Wu and Yuzhang Shang and Gaowen Liu and Yan Yan},
url = {https://neurips.cc/virtual/2025/poster/119089},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Dataset distillation aims to synthesize a small dataset from a large dataset, enabling the model trained on it to perform well on the original dataset. With the blooming of large language models and multimodal large language models, the importance of multimodal datasets, particularly image-text datasets, has grown significantly. However, existing multimodal dataset distillation methods are constrained by the Matching Training Trajectories algorithm, which significantly increases the computing resource requirement, and takes days to process the distillation. In this work, we introduce EDGE, a generative distillation method for efficient multimodal dataset distillation. Specifically, we identify two key challenges of distilling multimodal datasets with generative models: 1) The lack of correlation between generated images and captions.2) The lack of diversity among generated samples.To address the aforementioned issues, we propose a novel generative model training workflow with a bi-directional contrastive loss and a diversity loss. Furthermore, we propose a caption synthesis strategy to further improve text-to-image retrieval performance by introducing more text information. Our method is evaluated on Flickr30K, COCO, and CC3M datasets, demonstrating superior performance and efficiency compared to existing approaches. Notably, our method achieves results 18
$\times$ faster than the state-of-the-art method.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
faster than the state-of-the-art method.
Xue, Jiaqi; Kumar, Mayank; Shang, Yuzhang; Gao, Shangqian; Ning, Rui; Zheng, Mengxin; Jiang, Xiaoqian; Lou, Qian
DictPFL: Efficient and Private Federated Learning on Encrypted Gradients Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Xue2025,
title = {DictPFL: Efficient and Private Federated Learning on Encrypted Gradients},
author = {Jiaqi Xue and Mayank Kumar and Yuzhang Shang and Shangqian Gao and Rui Ning and Mengxin Zheng and Xiaoqian Jiang and Qian Lou},
url = {https://neurips.cc/virtual/2025/poster/119806},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Federated learning (FL) enables institutions to collaboratively train machine learning models by aggregating local gradients without sharing sensitive data. However, sharing gradients still poses privacy risks, e.g., gradient inversion attacks. Homomorphic encryption (HE) can be used in FL to encrypt gradients at the data owner's side, enabling secure aggregation without decryption on the server. Existing HE approaches to FL lie at two extremes. One encrypts every gradient update, providing strong privacy but incurring prohibitive computation and bandwidth costs. The other encrypts only a subset of gradients, reducing overhead yet leaving the remaining plaintext updates vulnerable to privacy attacks. Our proposed DictPFL bridges this gap. It encrypts every gradient that must be transmitted to the server—protecting all shared information—while keeping the rest of the (unencrypted) gradients on the client, where they never leave the device. By safeguarding every transmitted update, DictPFL achieves the same privacy guarantees as fully encrypted FL, but its selective-encryption strategy slashes computational and communication overhead. DictPFL comprises two modules: Decompose-for-Partial-Encrypt (DePE) and Prune-for-Minimum-Encrypt (PrME). In DePE, we decompose model weights to be trained into a dictionary and a lookup table. Only the gradients of the lookup table are encrypted and aggregated securely while the dictionary remains fixed and is not transmitted for aggregation. In PrME, we aim to further minimize the encrypted parameters with an encryption-aware pruning technique that ensures a consistent pruning mask across clients by leveraging the history of global gradients. Experimental results demonstrate that DictPFL significantly reduces communication overhead by 402 to 748 times and speeds training by 28 to 65 times compared to fully encrypted method. 
It also outperforms the state-of-the-art selectively encrypted gradient by lowering overhead by 51 to 155 times and accelerating training by 4 to 19 times. DictPFL increases training time by even less than a 2
$\times$ factor compared with its plaintext counterpart without gradients protection, demonstrating—for the first time—that HE-based private federated learning is practical for real-world deployment.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
factor compared with its plaintext counterpart without gradients protection, demonstrating—for the first time—that HE–based private federated learning is practical for real-world deployment.
Adak, Deepan; Rawat, Yogesh; Vyas, Shruti
MolVision: Molecular Property Prediction with Vision Language Models Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Adak2025,
title = {MolVision: Molecular Property Prediction with Vision Language Models},
author = {Deepan Adak and Yogesh Rawat and Shruti Vyas},
url = {https://neurips.cc/virtual/2025/poster/121822
https://chemvision.github.io/chemvision/},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Molecular property prediction is a fundamental task in computational chemistry with critical applications in drug discovery and materials science. While recent works have explored Large Language Models (LLMs) for this task, they primarily rely on textual molecular representations such as SMILES/SELFIES, which can be ambiguous and structurally uninformative. In this work, we introduce MolVision, a novel approach that leverages Vision-Language Models (VLMs) by integrating both molecular structure images and textual descriptions to enhance property prediction. We construct a benchmark spanning nine diverse datasets, covering both classification and regression tasks. Evaluating nine different VLMs in zero-shot, few-shot, and fine-tuned settings, we find that visual information improves prediction performance, particularly when combined with efficient fine-tuning strategies such as LoRA. Our results reveal that while visual information alone is insufficient, multimodal fusion significantly enhances generalization across molecular properties. Adaptation of vision encoder for molecular images in conjunction with LoRA further improves the performance. The code and data is available at : https://chemvision.github.io/chemvision/.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Chen, Haodong; Huang, Haojian; Chen, Qifeng; Yang, Harry; Lim, Ser Nam
Hierarchical Fine-grained Preference Optimization for Physically Plausible Video Generation Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Chen2025d,
title = {Hierarchical Fine-grained Preference Optimization for Physically Plausible Video Generation},
author = {Haodong Chen and Haojian Huang and Qifeng Chen and Harry Yang and Ser Nam Lim},
url = {https://neurips.cc/virtual/2025/poster/115193},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Recent advancements in video generation have enabled the creation of high-quality, visually compelling videos. However, generating videos that adhere to the laws of physics remains a critical challenge for applications requiring realism and accuracy. In this work, we propose PhysHPO, a novel framework for Hierarchical Cross-Modal Direct Preference Optimization, to tackle this challenge by enabling fine-grained preference alignment for physically plausible video generation. PhysHPO optimizes video alignment across four hierarchical granularities: a) Instance Level, aligning the overall video content with the input prompt; b) State Level, ensuring temporal consistency using boundary frames as anchors; c) Motion Level, modeling motion trajectories for realistic dynamics; and d) Semantic Level, maintaining logical consistency between narrative and visuals. Recognizing that real-world videos are the best reflections of physical phenomena, we further introduce an automated data selection pipeline to efficiently identify and utilize "good data" from existing large-scale text-video datasets, thereby eliminating the need for costly and time-intensive dataset construction. Extensive experiments on both physics-focused and general capability benchmarks demonstrate that PhysHPO significantly improves physical plausibility and overall video generation quality of advanced models. To the best of our knowledge, this is the first work to explore fine-grained preference alignment and data selection for video generation, paving the way for more realistic and human-preferred video generation paradigms.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Huang, Jiani; Keoliya, Mayank; Kuo, Matthew; Velingker, Neelay; Sethi, Amish; Jung, JungHo; Li, Ziyang; Lim, Ser Nam; Naik, Mayur
ESCA: Contextualizing Embodied Agents via Scene-Graph Generation Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Huang2025b,
title = {ESCA: Contextualizing Embodied Agents via Scene-Graph Generation},
author = {Jiani Huang and Mayank Keoliya and Matthew Kuo and Neelay Velingker and Amish Sethi and JungHo Jung and Ziyang Li and Ser Nam Lim and Mayur Naik},
url = {https://neurips.cc/virtual/2025/poster/117064},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Multi-modal large language models (MLLMs) are making rapid progress toward general-purpose embodied agents. However, current training pipelines primarily rely on high-level vision-sound-text pairs and lack fine-grained, structured alignment between pixel-level visual content and textual semantics. To overcome this challenge, we propose ESCA, a new framework for contextualizing embodied agents through structured spatial-temporal understanding. At its core is SGClip, a novel CLIP-based, open-domain, and promptable model for generating scene graphs. SGClip is trained on 87K+ open-domain videos via a neurosymbolic learning pipeline, which harnesses model-driven self-supervision from video-caption pairs and structured reasoning, thereby eliminating the need for human-labeled scene graph annotations. We demonstrate that SGClip supports both prompt-based inference and task-specific fine-tuning, excelling in scene graph generation and action localization benchmarks. ESCA with SGClip consistently improves both open-source and commercial MLLMs, achieving state-of-the-art performance across two embodied environments. Notably, it significantly reduces agent perception errors and enables open-source models to surpass proprietary baselines.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Shu, Yan; Lin, Hangui; Liu, Yexin; Zhang, Yan; Zeng, Gangyan; Li, Yan; Zhou, Yu; Lim, Ser Nam; Yang, Harry; Sebe, Nicu
When Semantics Mislead Vision: Mitigating Large Multimodal Models Hallucinations in Scene Text Spotting and Understanding Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Shu2025,
title = {When Semantics Mislead Vision: Mitigating Large Multimodal Models Hallucinations in Scene Text Spotting and Understanding},
author = {Yan Shu and Hangui Lin and Yexin Liu and Yan Zhang and Gangyan Zeng and Yan Li and Yu Zhou and Ser Nam Lim and Harry Yang and Nicu Sebe},
url = {https://neurips.cc/virtual/2025/poster/119366},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Large Multimodal Models (LMMs) have achieved impressive progress in visual perception and reasoning. However, when confronted with visually ambiguous or non-semantic scene text, they often struggle to accurately spot and understand the content, frequently generating semantically plausible yet visually incorrect answers, which we refer to as semantic hallucination. In this work, we investigate the underlying causes of semantic hallucination and identify a key finding: Transformer layers in LLM with stronger attention focus on scene text regions are less prone to producing semantic hallucinations. Thus, we propose a training-free semantic hallucination mitigation framework comprising two key components: (1) ZoomText, a coarse-to-fine strategy that identifies potential text regions without external detectors; and (2) Grounded Layer Correction, which adaptively leverages the internal representations from layers less prone to hallucination to guide decoding, correcting hallucinated outputs for non-semantic samples while preserving the semantics of meaningful ones. To enable rigorous evaluation, we introduce TextHalu-Bench, a benchmark of over 1,730 samples spanning both semantic and non-semantic cases, with manually curated question–answer pairs designed to probe model hallucinations. Extensive experiments demonstrate that our method not only effectively mitigates semantic hallucination but also achieves strong performance on public benchmarks for scene text spotting and understanding.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Ghosal, Soumya Suvra; Chakraborty, Souradip; Reddy, Avinash; Lu, Yifu; Wang, Mengdi; Manocha, Dinesh; Huang, Furong; Ghavamzadeh, Mohammad; Bedi, Amrit Singh
Does Thinking More Always Help? Understanding Test-Time Scaling in Reasoning Models Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Ghosal2025b,
  title     = {Does Thinking More Always Help? Understanding Test-Time Scaling in Reasoning Models},
  author    = {Soumya Suvra Ghosal and Souradip Chakraborty and Avinash Reddy and Yifu Lu and Mengdi Wang and Dinesh Manocha and Furong Huang and Mohammad Ghavamzadeh and Amrit Singh Bedi},
  url       = {https://neurips.cc/virtual/2025/poster/115605},
  year      = {2025},
  date      = {2025-11-30},
  publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
  abstract  = {Recent trends in test-time scaling for reasoning models (e.g., OpenAI o1, DeepSeek R1) have led to a popular belief that extending thinking traces using prompts like “Wait” or “Let me rethink” can improve performance. This raises a natural question: Does thinking more at test-time truly lead to better reasoning? To answer this question, we perform a detailed empirical study across models and benchmarks, which reveals a consistent pattern of initial performance improvements from additional thinking followed by a decline, due to "overthinking". To understand this non-monotonic trend, we consider a simple probabilistic model, which reveals that additional thinking increases output variance—creating an illusion of improved reasoning while ultimately undermining precision. Thus, observed gains from "more thinking" are not true indicators of improved reasoning, but artifacts stemming from the connection between model uncertainty and evaluation metric. This suggests that test-time scaling through extended thinking is not an effective way to utilize the inference thinking budget. Recognizing these limitations, we introduce an alternative test-time scaling approach, parallel thinking, inspired by Best-of-N sampling. Our method generates multiple independent reasoning paths within the same inference budget and selects the most consistent response via majority vote, achieving up to 20% higher accuracy compared to extended thinking. This provides a simple yet effective mechanism for test-time scaling of reasoning models.},
  keywords  = {NeurIPS},
  pubstate  = {published},
  tppubtype = {conference}
}
Mudit; Singh, Utsav; Bedi, Amrit Singh; Pasupathy, Raghu; Aggarwal, Vaneet
On the Sample Complexity Bounds of Bilevel Reinforcement Learning Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Mudit2025,
title = {On the Sample Complexity Bounds of Bilevel Reinforcement Learning},
author = {Mudit and Utsav Singh and Amrit Singh Bedi and Raghu Pasupathy and Vaneet Aggarwal},
url = {https://neurips.cc/virtual/2025/poster/116657},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Bilevel reinforcement learning (BRL) has emerged as a powerful framework for aligning generative models, yet its theoretical foundations, especially sample complexity bounds, remain underexplored. In this work, we present the first sample complexity bound for BRL, establishing a rate of
in continuous state-action spaces. Traditional MDP analysis techniques do not extend to BRL due to its nested structure and non-convex lower-level problems. We overcome these challenges by leveraging the Polyak-Łojasiewicz (PL) condition and the MDP structure to obtain closed-form gradients, enabling tight sample complexity analysis. Our analysis also extends to general bi-level optimization settings with non-convex lower levels, where we achieve state-of-the-art sample complexity results of
improving upon existing bounds of
. Additionally, we address the computational bottleneck of hypergradient estimation by proposing a fully first-order, Hessian-free algorithm suitable for large-scale problems.},
internal-note = {NOTE(review): the sample-complexity formulas in the abstract were stripped during extraction (after "a rate of", "results of", and "bounds of"), leaving mid-sentence line breaks; restore the math from the published paper - TODO confirm the exact rates before relying on this abstract},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
@comment{Editorial note: the text below is a duplicated fragment of the Mudit2025 abstract (its mathematical rate expressions were stripped during extraction); wrapped in @comment so this residue is clearly not part of any entry.
in continuous state-action spaces. Traditional MDP analysis techniques do not extend to BRL due to its nested structure and non-convex lower-level problems. We overcome these challenges by leveraging the Polyak-Łojasiewicz (PL) condition and the MDP structure to obtain closed-form gradients, enabling tight sample complexity analysis. Our analysis also extends to general bi-level optimization settings with non-convex lower levels, where we achieve state-of-the-art sample complexity results of
improving upon existing bounds of
. Additionally, we address the computational bottleneck of hypergradient estimation by proposing a fully first-order, Hessian-free algorithm suitable for large-scale problems.}
Barakat, Anas; Chakraborty, Souradip; Yu, Peihong; Tokekar, Pratap; Bedi, Amrit Singh
On the Global Optimality of Policy Gradient Methods in General Utility Reinforcement Learning Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Barakat2025,
  title     = {On the Global Optimality of Policy Gradient Methods in General Utility Reinforcement Learning},
  author    = {Anas Barakat and Souradip Chakraborty and Peihong Yu and Pratap Tokekar and Amrit Singh Bedi},
  url       = {https://neurips.cc/virtual/2025/poster/117237},
  year      = {2025},
  date      = {2025-11-30},
  publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
  abstract  = {Reinforcement learning with general utilities (RLGU) offers a unifying framework to capture several problems beyond standard expected returns, including imitation learning, pure exploration, and safe RL. Despite recent fundamental advances in the theoretical analysis of policy gradient (PG) for standard RL and recent efforts in RLGU, the understanding of PG methods and their scope of application in RLGU still remain limited. In this work, we establish global optimality guarantees of PG methods for RLGU in which the objective is a general concave utility function of the state-action occupancy measure. In the tabular setting, we provide global optimality results using a new proof technique building on recent theoretical developments on the convergence of PG methods for standard RL using gradient domination. Our proof technique opens avenues for analyzing policy parameterizations beyond the direct policy parameterization for RLGU. In addition, we provide global optimality results for large state action space settings beyond prior work which has mostly focused on the tabular setting. In this large scale setting, we adapt PG methods by approximating occupancy measures within a function approximation class using maximum likelihood estimation. Our sample complexity only scales with the dimension of our function approximation class rather than the size of the state action space.},
  keywords  = {NeurIPS},
  pubstate  = {published},
  tppubtype = {conference}
}
Rai, Daking; Miller, Samuel; Moran, Kevin; Yao, Ziyu
Failure by Interference: Language Models Make Balanced Parentheses Errors When Faulty Mechanisms Overshadow Sound Ones Conference
The Thirty-Ninth Annual Conference on Neural Information Processing Systems, 2025.
Abstract | Tags: NeurIPS | Links:
@conference{Rai2025,
title = {Failure by Interference: Language Models Make Balanced Parentheses Errors When Faulty Mechanisms Overshadow Sound Ones},
author = {Daking Rai and Samuel Miller and Kevin Moran and Ziyu Yao},
url = {https://neurips.cc/virtual/2025/poster/120187},
year = {2025},
date = {2025-11-30},
publisher = {The Thirty-Ninth Annual Conference on Neural Information Processing Systems},
abstract = {Despite remarkable advances in coding capabilities, language models (LMs) still struggle with simple syntactic tasks such as generating balanced parentheses. In this study, we investigate the underlying mechanisms behind the persistence of these errors across LMs of varying sizes (124M–7B) to both understand and mitigate the errors. Our study reveals that LMs rely on a number of components (attention heads and FF neurons) that independently make their own predictions. While some components reliably promote correct answers across a generalized range of inputs (i.e., implementing ``sound mechanisms''), others are less reliable and introduce noise by promoting incorrect tokens (i.e., implementing ``faulty mechanisms''). Errors occur when the faulty mechanisms overshadow the sound ones and dominantly affect the predictions. Motivated by this insight, we introduce RASteer, a steering method to systematically identify and increase the contribution of reliable components for improving model performance. RASteer substantially improves performance on balanced parentheses tasks, boosting accuracy of some models from 0% to around 100% without impairing the models' general coding ability. We further demonstrate its broader applicability in arithmetic reasoning tasks, achieving performance gains of up to around 20%.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Shafique, Bhuiyan Sanjid; Vayani, Ashmal; Maaz, Muhammad; Rasheed, Hanoona Abdul; Dissanayake, Dinura; Kurpath, Mohammed Irfan; Hmaiti, Yahya; Inoue, Go; Lahoud, Jean; Rashid, Md. Safirur; Quasem, Shadid Intisar; Fatima, Maheen; Vidal, Franco; Maslych, Mykola; More, Ketan Pravin; Baliah, Sanoojan; Watawana, Hasindri; Li, Yuhao; Farestam, Fabian; Schaller, Leon; Tymtsiv, Roman; Weber, Simon; Cholakkal, Hisham; Laptev, Ivan; Satoh, Shin'ichi; Felsberg, Michael; Shah, Mubarak; Khan, Salman; Khan, Fahad Shahbaz
A Culturally-diverse Multilingual Multimodal Video Benchmark & Model Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Shafique2025,
  title     = {A Culturally-diverse Multilingual Multimodal Video Benchmark & Model},
  author    = {Bhuiyan Sanjid Shafique and Ashmal Vayani and Muhammad Maaz and Hanoona Abdul Rasheed and Dinura Dissanayake and Mohammed Irfan Kurpath and Yahya Hmaiti and Go Inoue and Jean Lahoud and Md. Safirur Rashid and Shadid Intisar Quasem and Maheen Fatima and Franco Vidal and Mykola Maslych and Ketan Pravin More and Sanoojan Baliah and Hasindri Watawana and Yuhao Li and Fabian Farestam and Leon Schaller and Roman Tymtsiv and Simon Weber and Hisham Cholakkal and Ivan Laptev and Shin'ichi Satoh and Michael Felsberg and Mubarak Shah and Salman Khan and Fahad Shahbaz Khan},
  url       = {https://mbzuai-oryx.github.io/ViMUL/
https://arxiv.org/abs/2506.07032
https://huggingface.co/datasets/MBZUAI/ViMUL-Bench},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Large multimodal models (LMMs) have recently gained attention due to their effectiveness to understand and generate descriptions of visual content. Most existing LMMs are in English language. While few recent works explore multilingual image LMMs, to the best of our knowledge, moving beyond the English language for cultural and linguistic inclusivity is yet to be investigated in the context of video LMMs. In pursuit of more inclusive video LMMs, we introduce a multilingual Video LMM benchmark, named ViMUL-Bench, to evaluate Video LMMs across 14 languages, including both low and high-resource languages: English, Chinese, Spanish, French, German, Hindi, Arabic, Russian, Bengali, Urdu, Sinhala, Tamil, Swedish, and Japanese. Our ViMUL-Bench is designed to rigorously test video LMMs across 15 categories including eight culturally diverse categories, ranging from lifestyles and festivals to foods and rituals and from local landmarks to prominent cultural personalities. ViMUL-Bench comprises both open-ended (short and long-form) and multiple-choice questions spanning various video durations (short, medium, and long) with 8k samples that are manually verified by native language speakers. In addition, we also introduce a machine translated multilingual video training set comprising 1.2 million samples and develop a simple multilingual video LMM, named ViMUL, that is shown to provide a better tradeoff between high-and low-resource languages for video understanding. We hope our ViMUL-Bench and multilingual video LMM along with a large-scale multilingual video training set will help ease future research in developing cultural and linguistic inclusive multilingual video LMMs. Our proposed benchmark, video LMM and training data will be publicly released.},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Song; Tan, Zhen; Chen, Zihan; Zhou, Shuang; Chen, Tianlong; Li, Jundong
AnyMAC: Cascading Flexible Multi-Agent Collaboration via Next-Agent Prediction Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Wang2025d,
  title     = {AnyMAC: Cascading Flexible Multi-Agent Collaboration via Next-Agent Prediction},
  author    = {Song Wang and Zhen Tan and Zihan Chen and Shuang Zhou and Tianlong Chen and Jundong Li},
  url       = {https://arxiv.org/abs/2506.17784},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Recent progress in large language model (LLM)-based multi-agent collaboration highlights the power of structured communication in enabling collective intelligence. However, existing methods largely rely on static or graph-based inter-agent topologies, lacking the potential adaptability and flexibility in communication. In this work, we propose a new framework that rethinks multi-agent coordination through a sequential structure rather than a graph structure, offering a significantly larger topology space for multi-agent communication. Our method focuses on two key directions: (1) Next-Agent Prediction, which selects the most suitable agent role at each step, and (2) Next-Context Selection (NCS), which enables each agent to selectively access relevant information from any previous step. Together, these components construct task-adaptive communication pipelines that support both role flexibility and global information flow. Extensive evaluations across multiple benchmarks demonstrate that our approach achieves superior performance while substantially reducing communication overhead.},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Saeed, Muhammed; Raza, Shaina; Vayani, Ashmal; Abdul-Mageed, Muhammad; Emami, Ali; Shehata, Shady
Beyond Content: How Grammatical Gender Shapes Visual Representation in Text-to-Image Models Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Saeed2025,
title = {Beyond Content: How Grammatical Gender Shapes Visual Representation in Text-to-Image Models},
author = {Muhammed Saeed and Shaina Raza and Ashmal Vayani and Muhammad Abdul-Mageed and Ali Emami and Shady Shehata},
url = {https://arxiv.org/abs/2508.03199},
year = {2025},
date = {2025-11-04},
publisher = {Empirical Methods in Natural Language Processing},
abstract = {Research on bias in Text-to-Image (T2I) models has primarily focused on demographic representation and stereotypical attributes, overlooking a fundamental question: how does grammatical gender influence visual representation across languages? We introduce a cross-linguistic benchmark examining words where grammatical gender contradicts stereotypical gender associations (e.g., ``une sentinelle'' - grammatically feminine in French but referring to the stereotypically masculine concept ``guard''). Our dataset spans five gendered languages (French, Spanish, German, Italian, Russian) and two gender-neutral control languages (English, Chinese), comprising 800 unique prompts that generated 28,800 images across three state-of-the-art T2I models. Our analysis reveals that grammatical gender dramatically influences image generation: masculine grammatical markers increase male representation to 73% on average (compared to 22% with gender-neutral English), while feminine grammatical markers increase female representation to 38% (compared to 28% in English). These effects vary systematically by language resource availability and model architecture, with high-resource languages showing stronger effects. Our findings establish that language structure itself, not just content, shapes AI-generated visual outputs, introducing a new dimension for understanding bias and fairness in multilingual, multimodal systems.},
keywords = {EMNLP},
pubstate = {published},
tppubtype = {conference}
}
Zheng, Zaiyi; Wang, Song; Chen, Zihan; Zhu, Yaochen; He, Yinhan; Hong, Liangjie; Guo, Qi; Li, Jundong
CoRAG: Enhancing Hybrid Retrieval-Augmented Generation through a Cooperative Retriever Architecture Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Zheng2025,
  title     = {CoRAG: Enhancing Hybrid Retrieval-Augmented Generation through a Cooperative Retriever Architecture},
  author    = {Zaiyi Zheng and Song Wang and Zihan Chen and Yaochen Zhu and Yinhan He and Liangjie Hong and Qi Guo and Jundong Li},
  url       = {https://arxiv.org/abs/2504.01883},
  year      = {2025},
  date      = {2025-11-04},
  urldate   = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Retrieval-Augmented Generation (RAG) models excel in knowledge-intensive tasks, especially under few-shot learning constraints. We introduce CoRAG, a framework extending RAG to collaborative settings, where clients jointly train a shared model using a collaborative passage store. To evaluate CoRAG, we introduce CRAB, a benchmark for collaborative homogeneous open-domain question answering. Our experiments demonstrate that CoRAG consistently outperforms both parametric collaborative learning methods and locally trained RAG models in low-resource scenarios. Further analysis reveals the critical importance of relevant passages within the shared store, the surprising benefits of incorporating irrelevant passages, and the potential for hard negatives to negatively impact performance. This introduces a novel consideration in collaborative RAG: the trade-off between leveraging a collectively enriched knowledge base and the potential risk of incorporating detrimental passages from other clients. Our findings underscore the viability of CoRAG, while also highlighting key design challenges and promising avenues for future research.},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Dongwei; Liu, Zijie; Wang, Song; Ren, Yuxin; Deng, Jianing; Hu, Jingtong; Chen, Tianlong; Yang, Huanrui
FIER: Fine-Grained and Efficient KV Cache Retrieval for Long-context LLM Inference Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Wang2025e,
title = {FIER: Fine-Grained and Efficient KV Cache Retrieval for Long-context LLM Inference},
author = {Dongwei Wang and Zijie Liu and Song Wang and Yuxin Ren and Jianing Deng and Jingtong Hu and Tianlong Chen and Huanrui Yang},
url = {https://arxiv.org/abs/2508.08256},
year = {2025},
date = {2025-11-04},
publisher = {Empirical Methods in Natural Language Processing},
abstract = {The Key-Value (KV) cache reading latency increases significantly with context lengths, hindering the efficiency of long-context LLM inference. To address this, previous works propose retaining a small fraction of KV cache based on token importance. For example, KV eviction uses static heuristics to retain tokens, while KV retrieval dynamically selects query-relevant tokens for more adaptive cache management. However, we observe that important tokens are often sparsely distributed across the long context. This sparsity makes existing page-level KV retrieval inaccurate, as each page may include irrelevant tokens and miss critical ones. In this work, we propose Fier, a \underline{Fi}ne-Grained and \underline{E}fficient KV cache \underline{R}etrieval method. Fier uses 1-bit quantized keys to estimate the importance of each token, resulting in efficient and precise retrieval. Experiments show that Fier matches full KV performance using only 11% of the cache budget across various long-context tasks, reducing decoding latency by $1.2\times$ to $1.5\times$.},
keywords = {EMNLP},
pubstate = {published},
tppubtype = {conference}
}
Chen, Zihan; Wang, Song; Fu, Xingbo; Shi, Chengshuai; Lei, Zhenyu; Shen, Cong; Li, Jundong
From Cross-Task Examples to In-Task Prompts: A Graph-Based Pseudo-Labeling Framework for In-context Learning Conference
Empirical Methods in Natural Language Processing, 2025.
Tags: EMNLP
@conference{Chen2025c,
  title     = {From Cross-Task Examples to In-Task Prompts: A Graph-Based Pseudo-Labeling Framework for In-context Learning},
  author    = {Zihan Chen and Song Wang and Xingbo Fu and Chengshuai Shi and Zhenyu Lei and Cong Shen and Jundong Li},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Lei, Zhenyu; Tan, Zhen; Wang, Song; Zhu, Yaochen; Chen, Zihan; Dong, Yushun; Li, Jundong
Learning from Diverse Reasoning Paths with Routing and Collaboration Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Lei2025,
  title     = {Learning from Diverse Reasoning Paths with Routing and Collaboration},
  author    = {Zhenyu Lei and Zhen Tan and Song Wang and Yaochen Zhu and Zihan Chen and Yushun Dong and Jundong Li},
  url       = {https://arxiv.org/abs/2508.16861
https://github.com/LzyFischer/Distill},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  abstract  = {Advances in large language models (LLMs) significantly enhance reasoning capabilities but their deployment is restricted in resource-constrained scenarios. Knowledge distillation addresses this by transferring knowledge from powerful teacher models to compact and transparent students. However, effectively capturing the teacher's comprehensive reasoning is challenging due to conventional token-level supervision's limited scope. Using multiple reasoning paths per query alleviates this problem, but treating each path identically is suboptimal as paths vary widely in quality and suitability across tasks and models. We propose Quality-filtered Routing with Cooperative Distillation (QR-Distill), combining path quality filtering, conditional routing, and cooperative peer teaching. First, quality filtering retains only correct reasoning paths scored by an LLM-based evaluation. Second, conditional routing dynamically assigns paths tailored to each student's current learning state. Finally, cooperative peer teaching enables students to mutually distill diverse insights, addressing knowledge gaps and biases toward specific reasoning styles. Experiments demonstrate QR-Distill's superiority over traditional single- and multi-path distillation methods. Ablation studies further highlight the importance of each component including quality filtering, conditional routing, and peer teaching in effective knowledge transfer.},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Song; Chen, Zihan; Wang, Peng; Wei, Zhepei; Tan, Zhen; Meng, Yu; Shen, Cong; Li, Jundong
Separate the Wheat from the Chaff: Winnowing Down Divergent Views in Retrieval Augmented Generation Conference
Empirical Methods in Natural Language Processing, 2025.
Abstract | Tags: EMNLP | Links:
@conference{Wang2025f,
title = {Separate the Wheat from the Chaff: Winnowing Down Divergent Views in Retrieval Augmented Generation},
author = {Song Wang and Zihan Chen and Peng Wang and Zhepei Wei and Zhen Tan and Yu Meng and Cong Shen and Jundong Li},
url = {https://arxiv.org/pdf/2311.01108},
year = {2025},
date = {2025-11-04},
publisher = {Empirical Methods in Natural Language Processing},
abstract = {Retrieval-augmented generation (RAG) addresses the limitation of large language models (LLMs) in achieving up-to-date information by integrating external knowledge sources, but it is hindered by noisy or irrelevant retrieved data, leading to reduced accuracy. Additionally, most RAG methods rely on task-specific supervision, reducing their adaptability across domains. To overcome these challenges, we propose WinnowRAG, a novel multi-agent debate-based RAG framework. WinnowRAG operates in two stages: in Stage I, query-aware clustering groups similar documents, with each cluster assigned to an LLM agent for generating personalized responses. A critic LLM then consolidates these answers, forming super-agents. In Stage II, the super-agents engage in a structured discussion to filter out incorrect or irrelevant information, ensuring only relevant knowledge is used for final response generation. Crucially, WinnowRAG is unsupervised and leverages pretrained LLMs without requiring fine-tuning, making it easily adaptable to various tasks. The experiments on various realistic datasets demonstrate the effectiveness of WinnowRAG over state-of-the-art baselines.},
keywords = {EMNLP},
pubstate = {published},
tppubtype = {conference}
}
Liu, Aoming; Miller, Kevin; Saligrama, Venkatesh; Saenko, Kate; Gong, Boqing; Lim, Ser-Nam; Plummer, Bryan A.
Temporal Experts Averaging for Large-scale Temporal Domain Generalization Conference
Empirical Methods in Natural Language Processing, 2025.
Tags: EMNLP
@conference{Liu2025,
  title     = {Temporal Experts Averaging for Large-scale Temporal Domain Generalization},
  author    = {Aoming Liu and Kevin Miller and Venkatesh Saligrama and Kate Saenko and Boqing Gong and Ser-Nam Lim and Bryan A. Plummer},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Agrawal, Aakriti; Aralikatti, Rohith; Satheesh, Anirudh; Chakraborty, Souradip; Bedi, Amrit Singh; Huang, Furong
Uncertainty-Aware Answer Selection for Improved Reasoning in Multi-LLM Systems Conference
Empirical Methods in Natural Language Processing, 2025.
Tags: EMNLP
@conference{Agrawal2025,
  title     = {Uncertainty-Aware Answer Selection for Improved Reasoning in Multi-LLM Systems},
  author    = {Aakriti Agrawal and Rohith Aralikatti and Anirudh Satheesh and Souradip Chakraborty and Amrit Singh Bedi and Furong Huang},
  year      = {2025},
  date      = {2025-11-04},
  publisher = {Empirical Methods in Natural Language Processing},
  keywords  = {EMNLP},
  pubstate  = {published},
  tppubtype = {conference}
}
Salvador, John; Bansal, Naman; Akter, Mousumi; Sarkar, Souvika; Das, Anupam; Karmaker, Santu
Benchmarking LLMs on the Semantic Overlap Summarization Task Conference
Empirical Methods in Natural Language Processing, 2025.
Tags: EMNLP | Links:
@conference{Salvador2025,
title = {Benchmarking LLMs on the Semantic Overlap Summarization Task},
author = {John Salvador and Naman Bansal and Mousumi Akter and Souvika Sarkar and Anupam Das and Santu Karmaker},
url = {https://anonymous.4open.science/r/llm_eval-E16D/README.md},
year = {2025},
date = {2025-11-04},
urldate = {2025-11-04},
publisher = {Empirical Methods in Natural Language Processing},
abstract = {Semantic Overlap Summarization (SOS) is a constrained multi-document summarization task, where the constraint is to capture the common/overlapping information between two alternative narratives. In this work, we perform a benchmarking study of popular Large Language Models (LLMs) exclusively on the SOS task. Additionally, we introduce the PrivacyPolicyPairs (3P) dataset to expand the space of SOS benchmarks in terms of quantity and variety. This dataset provides 135 high-quality SOS data samples sourced from privacy policy documents. We then use a standard prompting taxonomy called TELeR to create and evaluate 905,216 distinct LLM-generated summaries over two SOS datasets from different domains, and we further conduct human evaluation on a subset of 540 samples. We conclude the paper by analyzing models' performances and the reliability of automatic evaluation. The code and datasets used to conduct this study are available at https://anonymous.4open.science/r/llm_eval-E16D/README.md},
keywords = {EMNLP},
pubstate = {published},
tppubtype = {conference}
}
Thawakar, Omkar; Demidov, Dmitry; Thawkar, Ritesh; Anwer, Rao; Shah, Mubarak; Khan, Fahad; Khan, Salman
Beyond Simple Edits: Composed Video Retrieval with Dense Modifications Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Thawakar2025,
title = {Beyond Simple Edits: Composed Video Retrieval with Dense Modifications},
author = {Omkar Thawakar and Dmitry Demidov and Ritesh Thawkar and Rao Anwer and Mubarak Shah and Fahad Khan and Salman Khan},
url = {https://iccv.thecvf.com/virtual/2025/poster/1966},
year = {2025},
date = {2025-10-19},
urldate = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Composed video retrieval is a challenging task that strives to retrieve a target video based on a query video and a textual description detailing specific modifications. Standard retrieval frameworks typically struggle to handle the complexity of fine-grained compositional queries and variations in temporal understanding limiting their retrieval ability in the fine-grained setting. To address this issue, we introduce a novel dataset that captures both fine-grained and composed actions across diverse video segments, enabling more detailed compositional changes in retrieved video content. The proposed dataset, named Dense-WebVid-CoVR, consists of 1.6 million samples with dense modification text that is around seven times more than its existing counterpart. We further develop a new model that integrates visual and textual information through Cross-Attention (CA) fusion using grounded text encoder, enabling precise alignment between dense query modifications and target videos. The proposed model achieves state-of-the-art results surpassing existing methods on all metrics. Notably, it achieves 71.3% Recall@1 in visual+text setting and outperforms the state-of-the-art by 3.4%, highlighting its efficacy in terms of leveraging detailed video descriptions and dense modification texts. Our proposed dataset, code, and model will be publicly released.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Wang, Haoxuan; Zhao, Zhenghao; Wu, Junyi; Shang, Yuzhang; Liu, Gaowen; Yan, Yan
CaO$_2$: Rectifying Inconsistencies in Diffusion-Based Dataset Distillation Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Wang2025b,
title = {CaO$_2$: Rectifying Inconsistencies in Diffusion-Based Dataset Distillation},
author = {Haoxuan Wang and Zhenghao Zhao and Junyi Wu and Yuzhang Shang and Gaowen Liu and Yan Yan},
url = {https://arxiv.org/abs/2506.22637},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {The recent introduction of diffusion models in dataset distillation has shown promising potential in creating compact surrogate datasets for large, high-resolution target datasets, offering improved efficiency and performance over traditional bi-level/uni-level optimization methods. However, current diffusion-based dataset distillation approaches overlook the evaluation process and exhibit two critical inconsistencies in the distillation process: (1) Objective Inconsistency, where the distillation process diverges from the evaluation objective, and (2) Condition Inconsistency, leading to mismatches between generated images and their corresponding conditions. To resolve these issues, we introduce Condition-aware Optimization with Objective-guided Sampling (CaO$_2$), a two-stage diffusion-based framework that aligns the distillation process with the evaluation objective. The first stage employs a probability-informed sample selection pipeline, while the second stage refines the corresponding latent representations to improve conditional likelihood. CaO$_2$ achieves state-of-the-art performance on ImageNet and its subsets, surpassing the best-performing baselines by an average of 2.3% accuracy.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Pathak, Priyank; Rawat, Yogesh
Colors See Colors Ignore: Clothes Changing ReID with Color Disentanglement Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Pathak2025,
title = {Colors See Colors Ignore: Clothes Changing ReID with Color Disentanglement},
author = {Priyank Pathak and Yogesh Rawat},
url = {https://arxiv.org/abs/2507.07230},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Clothes-Changing Re-Identification (CC-ReID) aims to recognize individuals across different locations and times, irrespective of clothing. Existing methods often rely on additional models or annotations to learn robust, clothing-invariant features, making them resource-intensive. In contrast, we explore the use of color - specifically foreground and background colors - as a lightweight, annotation-free proxy for mitigating appearance bias in ReID models. We propose Colors See, Colors Ignore (CSCI), an RGB-only method that leverages color information directly from raw images or video frames. CSCI efficiently captures color-related appearance bias ('Color See') while disentangling it from identity-relevant ReID features ('Color Ignore'). To achieve this, we introduce S2A self-attention, a novel self-attention to prevent information leak between color and identity cues within the feature space. Our analysis shows a strong correspondence between learned color embeddings and clothing attributes, validating color as an effective proxy when explicit clothing labels are unavailable. We demonstrate the effectiveness of CSCI on both image and video ReID with extensive experiments on four CC-ReID datasets. We improve the baseline by Top-1 2.9% on LTCC and 5.0% on PRCC for image-based ReID, and 1.0% on CCVID and 2.5% on MeVID for video-based ReID without relying on additional supervision. Our results highlight the potential of color as a cost-effective solution for addressing appearance bias in CC-ReID. Github: this https URL.
},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Azad, Shehreen; Rawat, Yogesh
DisenQ: Disentangling Q-Former for Activity-Biometrics Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Azad2025,
title = {DisenQ: Disentangling Q-Former for Activity-Biometrics},
author = {Shehreen Azad and Yogesh Rawat},
url = {https://chatpaper.com/pt/chatpaper/paper/163039
https://arxiv.org/abs/2507.07262},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {In this work, we address activity-biometrics, which involves identifying individuals across diverse set of activities. Unlike traditional person identification, this setting introduces additional challenges as identity cues become entangled with motion dynamics and appearance variations, making biometrics feature learning more complex. While additional visual data like pose and/or silhouette help, they often struggle from extraction inaccuracies. To overcome this, we propose a multimodal language-guided framework that replaces reliance on additional visual data with structured textual supervision. At its core, we introduce \textbf{DisenQ} (\textbf{Disen}tangling \textbf{Q}-Former), a unified querying transformer that disentangles biometrics, motion, and non-biometrics features by leveraging structured language guidance. This ensures identity cues remain independent of appearance and motion variations, preventing misidentifications. We evaluate our approach on three activity-based video benchmarks, achieving state-of-the-art performance. Additionally, we demonstrate strong generalization to complex real-world scenario with competitive performance on a traditional video-based identification benchmark, showing the effectiveness of our framework.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Yuan, Zhihang; Xie, Rui; Shang, Yuzhang; Zhang, Hanling; Wang, Siyuan; Yan, Shengen; Dai, Guohao; Wang, Yu
DLFR-Gen: Diffusion-based Video Generation with Dynamic Latent Frame Rate Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Tags: ICCV
@conference{Yuan2025,
  author    = {Zhihang Yuan and Rui Xie and Yuzhang Shang and Hanling Zhang and Siyuan Wang and Shengen Yan and Guohao Dai and Yu Wang},
  title     = {DLFR-Gen: Diffusion-based Video Generation with Dynamic Latent Frame Rate},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Pang, Yatian; Zhu, Bin; Lin, Bin; Zheng, Mingzhe; Tay, Francis; Lim, Ser-Nam; Yang, Harry; Yuan, Li
DreamDance: Animating Human Images by Enriching 3D Geometry Cues from 2D Poses Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Pang2025,
title = {DreamDance: Animating Human Images by Enriching 3D Geometry Cues from 2D Poses},
author = {Yatian Pang and Bin Zhu and Bin Lin and Mingzhe Zheng and Francis Tay and Ser-Nam Lim and Harry Yang and Li Yuan},
url = {https://pang-yatian.github.io/Dreamdance-webpage/
https://arxiv.org/abs/2412.00397
https://github.com/PKU-YuanGroup/DreamDance
https://pang-yatian.github.io/Dreamdance-webpage/resources/full_video_ids.txt},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {In this work, we present DreamDance, a novel method for animating human images using only skeleton pose sequences as conditional inputs. Existing approaches struggle with generating coherent, high-quality content in an efficient and user-friendly manner. Concretely, baseline methods relying on only 2D pose guidance lack the cues of 3D information, leading to suboptimal results, while methods using 3D representation as guidance achieve higher quality but involve a cumbersome and time-intensive process. To address these limitations, DreamDance enriches 3D geometry cues from 2D poses by introducing an efficient diffusion model, enabling high-quality human image animation with various guidance. Our key insight is that human images naturally exhibit multiple levels of correlation, progressing from coarse skeleton poses to fine-grained geometry cues, and further from these geometry cues to explicit appearance details. Capturing such correlations could enrich the guidance signals, facilitating intra-frame coherency and inter-frame consistency. Specifically, we construct the TikTok-Dance5K dataset, comprising 5K high-quality dance videos with detailed frame annotations, including human pose, depth, and normal maps. Next, we introduce a Mutually Aligned Geometry Diffusion Model to generate fine-grained depth and normal maps for enriched guidance. Finally, a Cross-domain Controller incorporates multi-level guidance to animate human images effectively with a video diffusion model. Extensive experiments demonstrate that our method achieves state-of-the-art performance in animating human images.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Zhu, Chen; Zhao, Wangbo; Zhang, Huiwen; Zhou, Yuhao; Tang, Weidong; Wang, Shuo; Yuan, Zhihang; Shang, Yuzhang; Peng, Xiaojiang; Wang, Kai; Yang, Dawei
EA-ViT: Efficient Adaptation for Elastic Vision Transformer Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Zhu2025,
title = {EA-ViT: Efficient Adaptation for Elastic Vision Transformer},
author = {Chen Zhu and Wangbo Zhao and Huiwen Zhang and Yuhao Zhou and Weidong Tang and Shuo Wang and Zhihang Yuan and Yuzhang Shang and Xiaojiang Peng and Kai Wang and Dawei Yang},
url = {https://iccv.thecvf.com/virtual/2025/poster/1084
https://arxiv.org/abs/2507.19360},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Vision Transformer (ViT) has emerged as a foundational model in computer vision, excelling in generalization and adaptation to downstream tasks. However, supporting diverse resource constraints typically requires retraining multiple, size-specific ViTs, which is both time-consuming and expensive. In this paper, we propose \emph{Efficient Elastic ViT Adaptation}, a single ViT framework that encapsulates multiple submodels of varying sizes, eliminating the need for repeated adaptation. We introduce elastic configurations along four key dimensions—embedding dimension, attention heads, MLP expansion ratio, and layer depth—and a lightweight router that selects the optimal submodel under different computational budgets. Training proceeds in two stages: \emph{Staged Elastic Adaptation} progressively introduces complexity for efficient joint training of submodels while preserving as much pre-trained knowledge as possible; Subsequently, we integrate the router to refine the model by balancing accuracy and MACs, guiding it to initially focus on a small set of promising submodels for faster convergence within the large design space. Our approach captures an exponentially large family of submodels in a single adaptation process. Extensive experiments demonstrate that, for any resource constraint, the router identifies the best submodel, delivering high performance and reduced overhead compared to previous methods.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Wu, Wenhan; Guo, Zhishuai; Chen, Chen; Xue, Hongfei; Lu, Aidong
Frequency-Semantic Enhanced Variational Autoencoder for Zero-Shot Skeleton-based Action Recognition Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Wu2025,
  author    = {Wenhan Wu and Zhishuai Guo and Chen Chen and Hongfei Xue and Aidong Lu},
  title     = {Frequency-Semantic Enhanced Variational Autoencoder for Zero-Shot Skeleton-based Action Recognition},
  url       = {https://arxiv.org/abs/2506.22179},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Zero-shot skeleton-based action recognition aims to develop models capable of identifying actions beyond the categories encountered during training. Previous approaches have primarily focused on aligning visual and semantic representations but often overlooked the importance of fine-grained action patterns in the semantic space (e.g., the hand movements in drinking water and brushing teeth). To address these limitations, we propose a Frequency-Semantic Enhanced Variational Autoencoder (FS-VAE) to explore the skeleton semantic representation learning with frequency decomposition. FS-VAE consists of three key components: 1) a frequency-based enhancement module with high- and low-frequency adjustments to enrich the skeletal semantics learning and improve the robustness of zero-shot action recognition; 2) a semantic-based action description with multilevel alignment to capture both local details and global correspondence, effectively bridging the semantic gap and compensating for the inherent loss of information in skeleton sequences; 3) a calibrated cross-alignment loss that enables valid skeleton-text pairs to counterbalance ambiguous ones, mitigating discrepancies and ambiguities in skeleton and text features, thereby ensuring robust alignment. Evaluations on the benchmarks demonstrate the effectiveness of our approach, validating that frequency-enhanced semantic features enable robust differentiation of visually and semantically similar action clusters, improving zero-shot action recognition.},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Shatwell, David; Dave, Ishan; Sirnam, Swetha; Shah, Mubarak
GT-Loc: Unifying When and Where in Images through a Joint Embedding Space Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Shatwell2025,
title = {GT-Loc: Unifying When and Where in Images through a Joint Embedding Space},
author = {David Shatwell and Ishan Dave and Swetha Sirnam and Mubarak Shah},
url = {https://chatpaper.com/fr/chatpaper/paper/163729
https://arxiv.org/abs/2507.10473},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Timestamp prediction aims to determine when an image was captured using only visual information, supporting applications such as metadata correction, retrieval, and digital forensics. In outdoor scenarios, hourly estimates rely on cues like brightness, hue, and shadow positioning, while seasonal changes and weather inform date estimation. However, these visual cues significantly depend on geographic context, closely linking timestamp prediction to geo-localization. To address this interdependence, we introduce GT-Loc, a novel retrieval-based method that jointly predicts the capture time (hour and month) and geo-location (GPS coordinates) of an image. Our approach employs separate encoders for images, time, and location, aligning their embeddings within a shared high-dimensional feature space. Recognizing the cyclical nature of time, instead of conventional contrastive learning with hard positives and negatives, we propose a temporal metric-learning objective providing soft targets by modeling pairwise time differences over a cyclical toroidal surface. We present new benchmarks demonstrating that our joint optimization surpasses previous time prediction methods, even those using the ground-truth geo-location as an input during inference. Additionally, our approach achieves competitive results on standard geo-localization tasks, and the unified embedding space facilitates compositional and text-based image retrieval.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Shang, Yuzhang; Cai, Mu; Xu, Bingxin; Lee, Yong Jae; Yan, Yan
LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Shang2025,
title = {LLaVA-PruMerge: Adaptive Token Reduction for Efficient Large Multimodal Models},
author = {Yuzhang Shang and Mu Cai and Bingxin Xu and Yong Jae Lee and Yan Yan},
url = {https://llava-prumerge.github.io/
https://arxiv.org/abs/2403.15388
https://github.com/42Shawn/LLaVA-PruMerge},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Large Multimodal Models (LMMs) have shown significant reasoning capabilities by connecting a visual encoder and a large language model. LMMs typically use a fixed amount of visual tokens, such as the penultimate layer features in the CLIP visual encoder, as the prefix content. Recent LMMs incorporate more complex visual inputs, such as high-resolution images and videos, which increase the number of visual tokens significantly. However, due to the design of the Transformer architecture, computational costs associated with these models tend to increase quadratically with the number of input tokens. To tackle this problem, we explore a token reduction mechanism and find, similar to prior work, that many visual tokens are spatially redundant. Based on this, we propose PruMerge, a novel adaptive visual token reduction approach, which largely reduces the number of visual tokens while maintaining comparable model performance. We first select the unpruned visual tokens based on their similarity to class tokens and spatial tokens. We then cluster the pruned tokens based on key similarity and merge the clustered tokens with the unpruned tokens to supplement their information. Empirically, when applied to LLaVA-1.5, our approach can compress the visual tokens by 18 times on average (14 times on MME/TextVQA), and achieve comparable performance across diverse visual question-answering and reasoning tasks. Code and checkpoints will be released. To facilitate future research, we will release our code, dataset, benchmark, and checkpoints at https://llava-prumerge.github.io/.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Ma, Xuran; Liu, Yexin; Liu, Yaofu; Wu, Xianfeng; Zheng, Mingzhe; Wang, Zihao; Lim, Ser-Nam; Yang, Harry
Model Reveals What to Cache: Profiling-Based Feature Reuse for Video Diffusion Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Ma2025,
title = {Model Reveals What to Cache: Profiling-Based Feature Reuse for Video Diffusion Models},
author = {Xuran Ma and Yexin Liu and Yaofu Liu and Xianfeng Wu and Mingzhe Zheng and Zihao Wang and Ser-Nam Lim and Harry Yang},
url = {https://iccv.thecvf.com/virtual/2025/poster/1031
https://arxiv.org/abs/2504.03140},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Video generation using diffusion models has shown remarkable progress, yet it remains computationally expensive due to the repeated processing of redundant features across blocks and steps. To address this, we propose a novel adaptive feature reuse mechanism that dynamically identifies and caches the most informative features by focusing on foreground and caching more on background, significantly reducing computational overhead with less sacrificing video quality. By leveraging the step and block caching, our method achieves up to 1.8× speed up on HunyuanVideo while maintaining competitive performance on Vbench, PSNR, SSIM, FID and LPIPS. Extensive experiments demonstrate that our approach not only improves efficiency but also enhances the quality of generated videos. The proposed method is generalizable and can be integrated into existing diffusion transformer frameworks.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Abdullah, Raiyaan; Claypoole, Jared; Cogswell, Michael; Divakaran, Ajay; Rawat, Yogesh
Punching Bag vs. Punching Person: Motion Transferability in Videos Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Abdullah2025,
  author    = {Raiyaan Abdullah and Jared Claypoole and Michael Cogswell and Ajay Divakaran and Yogesh Rawat},
  title     = {Punching Bag vs. Punching Person: Motion Transferability in Videos},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/935},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Action recognition models, both unimodal and multimodal, have demonstrated strong generalization in tasks such as zero-shot learning, base-to-novel transfer, and domain adaptation. However, can they effectively transfer high-level motion concepts across diverse contexts, even within similar distributions? For example, can a model recognize the broad action "Pushing" when presented with unknown variations such as "Pushing something from right to left"? To explore this, we introduce a motion transferability framework with three datasets: (1) Syn-TA, a synthetic dataset with 3D object motions; (2) Kinetics400-TA; and (3) Something-Something-v2-TA, both adapted from natural video datasets. We evaluate 13 state-of-the-art models on these benchmarks and observe a significant drop in performance when recognizing high-level actions in novel contexts. Our analysis reveals: 1) Multimodal models struggle more with fine-grained unknown actions than coarse ones; 2) The bias-free Syn-TA proves as challenging as real-world datasets, with models showing greater performance drops in controlled settings; 3) Larger models improve transferability when spatial cues dominate but struggle with intensive temporal reasoning, while reliance on object and background cues hinders generalization. We further explore how disentangling coarse and fine motions can improve recognition in temporally challenging datasets. Our study establishes a crucial benchmark for assessing motion transferability in action recognition.},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Wang, Haoxuan; Shang, Yuzhang; Yuan, Zhihang; Wu, Junyi; Yan, Junchi; Yan, Yan
QuEST: Low-bit Diffusion Model Quantization via Efficient Selective Finetuning Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Wang2025c,
  author    = {Haoxuan Wang and Yuzhang Shang and Zhihang Yuan and Junyi Wu and Junchi Yan and Yan Yan},
  title     = {QuEST: Low-bit Diffusion Model Quantization via Efficient Selective Finetuning},
  url       = {https://arxiv.org/abs/2402.03666},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {The practical deployment of diffusion models is still hindered by the high memory and computational overhead. Although quantization paves a way for model compression and acceleration, existing methods face challenges in achieving low-bit quantization efficiently. In this paper, we identify imbalanced activation distributions as a primary source of quantization difficulty, and propose to adjust these distributions through weight finetuning to be more quantization-friendly. We provide both theoretical and empirical evidence supporting finetuning as a practical and reliable solution. Building on this approach, we further distinguish two critical types of quantized layers: those responsible for retaining essential temporal information and those particularly sensitive to bit-width reduction. By selectively finetuning these layers under both local and global supervision, we mitigate performance degradation while enhancing quantization efficiency. Our method demonstrates its efficacy across three high-resolution image generation tasks, obtaining state-of-the-art performance across multiple bit-width settings.},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Kang, Weitai; Huang, Haifeng; Shang, Yuzhang; Shah, Mubarak; Yan, Yan
Robin3D: Improving 3D Large Language Model via Robust Instruction Tuning Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Kang2025b,
  author    = {Weitai Kang and Haifeng Huang and Yuzhang Shang and Mubarak Shah and Yan Yan},
  title     = {Robin3D: Improving 3D Large Language Model via Robust Instruction Tuning},
  url       = {https://arxiv.org/abs/2410.00255
https://github.com/WeitaiKang/Robin3D},
  year      = {2025},
  date      = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Recent advancements in 3D Large Language Models (3DLLMs) have highlighted their potential in building general-purpose agents in the 3D real world, yet challenges remain due to the lack of high-quality robust instruction-following data, leading to limited discriminative power and generalization of 3DLLMs. In this paper, we introduce Robin3D, a powerful 3DLLM trained on large-scale instruction-following data generated by our novel data engine, Robust Instruction Generation (RIG) engine. RIG generates two key instruction data: 1) the Adversarial Instruction-following data, which features mixed negative and positive samples to enhance the model's discriminative understanding. 2) the Diverse Instruction-following data, which contains various instruction styles to enhance model's generalization. As a result, we construct 1 million instruction-following data, consisting of 344K Adversarial samples, 508K Diverse samples, and 165K benchmark training set samples. To better handle these complex instructions, Robin3D first incorporates Relation-Augmented Projector to enhance spatial understanding, and then strengthens the object referring and grounding ability through ID-Feature Bonding. Robin3D consistently outperforms previous methods across five widely-used 3D multimodal learning benchmarks, without the need for task-specific fine-tuning. Notably, we achieve a 7.8% improvement in the grounding task (Multi3DRefer) and a 6.9% improvement in the captioning task (Scan2Cap).},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Pinyoanuntapong, Ekkasit; Saleem, Muhammad Usama; Karunratanakul, Korrawe; Wang, Pu; Xue, Hongfei; Chen, Chen; Guo, Chuan; Cao, Junli; Ren, Jian; Tulyakov, Sergey
Spatio-Temporal Control for Masked Motion Synthesis Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Pinyoanuntapong2025,
title = {Spatio-Temporal Control for Masked Motion Synthesis},
author = {Ekkasit Pinyoanuntapong and Muhammad Usama Saleem and Korrawe Karunratanakul and Pu Wang and Hongfei Xue and Chen Chen and Chuan Guo and Junli Cao and Jian Ren and Sergey Tulyakov},
url = {https://www.ekkasit.com/ControlMM-page/
https://github.com/exitudio/MaskControl/},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Recent advances in motion diffusion models have enabled spatially controllable text-to-motion generation. However, these models struggle to achieve high-precision control while maintaining high-quality motion generation. To address these challenges, we propose MaskControl, the first approach to introduce controllability to the generative masked motion model. Our approach introduces two key innovations. First, \textit{Logits Regularizer} implicitly perturbs logits at training time to align the distribution of motion tokens with the controlled joint positions, while regularizing the categorical token prediction to ensure high-fidelity generation. Second, \textit{Logit Optimization} explicitly optimizes the predicted logits during inference time, directly reshaping the token distribution that forces the generated motion to accurately align with the controlled joint positions. Moreover, we introduce \textit{Differentiable Expectation Sampling (DES)} to combat the non-differential distribution sampling process encountered by logits regularizer and optimization. Extensive experiments demonstrate that MaskControl outperforms state-of-the-art methods, achieving superior motion quality (FID decreases by ~77%) and higher control precision (average error 0.91 vs. 1.08). Additionally, MaskControl enables diverse applications, including any-joint-any-frame control, body-part timeline control, and zero-shot objective control.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Li, Ming; Gu, Xin; Chen, Fan; Xing, Xiaoying; Wen, Longyin; Chen, Chen; Zhu, Sijie
SuperEdit: Rectifying and Facilitating Supervision for Instruction-Based Image Editing Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: | Links:
@conference{Li2025b,
title = {SuperEdit: Rectifying and Facilitating Supervision for Instruction-Based Image Editing},
author = {Ming Li and Xin Gu and Fan Chen and Xiaoying Xing and Longyin Wen and Chen Chen and Sijie Zhu},
url = {https://liming-ai.github.io/SuperEdit/
https://github.com/bytedance/SuperEdit
https://huggingface.co/datasets/limingcv/SuperEdit-40K},
year = {2025},
date = {2025-10-19},
urldate = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Due to the challenges of manually collecting accurate editing data, existing datasets are typically constructed using various automated methods, leading to noisy supervision signals caused by the mismatch between editing instructions and original-edited image pairs. Recent efforts attempt to improve editing models through generating higher-quality edited images, pre-training on recognition tasks, or introducing vision-language models (VLMs) but fail to resolve this fundamental issue. In this paper, we offer a novel solution by constructing more effective editing instructions for given image pairs. This includes rectifying the editing instructions to better align with the original-edited image pairs and using contrastive editing instructions to further enhance their effectiveness. Specifically, we find that editing models exhibit specific generation attributes at different inference steps, independent of the text. Based on these prior attributes, we define a unified guide for VLMs to rectify editing instructions. However, there are some challenging editing scenarios that cannot be resolved solely with rectified instructions. To this end, we further construct contrastive supervision signals with positive and negative instructions and introduce them into the model training using triplet loss, thereby further facilitating supervision effectiveness. Our method does not require the VLM modules or pre-training tasks used in previous work, offering a more direct and efficient way to provide better supervision signals, and providing a novel, simple, and effective solution for instruction-based image editing. Results on multiple benchmarks demonstrate that our method significantly outperforms existing approaches. Compared with previous SOTA SmartEdit, we achieve 9.19% improvements on the Real-Edit benchmark with 30x less training data and 13x smaller model size.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Fan, Xinqi; Chen, Xueli; Yang, Luoxiao; Yap, Chuin Hong; Qureshi, Rizwan; Dou, Qi; Yap, Moi Hoon; Shah, Mubarak
Test-Time Retrieval-Augmented Adaptation for Vision-Language Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Fan2025,
  author    = {Xinqi Fan and Xueli Chen and Luoxiao Yang and Chuin Hong Yap and Rizwan Qureshi and Qi Dou and Moi Hoon Yap and Mubarak Shah},
  title     = {Test-Time Retrieval-Augmented Adaptation for Vision-Language Models},
  url       = {https://iccv.thecvf.com/virtual/2025/poster/2327},
  year      = {2025},
  date      = {2025-10-19},
  urldate   = {2025-10-19},
  publisher = {IEEE/CVF International Conference on Computer Vision},
  abstract  = {Vision-language models (VLMs) have shown promise in test-time adaptation tasks due to their remarkable capabilities in understanding and reasoning about visual content through natural language descriptions. However, training VLMs typically demands substantial computational resources, and they often struggle to adapt efficiently to new domains or tasks. Additionally, dynamically estimating the test distribution from streaming data at test time remains a significant challenge. In this work, we propose a novel test-time retrieval-augmented adaption (TT-RAA) method that enables VLMs to maintain high performance across diverse visual recognition tasks without the need for task-specific training or large computational overhead. During inference, TT-RAA employs a streaming mixture of Gaussian database (SMGD) to continuously estimate test distributions, requiring minimal storage. Then, TT-RAA retrieves the most relevant information from the SMGD, enhancing the original VLM outputs. A key limitation of CLIP-based VLMs is their inter-modal vision-language optimization, which does not optimize vision-space similarity, leading to larger intra-modal variance. To address this, we propose a multimodal retrieval augmentation module that transforms the SMGD into a unified multimodal space, enabling retrieval that aligns both vision and language modalities. Extensive experiments across both cross-domain and out-of-distribution benchmarks comprising fourteen datasets demonstrate TT-RAA’s superior performance compared to state-of-the-art methods. Ablation studies and hyperparameter analyses further validate the effectiveness of the proposed modules.},
  keywords  = {ICCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Lyu, Zonglin; Chen, Chen
TLB-VFI: Temporal-Aware Latent Brownian Bridge Diffusion for Video Frame Interpolation Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Lyu2025,
title = {TLB-VFI: Temporal-Aware Latent Brownian Bridge Diffusion for Video Frame Interpolation},
author = {Zonglin Lyu and Chen Chen},
url = {https://arxiv.org/abs/2507.04984},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Video Frame Interpolation (VFI) aims to predict the intermediate frame I_n (we use n to denote time in videos to avoid notation overload with the timestep t in diffusion models) based on two consecutive neighboring frames I_0 and I_1. Recent approaches apply diffusion models (both image-based and video-based) in this task and achieve strong performance. However, image-based diffusion models are unable to extract temporal information and are relatively inefficient compared to non-diffusion methods. Video-based diffusion models can extract temporal information, but they are too large in terms of training scale, model size, and inference time. To mitigate the above issues, we propose Temporal-Aware Latent Brownian Bridge Diffusion for Video Frame Interpolation (TLB-VFI), an efficient video-based diffusion model. By extracting rich temporal information from video inputs through our proposed 3D-wavelet gating and temporal-aware autoencoder, our method achieves 20% improvement in FID on the most challenging datasets over recent SOTA of image-based diffusion models. Meanwhile, due to the existence of rich temporal information, our method achieves strong performance while having 3x fewer parameters. Such a parameter reduction results in 2.3x speed up. By incorporating optical flow guidance, our method requires 9000x less training data and achieves over 20x fewer parameters than video-based diffusion models.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Jang, Young Kyun; Lim, Ser-Nam
Towards Cross-modal Backward-compatible Representation Learning for Vision-Language Models Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Jang2025,
title = {Towards Cross-modal Backward-compatible Representation Learning for Vision-Language Models},
author = {Young Kyun Jang and Ser-Nam Lim},
url = {https://arxiv.org/abs/2405.14715},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Modern retrieval systems often struggle with upgrading to new and more powerful models due to the incompatibility of embeddings between the old and new models. This necessitates a costly process known as backfilling, which involves re-computing the embeddings for a large number of data samples. In vision, Backward-compatible Training (BT) has been proposed to ensure that the new model aligns with the old model's embeddings. This paper extends the concept of vision-only BT to the field of cross-modal retrieval, marking the first attempt to address Cross-modal BT (XBT). Our goal is to achieve backward-compatibility between Vision-Language Pretraining (VLP) models, such as CLIP, for the cross-modal retrieval task. To address XBT challenges, we propose an efficient solution: a projection module that maps the new model's embeddings to those of the old model. This module, pretrained solely with text data, significantly reduces the number of image-text pairs required for XBT learning, and, once it is pretrained, it avoids using the old model during training. Furthermore, we utilize parameter-efficient training strategies that improve efficiency and preserve the off-the-shelf new model's knowledge by avoiding any modifications. Experimental results on cross-modal retrieval datasets demonstrate the effectiveness of XBT and its potential to enable backfill-free upgrades when a new VLP model emerges.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Li, Ming; Gu, Xin; Chen, Fan; Xing, Xiaoying; Wen, Longyin; Chen, Chen; Zhu, Sijie
SuperEdit: Rectifying and Facilitating Supervision for Instruction-Based Image Editing Conference
IEEE/CVF International Conference on Computer Vision, 2025.
Abstract | Tags: ICCV | Links:
@conference{Li2025c,
title = {SuperEdit: Rectifying and Facilitating Supervision for Instruction-Based Image Editing},
author = {Ming Li and Xin Gu and Fan Chen and Xiaoying Xing and Longyin Wen and Chen Chen and Sijie Zhu},
url = {https://liming-ai.github.io/SuperEdit/
https://github.com/bytedance/SuperEdit
https://huggingface.co/datasets/limingcv/SuperEdit-40K},
year = {2025},
date = {2025-10-19},
publisher = {IEEE/CVF International Conference on Computer Vision},
abstract = {Due to the challenges of manually collecting accurate editing data, existing datasets are typically constructed using various automated methods, leading to noisy supervision signals caused by the mismatch between editing instructions and original-edited image pairs. Recent efforts attempt to improve editing models through generating higher-quality edited images, pre-training on recognition tasks, or introducing vision-language models (VLMs) but fail to resolve this fundamental issue. In this paper, we offer a novel solution by constructing more effective editing instructions for given image pairs. This includes rectifying the editing instructions to better align with the original-edited image pairs and using contrastive editing instructions to further enhance their effectiveness. Specifically, we find that editing models exhibit specific generation attributes at different inference steps, independent of the text. Based on these prior attributes, we define a unified guide for VLMs to rectify editing instructions. However, there are some challenging editing scenarios that cannot be resolved solely with rectified instructions. To this end, we further construct contrastive supervision signals with positive and negative instructions and introduce them into the model training using triplet loss, thereby further facilitating supervision effectiveness. Our method does not require the VLM modules or pre-training tasks used in previous work, offering a more direct and efficient way to provide better supervision signals, and providing a novel, simple, and effective solution for instruction-based image editing. Results on multiple benchmarks demonstrate that our method significantly outperforms existing approaches. Compared with previous SOTA SmartEdit, we achieve 9.19% improvements on the Real-Edit benchmark with 30x less training data and 13x smaller model size.},
keywords = {ICCV},
pubstate = {published},
tppubtype = {conference}
}
Tu, Yuanpeng; Chen, Xi; Lim, Ser-Nam; Zhao, Hengshuang
DreamMask: Boosting Open-vocabulary Panoptic Segmentation with Synthetic Data Conference
SIGGRAPH, 2025.
Abstract | Tags: SIGGRAPH | Links:
@conference{Tu2025,
title = {DreamMask: Boosting Open-vocabulary Panoptic Segmentation with Synthetic Data},
author = {Yuanpeng Tu and Xi Chen and Ser-Nam Lim and Hengshuang Zhao},
url = {https://yuanpengtu.github.io/Dreammask-Page/},
year = {2025},
date = {2025-08-10},
urldate = {2025-08-10},
publisher = {SIGGRAPH},
abstract = {Open-vocabulary panoptic segmentation has received significant attention due to its applicability in the real world. Despite claims of robust generalization, we find that the advancements of previous works are attributed mainly to trained categories, exposing a lack of generalization to novel classes. In this paper, we explore boosting existing models from a data-centric perspective. We propose DreamMask, which systematically explores how to generate training data in the open-vocabulary setting, and how to train the model with both real and synthetic data. For the first part, we propose an automatic data generation pipeline with off-the-shelf models. We propose crucial designs for vocabulary expansion, layout arrangement, data filtering, etc. Equipped with these techniques, our generated data could significantly outperform the manually collected web data. To train the model with generated data, a synthetic-real alignment loss is designed to bridge the representation gap, bringing noticeable improvements across multiple benchmarks. In general, DreamMask significantly simplifies the collection of large-scale training data, serving as a plug-and-play enhancement for existing methods. For instance, when trained on COCO and tested on ADE20K, the model equipped with DreamMask outperforms the previous state-of-the-art by a substantial margin of 2.1% mIoU.},
keywords = {SIGGRAPH},
pubstate = {published},
tppubtype = {conference}
}
Cui, Xuanming; Chionh, Wei Peng; Kuek, Adriel; Lim, Ser-Nam
Improving Soft Unification with Knowledge Graph Embedding Methods Conference
Forty-Second International Conference on Machine Learning, 2025.
Abstract | Tags: ICML | Links:
@conference{Cui2025b,
title = {Improving Soft Unification with Knowledge Graph Embedding Methods},
author = {Xuanming Cui and Wei Peng Chionh and Adriel Kuek and Ser-Nam Lim},
url = {https://openreview.net/forum?id=OOqvY9yvVG},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
abstract = {Neural Theorem Provers (NTPs) present a promising framework for neuro-symbolic reasoning, combining end-to-end differentiability with the interpretability of symbolic logic programming. However, optimizing NTPs remains a significant challenge due to their complex objective landscape and gradient sparcity. On the other hand, Knowledge Graph Embedding (KGE) methods offer smooth optimization with well-defined learning objectives but often lack interpretability. In this work, we propose several strategies to integrate the strengths of NTPs and KGEs. By incorporating KGE objectives into the NTP framework, we demonstrate substantial improvements in both accuracy and computational efficiency.},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Li, Zhuoling; Xu, Xiaogang; Xu, Zhenhua; Lim, Ser-Nam; Zhao, Hengshuang
LARM: Large Auto-Regressive Model for Long-Horizon Embodied Intelligence Conference
Forty-Second International Conference on Machine Learning, 2025.
Abstract | Tags: ICML | Links:
@conference{Li2025,
title = {LARM: Large Auto-Regressive Model for Long-Horizon Embodied Intelligence},
author = {Zhuoling Li and Xiaogang Xu and Zhenhua Xu and Ser-Nam Lim and Hengshuang Zhao},
url = {https://lizhuoling.github.io/LARM_webpage/},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
abstract = {Recent embodied agents are primarily built based on reinforcement learning (RL) or large language models (LLMs). Among them, RL agents are efficient for deployment but only perform very few tasks. By contrast, giant LLM agents (often more than 1000B parameters) present strong generalization while demanding enormous computing resources. In this work, we combine their advantages while avoiding the drawbacks by conducting the proposed referee RL on our developed large auto-regressive model (LARM). Specifically, LARM is built upon a lightweight LLM (fewer than 5B parameters) and directly outputs the next action to execute rather than text. We mathematically reveal that classic RL feedbacks vanish in long-horizon embodied exploration and introduce a giant LLM based referee to handle this reward vanishment during training LARM. In this way, LARM learns to complete diverse open-world tasks without human intervention. Especially, LARM successfully harvests enchanted diamond equipment in Minecraft, which demands significantly longer decision-making chains than the highest achievements of prior best methods.},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Roch, Zachary Andrew; Atia, George; Wang, Yue
Efficient and Scalable Reinforcement Learning for Average Reward under Model Uncertainty Conference
Forty-Second International Conference on Machine Learning, 2025.
Tags: ICML
@conference{Roch2025,
title = {Efficient and Scalable Reinforcement Learning for Average Reward under Model Uncertainty},
author = {Zachary Andrew Roch and George Atia and Yue Wang},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Chi; Jia, Ziying; Atia, George; He, Sihong; Wang, Yue
Pessimism Principle Can Be Effective: Towards a Framework for Zero-Shot Transfer Reinforcement Learning Conference
2025.
Tags: ICML
@conference{Zhang2025b,
title = {Pessimism Principle Can Be Effective: Towards a Framework for Zero-Shot Transfer Reinforcement Learning},
author = {Chi Zhang and Ziying Jia and George Atia and Sihong He and Yue Wang},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Giampouras, Paris; Cai, HanQin; Vidal, René
Guarantees of a Preconditioned Subgradient Algorithm for Overparameterized Asymmetric Low-rank Matrix Recovery Conference
Forty-Second International Conference on Machine Learning, 2025.
Abstract | Tags: ICML | Links:
@conference{Giampouras2025,
title = {Guarantees of a Preconditioned Subgradient Algorithm for Overparameterized Asymmetric Low-rank Matrix Recovery},
author = {Paris Giampouras and HanQin Cai and René Vidal},
url = {https://arxiv.org/abs/2410.16826},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
abstract = {In this paper, we focus on a matrix factorization-based approach for robust low-rank and asymmetric matrix recovery from corrupted measurements. We address the challenging scenario where the rank of the sought matrix is unknown and employ an overparameterized approach using the variational form of the nuclear norm as a regularizer. We propose a subgradient algorithm that inherits the merits of preconditioned algorithms, whose rate of convergence does not depend on the condition number of the sought matrix, and addresses their current limitation, i.e., the lack of convergence guarantees in the case of asymmetric matrices with unknown rank. In this setting, we provide, for the first time in the literature, linear convergence guarantees for the derived overparameterized preconditioned subgradient algorithm in the presence of gross corruptions. Additionally, by applying our approach to matrix sensing, we highlight its merits when the measurement operator satisfies the mixed-norm restricted isometry properties. Lastly, we present numerical experiments that validate our theoretical results and demonstrate the effectiveness of our approach.
},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Chehade, M. F. El Hajj; Ghosal, S. S.; Chakraborty, S.; Reddy, A.; Manocha, Dinesh; Zhu, H.; Bedi, Amrit Singh
Inference-Time Alignment of LLMs via User-Specified Multi-Criteria Transfer Decoding Conference
Forty-Second International Conference on Machine Learning, 2025.
Tags: ICML | Links:
@conference{Chehade2025,
title = {Inference-Time Alignment of LLMs via User-Specified Multi-Criteria Transfer Decoding},
author = {M. F. El Hajj Chehade and S. S. Ghosal and S. Chakraborty and A. Reddy and Dinesh Manocha and H. Zhu and Amrit Singh Bedi},
url = {https://arxiv.org/html/2410.09300v3},
year = {2025},
date = {2025-07-13},
urldate = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Chen, Ziang; Chen, Xiaohan; Liu, Jialin; Wang, Xinshang; Yin, Wotao
Expressive Power of Graph Neural Networks for (Mixed-Integer) Quadratic Programs Conference
2025.
Abstract | Tags: ICML | Links:
@conference{Chen2025b,
title = {Expressive Power of Graph Neural Networks for (Mixed-Integer) Quadratic Programs},
author = {Ziang Chen and Xiaohan Chen and Jialin Liu and Xinshang Wang and Wotao Yin},
url = {https://openreview.net/forum?id=iqd8aHKwGA},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
abstract = {Quadratic programming (QP) is the most widely applied category of problems in nonlinear programming. Many applications require real-time/fast solutions, though not necessarily with high precision. Existing methods either involve matrix decomposition or use the preconditioned conjugate gradient method. For relatively large instances, these methods cannot achieve the real-time requirement unless there is an effective preconditioner. Recently, graph neural networks (GNNs) opened new possibilities for QP. Some promising empirical studies of applying GNNs for QP tasks show that GNNs can capture key characteristics of an optimization instance and provide adaptive guidance accordingly to crucial configurations during the solving process, or directly provide an approximate solution. Despite notable empirical observations, theoretical foundations are still lacking.
In this work, we investigate the expressive or representative power of GNNs, a crucial aspect of neural network theory, specifically in the context of QP tasks, with both continuous and mixed-integer settings. We prove the existence of message-passing GNNs that can reliably represent key properties of quadratic programs, including feasibility, optimal objective value, and optimal solution. Our theory is validated by numerical results.},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Roch, Zachary Andrew; Atia, George; Wang, Yue
Efficient and Scalable Reinforcement Learning for Average Reward under Model Uncertainty Conference
Forty-Second International Conference on Machine Learning, 2025.
Tags: ICML
@conference{Roch2025b,
title = {Efficient and Scalable Reinforcement Learning for Average Reward under Model Uncertainty},
author = {Zachary Andrew Roch and George Atia and Yue Wang},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Chi; Jia, Ziying; Atia, George; He, Sihong; Wang, Yue
Pessimism Principle Can Be Effective: Towards a Framework for Zero-Shot Transfer Reinforcement Learning Conference
2025.
Tags: ICML
@conference{Zhang2025c,
title = {Pessimism Principle Can Be Effective: Towards a Framework for Zero-Shot Transfer Reinforcement Learning},
author = {Chi Zhang and Ziying Jia and George Atia and Sihong He and Yue Wang},
year = {2025},
date = {2025-07-13},
urldate = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Chan-Santiago, Jeffrey A.; Tirupattur, Praveen; Nayak, Gaurav Kumar; Liu, Gaowen; Shah, Mubarak
MGD^3: Mode-Guided Dataset Distillation using Diffusion Models Conference
Forty-Second International Conference on Machine Learning, 2025.
Abstract | Tags: ICML | Links:
@conference{Chan-Santiago2025,
title = {MGD^3: Mode-Guided Dataset Distillation using Diffusion Models},
author = {Jeffrey A. Chan-Santiago and Praveen Tirupattur and Gaurav Kumar Nayak and Gaowen Liu and Mubarak Shah},
url = {https://icml.cc/virtual/2025/poster/45507},
year = {2025},
date = {2025-07-13},
publisher = {Forty-Second International Conference on Machine Learning},
abstract = {Dataset distillation has emerged as an effective strategy, significantly reducing training costs and facilitating more efficient model deployment. Recent advances have leveraged generative models to distill datasets by capturing the underlying data distribution. Unfortunately, existing methods require model fine-tuning with distillation losses to encourage diversity and representativeness. However, these methods do not guarantee sample diversity, limiting their performance. We propose a mode-guided diffusion model leveraging a pre-trained diffusion model without the need to fine-tune with distillation losses. Our approach addresses dataset diversity in three stages: Mode Discovery to identify distinct data modes, Mode Guidance to enhance intra-class diversity, and Stop Guidance to mitigate artifacts in synthetic samples that affect performance. We evaluate our approach on ImageNette, ImageIDC, ImageNet-100, and ImageNet-1K, achieving accuracy improvements of 4.4%, 2.9%, 1.6%, and 1.6%, respectively, over state-of-the-art methods. Our method eliminates the need for fine-tuning diffusion models with distillation losses, significantly reducing computational costs.},
keywords = {ICML},
pubstate = {published},
tppubtype = {conference}
}
Shi, Min; Afzal, Muhammad Muneeb; Huang, Hao; Wen, Congcong; Luo, Yan; Khan, Muhammad Osama; Tian, Yu; Kim, Leo; Fang, Yi; Wang, Mengyu
Equitable Deep Learning for Diabetic Retinopathy Detection Using Multidimensional Retinal Imaging With Fair Adaptive Scaling Journal Article
In: Translational Vision Science & Technology, vol. 14, iss. 7, pp. 1, 2025.
Abstract | Tags: TVST | Links:
@article{Shi2025,
title = {Equitable Deep Learning for Diabetic Retinopathy Detection Using Multidimensional Retinal Imaging With Fair Adaptive Scaling},
author = {Min Shi and Muhammad Muneeb Afzal and Hao Huang and Congcong Wen and Yan Luo and Muhammad Osama Khan and Yu Tian and Leo Kim and Yi Fang and Mengyu Wang},
url = {https://tvst.arvojournals.org/article.aspx?articleid=2803200},
doi = {10.1167/tvst.14.7.1},
year = {2025},
date = {2025-07-01},
journal = {Translational Vision Science & Technology},
volume = {14},
issue = {7},
pages = {1},
abstract = {To investigate the fairness of existing deep models for diabetic retinopathy (DR) detection and introduce an equitable model to reduce group performance disparities. We evaluated the performance and fairness of various deep learning models for DR detection using fundus images and optical coherence tomography (OCT) B-scans. A Fair Adaptive Scaling (FAS) module was developed to reduce group disparities. Model performance was evaluated using the area under the receiver operating characteristic curve (AUC), and equity across various groups was assessed by equity-scaled AUC, which accommodated both overall AUC and AUCs of individual groups. Using color fundus images, the integration of FAS with EfficientNet improved the overall AUC and equity-scaled AUC from 0.88 and 0.83 to 0.90 and 0.84 (P < 0.05) by race. AUCs for Asians and Whites increased by 0.05 and 0.03, respectively (P < 0.01). For gender, both metrics improved by 0.01 (P < 0.05). Using DenseNet121 on OCT B-Scans by race, FAS improved the overall AUC and equity-scaled AUC from 0.875 and 0.81 to 0.884 and 0.82, with gains of 0.03 and 0.02 for Asians and Blacks (P < 0.01). For gender, DenseNet121's metrics rose by 0.04 and 0.03, with gains of 0.05 and 0.04 for females and males (P < 0.01). Deep learning models demonstrate varying accuracies across different groups in DR detection. FAS improves equity and accuracy of deep learning models. The proposed deep learning model has a potential to improve both model performance and equity of DR detection.},
keywords = {TVST},
pubstate = {published},
tppubtype = {article}
}
Wang, Lan; Ao, Wei; Boddeti, Vishnu; Lim, Ser-Nam
Generative Zero-Shot Composed Image Retrieval Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
Abstract | Tags: CVPR | Links:
@conference{Wang2025,
title = {Generative Zero-Shot Composed Image Retrieval},
author = {Lan Wang and Wei Ao and Vishnu Boddeti and Ser-Nam Lim},
url = {https://hal.cse.msu.edu/papers/cig-generative-zero-shot-composed-image-retrieval/
https://hal.cse.msu.edu/assets/pdfs/papers/2025-cvpr-cig-generative-zero-shot-composed-image-retrieval.pdf
https://hal.cse.msu.edu/assets/pdfs/papers/2025-cvpr-cig-generative-zero-shot-composed-image-retrieval-supp.pdf
https://lan-lw.github.io/CIG/},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Composed Image Retrieval (CIR) is a vision-language task utilizing queries comprising images and textual descriptions to achieve precise image retrieval. This task seeks to find images that are visually similar to a reference image while incorporating specific changes or features described textually (visual delta). CIR enables a more flexible and user-specific retrieval by bridging visual data with verbal instructions. This paper introduces a novel generative method that augments Composed Image Retrieval by Composed Image Generation (CIG) to provide pseudotarget images. CIG utilizes a textual inversion network to map reference images into semantic word space, which generates pseudo-target images in combination with textual descriptions. These images serve as additional visual information, significantly improving the accuracy and relevance of retrieved images when integrated into existing retrieval frameworks. Experiments conducted across multiple CIR datasets and several baseline methods demonstrate improvements in retrieval performance, which shows the potential of our approach as an effective add-on for existing composed image retrieval.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Ghosal, Soumya Suvra; Chakraborty, Souradip; Singh, Vaibhav; Guan, Tianrui; Wang, Mengdi; Beirami, Ahmad; Huang, Furong; Velasquez, Alvaro; Manocha, Dinesh; Bedi, Amrit Singh
IMMUNE: Improving Safety Against Jailbreaks in Multi-modal LLMs via Inference-Time Alignment Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
Abstract | Tags: CVPR | Links:
@conference{Ghosal2025,
title = {IMMUNE: Improving Safety Against Jailbreaks in Multi-modal LLMs via Inference-Time Alignment},
author = {Soumya Suvra Ghosal and Souradip Chakraborty and Vaibhav Singh and Tianrui Guan and Mengdi Wang and Ahmad Beirami and Furong Huang and Alvaro Velasquez and Dinesh Manocha and Amrit Singh Bedi},
url = {https://itsvaibhav01.github.io/immune-web/
https://arxiv.org/pdf/2411.18688
https://arxiv.org/abs/2411.18688
https://github.com/itsvaibhav01/Immune},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {With the widespread deployment of Multimodal Large Language Models (MLLMs) for visual-reasoning tasks, improving their safety has become crucial. Recent research indicates that despite training-time safety alignment, these models remain vulnerable to jailbreak attacks. In this work, we first highlight an important safety gap to describe that alignment achieved solely through safety training may be insufficient against jailbreak attacks. To address this vulnerability, we propose Immune, an inference-time defense framework that leverages a safe reward model through controlled decoding to defend against jailbreak attacks. Additionally, we provide a mathematical characterization of Immune, offering insights on why it improves safety against jailbreaks. Extensive evaluations on diverse jailbreak benchmarks using recent MLLMs reveal that Immune effectively enhances model safety while preserving the model's original capabilities. For instance, against text-based jailbreak attacks on LLaVA-1.6, Immune reduces the attack success rate by 57.82% and 16.78% compared to the base MLLM and state-of-the-art defense strategy, respectively.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Liu, Yexin; Liang, Zhengyang; Wang, Yueze; Wu, Xianfeng; Tang, Feilong; He, Muyang; Li, Jian; Liu, Zheng; Yang, Harry; Lim, Ser-Nam; Zhao, Bo
Unveiling the Ignorance of MLLMs: Seeing Clearly, Answering Incorrectly Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2025.
Abstract | Tags: CVPR | Links:
@conference{xinLiu2025,
title = {Unveiling the Ignorance of MLLMs: Seeing Clearly, Answering Incorrectly},
author = {Yexin Liu and Zhengyang Liang and Yueze Wang and Xianfeng Wu and Feilong Tang and Muyang He and Jian Li and Zheng Liu and Harry Yang and Ser-Nam Lim and Bo Zhao},
url = {https://arxiv.org/html/2406.10638v2
https://arxiv.org/pdf/2406.10638v2},
year = {2025},
date = {2025-06-11},
urldate = {2025-06-11},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Multimodal Large Language Models (MLLMs) have displayed remarkable performance in multi-modal tasks, particularly in visual comprehension. However, we reveal that MLLMs often generate incorrect answers even when they understand the visual content. To this end, we manually construct a benchmark with 12 categories and design evaluation metrics that assess the degree of error in MLLM responses even when the visual content is seemingly understood. Based on this benchmark, we test 15 leading MLLMs and analyze the distribution of attention maps and logits of some MLLMs. Our investigation identifies two primary issues: 1) most instruction tuning datasets predominantly feature questions that “directly” relate to the visual content, leading to a bias in MLLMs’ responses to other indirect questions, and 2) MLLMs’ attention to visual tokens is notably lower than to system and question tokens. We further observe that attention scores between questions and visual tokens as well as the model’s confidence in the answers are lower in response to misleading questions than to straightforward ones. To address the first challenge, we introduce a paired positive and negative data construction pipeline to diversify the dataset. For the second challenge, we propose to enhance the model’s focus on visual content during decoding by refining the text and visual prompt. For the text prompt, we propose a content guided refinement strategy that performs preliminary visual content analysis to generate structured information before answering the question. Additionally, we employ a visual attention refinement strategy that highlights question-relevant visual tokens to increase the model’s attention to visual content that aligns with the question. Extensive experiments demonstrate that these challenges can be significantly mitigated with our proposed dataset and techniques. The benchmark, training set, and code will be available.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Cui, Yuning; Zamir, Syed Waqas; Khan, Salman; Knoll, Alois; Shah, Mubarak; Khan, Fahad Shahbaz
AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Cui2025,
title = {AdaIR: Adaptive All-in-One Image Restoration via Frequency Mining and Modulation},
author = {Yuning Cui and Syed Waqas Zamir and Salman Khan and Alois Knoll and Mubarak Shah and Fahad Shahbaz Khan},
url = {https://openreview.net/forum?id=M5t0WvjfCg
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/8311_AdaIR_Adaptive_All_in_One.pdf.pdf},
year = {2025},
date = {2025-04-24},
urldate = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {In the image acquisition process, various forms of degradation, including noise, blur, haze, and rain, are frequently introduced. These degradations typically arise from the inherent limitations of cameras or unfavorable ambient conditions. To recover clean images from their degraded versions, numerous specialized restoration methods have been developed, each targeting a specific type of degradation. Recently, all-in-one algorithms have garnered significant attention by addressing different types of degradations within a single model without requiring the prior information of the input degradation type. However, most methods purely operate in the spatial domain and do not delve into the distinct frequency variations inherent to different degradation types. To address this gap, we propose an adaptive all-in-one image restoration network based on frequency mining and modulation. Our approach is motivated by the observation that different degradation types impact the image content on different frequency subbands, thereby requiring different treatments for each restoration task. Specifically, we first mine low- and high-frequency information from the input features, guided by the adaptively decoupled spectra of the degraded image. The extracted features are then modulated by a bidirectional operator to facilitate interactions between different frequency components. Finally, the modulated features are merged into the original input for a progressively guided restoration. With this approach, the model achieves adaptive reconstruction by accentuating the informative frequency subbands according to different input degradations. Extensive experiments demonstrate that the proposed method, named AdaIR, achieves state-of-the-art performance on different image restoration tasks, including image denoising, dehazing, deraining, motion deblurring, and low-light image enhancement. Our code and models will be made publicly available.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Fioresi, Joseph; Dave, Ishan Rajendrakumar; Shah, Mubarak
ALBAR: Adversarial Learning approach to mitigate Biases in Action Recognition Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Fioresi2025,
title = {ALBAR: Adversarial Learning approach to mitigate Biases in Action Recognition},
author = {Joseph Fioresi and Ishan Rajendrakumar Dave and Mubarak Shah},
url = {https://openreview.net/forum?id=9KiE3t6CsL&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7609_ALBAR_Adversarial_Learnin.pdf},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Bias in machine learning models can lead to unfair decision making, and while it has been well-studied in the image and text domains, it remains underexplored in action recognition. Action recognition models often suffer from background bias (i.e., inferring actions based on background cues) and foreground bias (i.e., relying on subject appearance), which can be detrimental to real-life applications such as autonomous vehicles or assisted living monitoring. While prior approaches have mainly focused on mitigating background bias using specialized augmentations, we thoroughly study both biases. We propose ALBAR, a novel adversarial training method that mitigates foreground and background biases without requiring specialized knowledge of the bias attributes. Our framework applies an adversarial cross-entropy loss to the sampled static clip (where all the frames are the same) and aims to make its class probabilities uniform using a proposed \textit{entropy maximization} loss. Additionally, we introduce a \textit{gradient penalty} loss for regularization against the debiasing process. We evaluate our method on established background and foreground bias protocols, setting a new state-of-the-art and strongly improving combined debiasing performance by over \textbf{12%} on HMDB51. Furthermore, we identify an issue of background leakage in the existing UCF101 protocol for bias evaluation which provides a shortcut to predict actions and does not provide an accurate measure of the debiasing capability of a model. We address this issue by proposing more fine-grained segmentation boundaries for the actor, where our method also outperforms existing approaches.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Chhipa, Prakash Chandra; Vashishtha, Gautam; Jithamanyu, Settur; Saini, Rajkumar; Shah, Mubarak; Liwicki, Marcus
ASTrA: Adversarial Self-supervised Training with Adaptive-Attacks Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Chhipa2025,
title = {ASTrA: Adversarial Self-supervised Training with Adaptive-Attacks},
author = {Prakash Chandra Chhipa and Gautam Vashishtha and Settur Jithamanyu and Rajkumar Saini and Mubarak Shah and Marcus Liwicki},
url = {https://prakashchhipa.github.io/projects/ASTrA
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/7338_ASTrA_Adversarial_Self_su.pdf.pdf},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Existing self-supervised adversarial training (self-AT) methods rely on handcrafted adversarial attack strategies for PGD attacks, which fail to adapt to the evolving learning dynamics of the model and do not account for instancespecific characteristics of images. This results in sub-optimal adversarial robustness and limits the alignment between clean and adversarial data distributions. To address this, we propose ASTrA (Adversarial Self-supervised Training with Adaptive-Attacks), a novel framework introducing a learnable, self-supervised attack strategy network that autonomously discovers optimal attack parameters through exploration-exploitation in a single training episode. ASTrA leverages a reward mechanism based on contrastive loss, optimized with REINFORCE, enabling adaptive attack strategies without labeled data or additional hyperparameters. We further introduce a mixed contrastive objective to align the distribution of clean and adversarial examples in representation space. ASTrA achieves state-of-the-art results on CIFAR10, CIFAR100, and STL10 while integrating seamlessly as a plug-and-play module for other self-AT methods. ASTrAshows scalability to larger datasets, demonstrates strong semi-supervised performance, and is resilient to robust overfitting, backed by explainability analysis on optimal attack strategies. Project page for source code and other details at https://prakashchhipa.github.io/projects/ASTrA.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Chen, Chen; Liu, Daochang; Shah, Mubarak; Xu, Chang
Exploring Local Memorization in Diffusion Models via Bright Ending Attention Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Chen2025,
title = {Exploring Local Memorization in Diffusion Models via Bright Ending Attention},
author = {Chen Chen and Daochang Liu and Mubarak Shah and Chang Xu},
url = {https://openreview.net/forum?id=p4cLtzk4oe&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/9283_Exploring_Local_Memorizat.pdf},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {In this paper, we identify and leverage a novel `bright ending' (BE) anomaly in diffusion models prone to memorizing training images to address a new task: locating localized memorization regions within these models. BE refers to a distinct cross-attention pattern observed in text-to-image generations using diffusion models. Specifically, memorized image patches exhibit significantly greater attention to the end token during the final inference step compared to non-memorized patches. This attention map effectively highlights regions where the generated image replicates training data. Furthermore, driven by our observation that local memorization significantly underperforms in existing tasks of measuring, detecting, and mitigating memorization in diffusion models compared to global memorization, we propose a simple yet effective method to integrate BE and the results of the new localization task into these existing frameworks. This integration effectively improves their performances by narrowing the performance gap caused by local memorization. Our results not only demonstrate the successful execution of the new localization task but also establish new state-of-the-art performance across all existing tasks, underscoring the significance of the BE phenomenon.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Kang, Weitai; Qu, Mengxue; Kini, Jyoti; Wei, Yunchao; Shah, Mubarak; Yan, Yan
Intent3D: 3D Object Detection in RGB-D Scans Based on Human Intention Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Kang2025,
title = {Intent3D: 3D Object Detection in RGB-D Scans Based on Human Intention},
author = {Weitai Kang and Mengxue Qu and Jyoti Kini and Yunchao Wei and Mubarak Shah and Yan Yan},
url = {https://openreview.net/forum?id=5GgjiRzYp3&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DICLR.cc%2F2025%2FConference%2FAuthors%23your-submissions)
https://www.crcv.ucf.edu/wp-content/uploads/2018/11/3450_Intent3D_3D_Object_Detect.pdf.pdf},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {In real-life scenarios, humans seek out objects in the 3D world to fulfill their daily needs or intentions. This inspires us to introduce 3D intention grounding, a new task in 3D object detection employing RGB-D, based on human intention, such as “I want something to support my back.” Closely related, 3D visual grounding focuses on understanding human reference. To achieve detection based on human intention, it relies on humans to observe the scene, reason out the target that aligns with their intention (“pillow” in this case), and finally provide a reference to the AI system, such as “A pillow on the couch”. Instead, 3D intention grounding challenges AI agents to automatically observe, reason and detect the desired target solely based on human intention. To tackle this challenge, we introduce the new Intent3D dataset, consisting of 44,990 intention texts associated with 209 fine-grained classes from 1,042 scenes of the ScanNet [Dai et al., 2017] dataset. We also establish several baselines based on different language-based 3D object detection models on our benchmark. Finally, we propose IntentNet, our unique approach, designed to tackle this intention-based detection problem. It focuses on three key aspects: intention understanding, reasoning to identify object candidates, and cascaded adaptive learning that leverages the intrinsic priority logic of different losses for multiple objective optimization.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Monsefi, Amin Karimi; Zhou, Mengxi; Monsefi, Nastaran Karimi; Lim, Ser-Nam; Chao, Wei-Lun; Ramnath, Rajiv
Frequency-Guided Masking for Enhanced Vision Self-Supervised Learning Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Monsefi2025,
title = {Frequency-Guided Masking for Enhanced Vision Self-Supervised Learning},
author = {Amin Karimi Monsefi and Mengxi Zhou and Nastaran Karimi Monsefi and Ser-Nam Lim and Wei-Lun Chao and Rajiv Ramnath},
url = {https://arxiv.org/abs/2409.10362
https://arxiv.org/pdf/2409.10362},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {We present a novel frequency-based Self-Supervised Learning (SSL) approach that significantly enhances its efficacy for pre-training. Prior work in this direction masks out pre-defined frequencies in the input image and employs a reconstruction loss to pre-train the model. While achieving promising results, such an implementation has two fundamental limitations as identified in our paper. First, using pre-defined frequencies overlooks the variability of image frequency responses. Second, pre-trained with frequency-filtered images, the resulting model needs relatively more data to adapt to naturally looking images during fine-tuning. To address these drawbacks, we propose FOurier transform compression with seLf-Knowledge distillation (FOLK), integrating two dedicated ideas. First, inspired by image compression, we adaptively select the masked-out frequencies based on image frequency responses, creating more suitable SSL tasks for pre-training. Second, we employ a two-branch framework empowered by knowledge distillation, enabling the model to take both the filtered and original images as input, largely reducing the burden of downstream tasks. Our experimental results demonstrate the effectiveness of FOLK in achieving competitive performance to many state-of-the-art SSL methods across various downstream tasks, including image classification, few-shot learning, and semantic segmentation.
},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Huang, Jiani; Li, Ziyang; Naik, Mayur; Lim, Ser-Nam
LASER: A Neuro-Symbolic Framework for Learning Spatio-Temporal Scene Graphs with Weak Supervision Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Huang2025,
title = {LASER: A Neuro-Symbolic Framework for Learning Spatio-Temporal Scene Graphs with Weak Supervision},
author = {Jiani Huang and Ziyang Li and Mayur Naik and Ser-Nam Lim},
url = {https://arxiv.org/abs/2304.07647
https://arxiv.org/pdf/2304.07647},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {We propose LASER, a neuro-symbolic approach to learn semantic video representations that capture rich spatial and temporal properties in video data by leveraging high-level logic specifications. In particular, we formulate the problem in terms of alignment between raw videos and spatio-temporal logic specifications. The alignment algorithm leverages a differentiable symbolic reasoner and a combination of contrastive, temporal, and semantics losses. It effectively and efficiently trains low-level perception models to extract a fine-grained video representation in the form of a spatio-temporal scene graph that conforms to the desired high-level specification. To practically reduce the manual effort of obtaining ground truth labels, we derive logic specifications from captions by employing a large language model with a generic prompting template. In doing so, we explore a novel methodology that weakly supervises the learning of spatio-temporal scene graphs with widely accessible video-caption data. We evaluate our method on three datasets with rich spatial and temporal specifications: 20BN-Something-Something, MUGEN, and OpenPVSG. We demonstrate that our method learns better fine-grained video semantics than existing baselines.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Tang, Feilong; Huang, Zile; Liu, Chengzhi; Sun, Qiang; Yang, Harry; Lim, Ser-Nam
Intervening Anchor Token: Decoding Strategy in Alleviating Hallucinations for MLLMs Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Tang2025,
title = {Intervening Anchor Token: Decoding Strategy in Alleviating Hallucinations for MLLMs},
author = {Feilong Tang and Zile Huang and Chengzhi Liu and Qiang Sun and Harry Yang and Ser-Nam Lim},
url = {https://openreview.net/forum?id=zGb4WgCW5i
https://openreview.net/pdf?id=zGb4WgCW5i},
year = {2025},
date = {2025-04-24},
urldate = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Multimodal large language models (MLLMs) offer a powerful mechanism for interpreting visual information. However, they often suffer from hallucinations, which impede the real-world usage of these models. Existing methods attempt to alleviate this issue by designing special decoding strategies that penalize the summary tokens. However, these methods lack analysis of the relationship between hallucination and summarization mechanism of LLMs. Interestingly, we find that penalizing summary tokens is not necessary: merely intervening the query-key parameters variance, without costing extra inference time, still alleviates hallucinations. Specifically, we explore the causes of hallucinations by analyzing localized self-attention patterns called ``anchor" tokens and define the attention localization degree of the model as token propagation probabilities. Our analysis reveals that over-propagation of anchor tokens occurs when the distribution of eigenvalues of the query and key matrices has a non-zero mean and a polarized variance, leading to excessive dependence on anchor tokens while neglecting vision information and describes the image content with hallucination. Based on the observation, we propose a versatile plug-and-play decoding strategy, Dynamic Token Propagation Mechanism (TAME), to alleviate excessive propagation by dynamically intervening the eigenspectrum variance of the attention weight, thereby alleviating hallucinations without relying on complex decoding strategies. Extensive experiments reveal a correlation between the eigenspectrum and hallucinations across various MLLMs, and show that TAME reduces the percentage of hallucinated objects.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Zhang, Chi; Farhat, Zain Ulabedeen; Atia, George K.; Wang, Yue
Model-Free Offline Reinforcement Learning with Enhanced Robustness Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Zhang2025,
title = {Model-Free Offline Reinforcement Learning with Enhanced Robustness},
author = {Chi Zhang and Zain Ulabedeen Farhat and George K. Atia and Yue Wang},
url = {https://openreview.net/forum?id=QyVLJ7EnAC
https://openreview.net/pdf?id=QyVLJ7EnAC},
year = {2025},
date = {2025-04-24},
urldate = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Offline reinforcement learning (RL) has gained considerable attention for its ability to learn policies from pre-collected data without real-time interaction, which makes it particularly useful for high-risk applications. However, due to its reliance on offline datasets, existing works inevitably introduce assumptions to ensure effective learning, which, however, often lead to a trade-off between robustness to model mismatch and scalability to large environments. In this paper, we enhance both aspects with a novel double-pessimism principle, which conservatively estimates performance and accounts for both limited data and potential model mismatches, two major reasons for the previous trade-off. We then propose a universal, model-free algorithm to learn an optimal policy that is robust to potential environment mismatches, which enhances robustness in a scalable manner. Furthermore, we provide a sample complexity analysis of our algorithm when the mismatch is modeled by the
-norm, which also theoretically demonstrates the efficiency of our method. Extensive experiments further demonstrate that our approach significantly improves robustness in a more scalable manner than existing methods.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Pathak, Priyank; Marjit, Shyam; Vyas, Shruti; Rawat, Yogesh
LR0.FM: Low-Resolution Zero-Shot Classification Benchmark for Foundation Models Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Pathak2025,
title = {LR0.FM: Low-Resolution Zero-Shot Classification Benchmark for Foundation Models},
author = {Priyank Pathak and Shyam Marjit and Shruti Vyas and Yogesh Rawat},
url = {https://arxiv.org/abs/2502.03950
https://arxiv.org/pdf/2502.03950
https://github.com/shyammarjit/LR0.FM},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {Visual-language foundation Models (FMs) exhibit remarkable zero-shot generalization across diverse tasks, largely attributed to extensive pre-training on large-scale datasets. However, their robustness on low-resolution/pixelated (LR) images, a common challenge in real-world scenarios, remains underexplored. We introduce LR0.FM, a comprehensive benchmark evaluating the impact of low resolution on the zero-shot classification performance of 10 FM(s) across 66 backbones and 15 datasets. We propose a novel metric, Weighted Aggregated Robustness, to address the limitations of existing metrics and better evaluate model performance across resolutions and datasets. Our key findings show that: (i) model size positively correlates with robustness to resolution degradation, (ii) pre-training dataset quality is more important than its size, and (iii) fine-tuned and higher resolution models are less robust against LR. Our analysis further reveals that the model makes semantically reasonable predictions at LR, and the lack of fine-grained details in input adversely impacts the model's initial layers more than the deeper layers. We use these insights and introduce a simple strategy, LR-TK0, to enhance the robustness of models without compromising their pre-trained weights. We demonstrate the effectiveness of LR-TK0 for robustness against low-resolution across several datasets and its generalization capability across backbones and other approaches. },
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Kumar, Akash; Kira, Zsolt; Rawat, Yogesh
Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal Video Grounding Conference
Thirteenth International Conference on Learning Representations (ICLR), 2025.
Abstract | Tags: ICLR | Links:
@conference{Kumar2025,
title = {Contextual Self-paced Learning for Weakly Supervised Spatio-Temporal Video Grounding},
author = {Akash Kumar and Zsolt Kira and Yogesh Rawat},
url = {https://openreview.net/pdf?id=yHj6EunfVQ},
year = {2025},
date = {2025-04-24},
booktitle = {Thirteenth International Conference on Learning Representations (ICLR)},
abstract = {In this work, we focus on Weakly Supervised Spatio-Temporal Video Grounding (WSTVG). It is a multimodal task aimed at localizing specific subjects spatiotemporally based on textual queries without bounding box supervision. Motivated by recent advancements in multi-modal foundation models for grounding tasks, we first explore the potential of state-of-the-art object detection models for WSTVG. Despite their robust zero-shot capabilities, our adaptation reveals significant limitations, including inconsistent temporal predictions, inadequate understanding of complex queries, and challenges in adapting to difficult scenarios. We propose CoSPaL (Contextual Self-Paced Learning), a novel approach which is designed to overcome these limitations. CoSPaL integrates three core components: (1) Tubelet Phrase Grounding (TPG), which introduces spatio-temporal prediction by linking textual queries to tubelets; (2) Contextual Referral Grounding (CRG), which improves comprehension of complex queries by extracting contextual information to refine object identification over time; and (3) Self-Paced Scene Understanding (SPS), a training paradigm that progressively increases task difficulty, enabling the model to adapt to complex scenarios by transitioning from coarse to fine-grained understanding. We demonstrate the effectiveness of CoSPaL on three benchmark WSTVG datasets, achieving a 3.9% absolute improvement on VidSTG and a 7.9% improvement on HCSTVG-v1.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
2024
Wang, Yue; Sun, Zhongchang; Zou, Shaofeng
A Unified Principle of Pessimism for Offline Reinforcement Learning under Model Mismatch Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Wang2024,
title = {A Unified Principle of Pessimism for Offline Reinforcement Learning under Model Mismatch},
author = {Yue Wang and Zhongchang Sun and Shaofeng Zou},
url = {https://nips.cc/virtual/2024/poster/94438},
year = {2024},
date = {2024-12-12},
urldate = {2024-12-12},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {In this paper, we address the challenges of offline reinforcement learning (RL) under model mismatch, where the agent aims to optimize its performance through an offline dataset that may not accurately represent the deployment environment. We identify two primary challenges under the setting: inaccurate model estimation due to limited data and performance degradation caused by the model mismatch between the dataset-collecting environment and the target deployment one. To tackle these issues, we propose a unified principle of pessimism using distributionally robust Markov decision processes. We carefully construct a robust MDP with a single uncertainty set to tackle both data sparsity and model mismatch, and demonstrate that the optimal robust policy enjoys a near-optimal sub-optimality gap under the target environment across three widely used uncertainty models: total variation, χ^2 divergence, and KL divergence. Our results improve upon or match the state-of-the-art performance under the total variation and KL divergence models, and provide the first result for the χ^2 divergence model.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Nguyen, Tri; Ibrahim, Shahana; Fu, Xiao
Noisy Label Learning with Instance-Dependent Outliers: Identifiability via Crowd Wisdom Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Nguyen2024,
title = {Noisy Label Learning with Instance-Dependent Outliers: Identifiability via Crowd Wisdom},
author = {Tri Nguyen and Shahana Ibrahim and Xiao Fu},
url = {https://nips.cc/virtual/2024/poster/95831},
year = {2024},
date = {2024-12-12},
urldate = {2024-12-12},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {The generation of label noise is often modeled as a process involving a probability transition matrix (often interpreted as the {\it annotator confusion matrix}) imposed onto the ground-truth label distribution. Under this model, rectifying the label noise and learning the target classifier boil down to identifying the confusion matrix. This line of work demonstrated appealing empirical performance, yet identifiability of the model was mostly established by assuming an instance-invariant confusion matrix. Having an (occasionally) instance-dependent confusion matrix across data samples is apparently more realistic, but inevitably introduces outliers to the model. Our interest lies in confusion matrix-based noisy label learning with such outliers taken into consideration. We begin with pointing out that under the model of interest, detecting the outliers in the presence of a single confusion matrix is fundamentally insufficient. Then, we prove that by employing a crowdsourcing strategy involving multiple annotators, a carefully designed loss function can detect the outliers and identify the desired classifier under reasonable conditions. Our development builds upon a link between the noisy label model and a column-corrupted matrix factorization model---which turns out attesting to the importance of crowdsourced data annotation. Experiments show that our learning scheme substantially improves the outlier detection probability and the learned neural systems' testing accuracy.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Chakraborty, Souradip; Ghosal, Soumya Suvra; Yin, Ming; Manocha, Dinesh; Wang, Mengdi; Bedi, Amrit Singh; Huang, Furong
Transfer Q-star : Principled Decoding for LLM Alignment Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Chakraborty2024b,
title = {Transfer Q-star : Principled Decoding for LLM Alignment},
author = {Souradip Chakraborty and Soumya Suvra Ghosal and Ming Yin and Dinesh Manocha and Mengdi Wang and Amrit Singh Bedi and Furong Huang},
url = {https://neurips.cc/virtual/2024/poster/96588},
year = {2024},
date = {2024-12-12},
urldate = {2024-12-12},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {Aligning foundation models is essential for their safe and trustworthy deployment. However, traditional fine-tuning methods are computationally intensive and require updating billions of model parameters. A promising alternative, alignment via decoding, adjusts the response distribution directly without model updates to maximize a target reward r, thus providing a lightweight and adaptable framework for alignment. However, principled decoding methods rely on oracle access to an optimal Q-function (Q∗), which is often unavailable in practice. Hence, prior SoTA methods either approximate this Q∗ using Qπsft (derived from the reference SFT model) or rely on short-term rewards, resulting in sub-optimal decoding performance. In this work, we propose Transfer Q∗, which implicitly estimates the optimal value function for a target reward
r through a baseline model ρBL aligned with a baseline reward rBL (which can be different from the target reward r). Theoretical analyses of Transfer Q∗ provide a rigorous characterization of its optimality, deriving an upper bound on the sub-optimality gap and identifying a hyperparameter to control the deviation from the pre-trained reference SFT model based on user needs. Our approach significantly reduces the sub-optimality gap observed in prior SoTA methods and demonstrates superior empirical performance across key metrics such as coherence, diversity, and quality in extensive tests on several synthetic and real datasets. },
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Bornstein, Marco; Bedi, Amrit Singh; Mohamed, Abdirisak; Huang, Furong
FACT or Fiction: Can Truthful Mechanisms Eliminate Federated Free Riding? Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Bornstein2024,
title = {FACT or Fiction: Can Truthful Mechanisms Eliminate Federated Free Riding?},
author = {Marco Bornstein and Amrit Singh Bedi and Abdirisak Mohamed and Furong Huang},
url = {https://nips.cc/virtual/2024/poster/95703},
year = {2024},
date = {2024-12-12},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {Standard federated learning (FL) approaches are vulnerable to the free-rider dilemma: participating agents can contribute little to nothing yet receive a well-trained aggregated model. While prior mechanisms attempt to solve the free-rider dilemma, none have addressed the issue of truthfulness. In practice, adversarial agents can provide false information to the server in order to cheat its way out of contributing to federated training. In an effort to make free-riding-averse federated mechanisms truthful, and consequently less prone to breaking down in practice, we propose FACT. FACT is the first federated mechanism that: (1) eliminates federated free riding by using a penalty system, (2) ensures agents provide truthful information by creating a competitive environment, and (3) encourages agent participation by offering better performance than training alone. Empirically, FACT avoids free-riding when agents are untruthful, and reduces agent loss by over 4x.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Chen, Ziang; Liu, Jialin; Chen, Xiaohan; Wang, Xinshang; Yin, Wotao
Rethinking the Capacity of Graph Neural Networks for Branching Strategy Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Chen2024,
title = {Rethinking the Capacity of Graph Neural Networks for Branching Strategy},
author = {Ziang Chen and Jialin Liu and Xiaohan Chen and Xinshang Wang and Wotao Yin},
url = {https://neurips.cc/virtual/2024/poster/95991
https://arxiv.org/pdf/2402.07099},
year = {2024},
date = {2024-12-11},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {Graph neural networks (GNNs) have been widely used to predict properties and heuristics of mixed-integer linear programs (MILPs) and hence accelerate MILP solvers. This paper investigates the capacity of GNNs to represent strong branching (SB), the most effective yet computationally expensive heuristic employed in the branch-and-bound algorithm. In the literature, message-passing GNN (MP-GNN), as the simplest GNN structure, is frequently used as a fast approximation of SB and we find that not all MILPs's SB can be represented with MP-GNN. We precisely define a class of ``MP-tractable'' MILPs for which MP-GNNs can accurately approximate SB scores. Particularly, we establish a universal approximation theorem: for any data distribution over the MP-tractable class, there always exists an MP-GNN that can approximate the SB score with arbitrarily high accuracy and arbitrarily high probability, which lays a theoretical foundation of the existing works on imitating SB with MP-GNN. For MILPs without the MP-tractability, unfortunately, a similar result is impossible, which can be illustrated by two MILP instances with different SB scores that cannot be distinguished by any MP-GNN, regardless of the number of parameters. Recognizing this, we explore another GNN structure called the second-order folklore GNN (2-FGNN) that overcomes this limitation, and the aforementioned universal approximation theorem can be extended to the entire MILP space using 2-FGNN, regardless of the MP-tractability. A small-scale numerical experiment is conducted to directly validate our theoretical findings.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
"MP-tractable" MILPs for which MP-GNNs can accurately approximate SB scores. Particularly, we establish a universal approximation theorem: for any data distribution over the MP-tractable class, there always exists an MP-GNN that can approximate the SB score with arbitrarily high accuracy and arbitrarily high probability, which lays a theoretical foundation of the existing works on imitating SB with MP-GNN. For MILPs without the MP-tractability, unfortunately, a similar result is impossible, which can be illustrated by two MILP instances with different SB scores that cannot be distinguished by any MP-GNN, regardless of the number of parameters. Recognizing this, we explore another GNN structure called the second-order folklore GNN (2-FGNN) that overcomes this limitation, and the aforementioned universal approximation theorem can be extended to the entire MILP space using 2-FGNN, regardless of the MP-tractability. A small-scale numerical experiment is conducted to directly validate our theoretical findings.
Csaba, Botos; Zhang, Wenxuan; Müller, Matthias; Lim, Ser-Nam; Torr, Philip; Bibi, Adel
Label Delay in Online Continual Learning Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Csaba2024,
  title     = {Label Delay in Online Continual Learning},
  author    = {Botos Csaba and Wenxuan Zhang and Matthias Müller and Ser-Nam Lim and Philip Torr and Adel Bibi},
  url       = {https://arxiv.org/abs/2312.00923},
  year      = {2024},
  date      = {2024-12-09},
  urldate   = {2024-12-09},
  publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
  abstract  = {Online continual learning, the process of training models on streaming data, has gained increasing attention in recent years. However, a critical aspect often overlooked is the label delay, where new data may not be labeled due to slow and costly annotation processes. We introduce a new continual learning framework with explicit modeling of the label delay between data and label streams over time steps. In each step, the framework reveals both unlabeled data from the current time step t and labels delayed with d steps, from the time step t−d. In our extensive experiments amounting to 1060 GPU days, we show that merely augmenting the computational resources is insufficient to tackle this challenge. Our findings underline a notable performance decline when solely relying on labeled data when the label delay becomes significant. More surprisingly, when using state-of-the-art SSL and TTA techniques to utilize the newer, unlabeled data, they fail to surpass the performance of a naïve method that simply trains on the delayed supervised stream. To this end, we introduce a simple, efficient baseline that rehearses from the labeled memory samples that are most similar to the new unlabeled samples. This method bridges the accuracy gap caused by label delay without significantly increasing computational complexity. We show experimentally that our method is the least affected by the label delay factor and in some cases successfully recovers the accuracy of the non-delayed counterpart. We conduct various ablations and sensitivity experiments, demonstrating the effectiveness of our approach.},
  keywords  = {NeurIPS},
  pubstate  = {published},
  tppubtype = {conference}
}
Lim, Hui Xian Grace; Cui, Xuanming; Rawat, Yogesh Singh; Lim, Ser-Nam
AirSketch: Generative Motion to Sketch Conference
Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS), 2024.
Abstract | Tags: NeurIPS | Links:
@conference{Lim2024,
title = {AirSketch: Generative Motion to Sketch},
author = {Hui Xian Grace Lim and Xuanming Cui and Yogesh Singh Rawat and Ser-Nam Lim},
url = {https://arxiv.org/pdf/2407.08906
https://www.crcv.ucf.edu/person/rawat/},
year = {2024},
date = {2024-12-09},
urldate = {2024-12-09},
publisher = {Thirty-Eighth Annual Conference on Neural Information Processing Systems (NeurIPS)},
abstract = {Illustration is a fundamental mode of human expression and communication. Certain types of motion that accompany speech can provide this illustrative mode of communication. While Augmented and Virtual Reality technologies (AR/VR) have introduced tools for producing drawings with hand motions (air drawing), they typically require costly hardware and additional digital markers, thereby limiting their accessibility and portability. Furthermore, air drawing demands considerable skill to achieve aesthetic results. To address these challenges, we introduce the concept of AirSketch, aimed at generating faithful and visually coherent sketches directly from hand motions, eliminating the need for complicated headsets or markers. We devise a simple augmentation-based self-supervised training procedure, enabling a controllable image diffusion model to learn to translate from highly noisy hand tracking images to clean, aesthetically pleasing sketches, while preserving the essential visual cues from the original tracking data. We present two air drawing datasets to study this problem. Our findings demonstrate that beyond producing photo-realistic images from precise spatial inputs, controllable image diffusion can effectively produce a refined, clear sketch from a noisy input. Our work serves as an initial step towards marker-less air drawing and reveals distinct applications of controllable diffusion models to AirSketch and AR/VR in general.},
keywords = {NeurIPS},
pubstate = {published},
tppubtype = {conference}
}
Mahajan, Yash; Bansal, Naman; Blanco, Eduardo; Santu, Shubhra Kanti Karmaker
ALIGN-SIM: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings through Semantic Similarity Alignment Conference
Findings of the Association for Computational Linguistics: EMNLP 2024, Association for Computational Linguistics, 2024.
Abstract | Tags: EMNLP | Links:
@conference{Mahajan2024,
title = {ALIGN-SIM: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings through Semantic Similarity Alignment},
author = {Yash Mahajan and Naman Bansal and Eduardo Blanco and Shubhra Kanti Karmaker Santu},
editor = {Yaser Al-Onaizan and Mohit Bansal and Yun-Nung Chen},
url = {https://aclanthology.org/2024.findings-emnlp.436.pdf
https://aclanthology.org/2024.findings-emnlp.436/
https://aclanthology.org/attachments/2024.findings-emnlp.436.software.zip
https://aclanthology.org/attachments/2024.findings-emnlp.436.data.zip},
doi = {10.18653/v1/2024.findings-emnlp.436},
year = {2024},
date = {2024-11-01},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
pages = {7393--7428},
publisher = {Association for Computational Linguistics},
abstract = {Sentence embeddings play a pivotal role in a wide range of NLP tasks, yet evaluating and interpreting these real-valued vectors remains an open challenge to date, especially in a task-free setting. To address this challenge, we introduce a novel task-free test bed for evaluating and interpreting sentence embeddings. Our test bed consists of five semantic similarity alignment criteria, namely, *semantic distinction, synonym replacement, antonym replacement, paraphrasing without negation, and sentence jumbling*. Using these criteria, we examined five classical (e.g., Sentence-BERT, Universal Sentence Encoder (USE), etc.) and eight LLM-induced sentence embedding techniques (e.g., LLaMA2, GPT-3, OLMo, etc.) to test whether their semantic similarity spaces align with what a human mind would naturally expect. Our extensive experiments with 13 different sentence encoders revealed that none of the studied embeddings aligned with all the five semantic similarity alignment criteria. Yet, most encoders performed highly on the SentEval dataset, a popular task-specific benchmark. This finding demonstrates a significant limitation of the current practice in sentence embedding evaluation and associated popular benchmarks, a critical issue that needs careful attention and reassessment by the NLP community. Finally, we conclude the paper by highlighting the utility of the proposed alignment-based test bed for analyzing sentence embeddings in a novel way, especially in a task-free setting.},
keywords = {EMNLP},
pubstate = {published},
tppubtype = {conference}
}
Arul, S. H.; Bedi, A. S.; Manocha, D.
When, What, and with Whom to Communicate: Enhancing RL-based Multi-Robot Navigation through Selective Communication Conference
2024.
Tags: IROS
@conference{Arul2024,
title = {When, What, and with Whom to Communicate: Enhancing RL-based Multi-Robot Navigation through Selective Communication},
author = {S. H. Arul and A. S. Bedi and D. Manocha},
year = {2024},
date = {2024-10-14},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Sun, Xingpeng; Zhang, Yiran; Tang, Xindi; Bedi, Amrit Singh; Bera, Aniket
TrustNavGPT: Trust-Driven Audio-Guided Robot Navigation under Uncertainty with Large Language Models Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) [Oral], 2024.
Abstract | Tags: IROS | Links:
@conference{Sun2024b,
title = {TrustNavGPT: Trust-Driven Audio-Guided Robot Navigation under Uncertainty with Large Language Models},
author = {Xingpeng Sun and Yiran Zhang and Xindi Tang and Amrit Singh Bedi and Aniket Bera},
url = {https://xingpengsun0.github.io/trustnav/},
year = {2024},
date = {2024-10-14},
urldate = {2024-10-14},
publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) [Oral]},
abstract = {Large language models (LLMs) exhibit a wide range of promising capabilities -- from step-by-step planning to commonsense reasoning --that provide utility for robot navigation. However, as humans communicate with robots in the real world, ambiguity and uncertainty may be embedded inside spoken instructions. While LLMs are proficient at processing text in human conversations, they often encounter difficulties with the nuances of verbal instructions and, thus, remain prone to hallucinate trust in human command. In this work, we present TrustNavGPT, an LLM-based audio-guided navigation agent that uses affective cues in spoken communication—elements such as tone and inflection that convey meaning beyond words—allowing it to assess the trustworthiness of human commands and make effective, safe decisions.},
keywords = {IROS},
pubstate = {published},
tppubtype = {conference}
}
Shek, C. L.; Wu, X.; Suttle, W. A.; Busart, C.; Zaroukian, E.; Manocha, D.; Tokekar, P.; Bedi, A. S.
LANCAR: Leveraging Language for Context-Aware Robot Locomotion in Unstructured Environments Conference
IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), 2024.
Tags: IROS | Links:
@conference{Shek2024,
  title     = {LANCAR: Leveraging Language for Context-Aware Robot Locomotion in Unstructured Environments},
  author    = {C. L. Shek and X. Wu and W. A. Suttle and C. Busart and E. Zaroukian and D. Manocha and P. Tokekar and A. S. Bedi},
  url       = {https://www.crcv.ucf.edu/wp-content/uploads/2018/11/2310.00481v2.pdf},
  year      = {2024},
  date      = {2024-10-14},
  publisher = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  keywords  = {IROS},
  pubstate  = {published},
  tppubtype = {conference}
}
Sun, Guangyu; Mendieta, Matias; Dutta, Aritra; Li, Xin; Chen, Chen
Towards Multi-modal Transformers in Federated Learning Conference
The 18th European Conference on Computer Vision ECCV 2024, 2024.
Tags: ECCV | Links:
@conference{Sun2024,
  title     = {Towards Multi-modal Transformers in Federated Learning},
  author    = {Guangyu Sun and Matias Mendieta and Aritra Dutta and Xin Li and Chen Chen},
  url       = {https://arxiv.org/pdf/2404.12467.pdf
https://github.com/imguangyu/FedCola},
  year      = {2024},
  date      = {2024-09-29},
  urldate   = {2024-09-29},
  publisher = {The 18th European Conference on Computer Vision ECCV 2024},
  keywords  = {ECCV},
  pubstate  = {published},
  tppubtype = {conference}
}
Hassan, Md. Mahadi; Salvador, John; Santu, Shubhra Kanti Karmaker; Rahman, Akond
State Reconciliation Defects in Infrastructure as Code Conference
vol. 1, no. 83, Association for Computing Machinery, 2024.
@conference{Hassan2024b,
title = {State Reconciliation Defects in Infrastructure as Code},
author = {Md. Mahadi Hassan and John Salvador and Shubhra Kanti Karmaker Santu and Akond Rahman},
url = {https://dl.acm.org/doi/10.1145/3660790},
doi = {10.1145/3660790},
year = {2024},
date = {2024-07-01},
volume = {1},
number = {83},
issue = {FSE},
pages = {1865 - 1888},
publisher = {Association for Computing Machinery},
abstract = {In infrastructure as code (IaC), state reconciliation is the process of querying and comparing the infrastructure state prior to changing the infrastructure. As state reconciliation is pivotal to manage IaC-based computing infrastructure at scale, defects related to state reconciliation can create large-scale consequences. A categorization of state reconciliation defects, i.e., defects related to state reconciliation, can aid in understanding the nature of state reconciliation defects. We conduct an empirical study with 5,110 state reconciliation defects where we apply qualitative analysis to categorize state reconciliation defects. From the identified defect categories, we derive heuristics to design prompts for a large language model (LLM), which in turn are used for validation of state reconciliation. From our empirical study, we identify 8 categories of state reconciliation defects, amongst which 3 have not been reported for previously-studied software systems. The most frequently occurring defect category is inventory, i.e., the category of defects that occur when managing infrastructure inventory. Using an LLM with heuristics-based paragraph style prompts, we identify 9 previously unknown state reconciliation defects of which 7 have been accepted as valid defects, and 4 have already been fixed. Based on our findings, we conclude the paper by providing a set of recommendations for researchers and practitioners.},
keywords = {FSE},
pubstate = {published},
tppubtype = {conference}
}
Huang, Shuaiyi; Suri, Saksham; Gupta, Kamal; Rambhatla, Sai Saketh; Lim, Ser-Nam; Shrivastava, Abhinav
UVIS: Unsupervised Video Instance Segmentation Workshop
2024.
Tags: CVPR
@workshop{Huang2024,
  title     = {UVIS: Unsupervised Video Instance Segmentation},
  author    = {Shuaiyi Huang and Saksham Suri and Kamal Gupta and Sai Saketh Rambhatla and Ser-Nam Lim and Abhinav Shrivastava},
  editor    = {CVPR Workshop on Learning with Limited Labelled Data},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {workshop}
}
Pham, Khoi; Huynh, Chuong Minh; Lim, Ser-Nam; Shrivastava, Abhinav
Composing Object Relations and Attributes for Image-Text Matching Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Tags: CVPR
@conference{Pham2024,
  title     = {Composing Object Relations and Attributes for Image-Text Matching},
  author    = {Khoi Pham and Chuong Minh Huynh and Ser-Nam Lim and Abhinav Shrivastava},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {conference}
}
Cui, Xuanming; Aparcedo, Alejandro; Jang, Young Kyun; Lim, Ser-Nam
On the Robustness of Large Multimodal Models Against Image Adversarial Attacks Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Cui2024,
  title     = {On the Robustness of Large Multimodal Models Against Image Adversarial Attacks},
  author    = {Xuanming Cui and Alejandro Aparcedo and Young Kyun Jang and Ser-Nam Lim},
  url       = {https://arxiv.org/abs/2312.03777},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
  abstract  = {Recent advances in instruction tuning have led to the development of State-of-the-Art Large Multimodal Models (LMMs). Given the novelty of these models, the impact of visual adversarial attacks on LMMs has not been thoroughly examined. We conduct a comprehensive study of the robustness of various LMMs against different adversarial attacks, evaluated across tasks including image classification, image captioning, and Visual Question Answer (VQA). We find that in general LMMs are not robust to visual adversarial inputs. However, our findings suggest that context provided to the model via prompts, such as questions in a QA pair helps to mitigate the effects of visual adversarial inputs. Notably, the LMMs evaluated demonstrated remarkable resilience to such attacks on the ScienceQA task with only an 8.10% drop in performance compared to their visual counterparts which dropped 99.73%. We also propose a new approach to real-world image classification which we term query decomposition. By incorporating existence queries into our input prompt we observe diminished attack effectiveness and improvements in image classification accuracy. This research highlights a previously under-explored facet of LMM robustness and sets the stage for future work aimed at strengthening the resilience of multimodal systems in adversarial environments.},
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {conference}
}
Pramanick, Shraman; Han, Guangxing; Hou, Rui; Nag, Sayan; Lim, Ser-Nam; Ballas, Nicolas; Wang, Qifan; Chellappa, Rama; Almahairi, Amjad
Jack of All Tasks, Master of Many: Designing General-Purpose Coarse-to-Fine Vision-Language Model Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Pramanick2024,
  title     = {Jack of All Tasks, Master of Many: Designing General-Purpose Coarse-to-Fine Vision-Language Model},
  author    = {Shraman Pramanick and Guangxing Han and Rui Hou and Sayan Nag and Ser-Nam Lim and Nicolas Ballas and Qifan Wang and Rama Chellappa and Amjad Almahairi},
  url       = {https://arxiv.org/abs/2312.12423
https://shramanpramanick.github.io/VistaLLM/},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
  abstract  = {The ability of large language models (LLMs) to process visual inputs has given rise to general-purpose vision systems, unifying various vision-language (VL) tasks by instruction tuning. However, due to the enormous diversity in input-output formats in the vision domain, existing general-purpose models fail to successfully integrate segmentation and multi-image inputs with coarse-level tasks into a single framework. In this work, we introduce VistaLLM, a powerful visual system that addresses coarse- and fine-grained VL tasks over single and multiple input images using a unified framework. VistaLLM utilizes an instruction-guided image tokenizer that filters global embeddings using task descriptions to extract compressed and refined features from numerous images. Moreover, VistaLLM employs a gradient-aware adaptive sampling technique to represent binary segmentation masks as sequences, significantly improving over previously used uniform sampling. To bolster the desired capability of VistaLLM, we curate CoinIt, a comprehensive coarse-to-fine instruction tuning dataset with 6.8M samples. We also address the lack of multi-image grounding datasets by introducing a novel task, AttCoSeg (Attribute-level Co-Segmentation), which boosts the model's reasoning and grounding capability over multiple input images. Extensive experiments on a wide range of V- and VL tasks demonstrate the effectiveness of VistaLLM by achieving consistent state-of-the-art performance over strong baselines across all downstream tasks. },
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {conference}
}
He, Bo; Li, Hengduo; Jang, Young Kyun; Jia, Menglin; Cao, Xuefei; Shah, Ashish; Shrivastava, Abhinav; Lim, Ser-Nam
MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{He2024,
title = {MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding},
author = {Bo He and Hengduo Li and Young Kyun Jang and Menglin Jia and Xuefei Cao and Ashish Shah and Abhinav Shrivastava and Ser-Nam Lim},
url = {https://arxiv.org/abs/2404.05726
https://boheumd.github.io/MA-LMM/},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {With the success of large language models (LLMs), integrating the vision model into LLMs to build vision-language foundation models has gained much more interest recently. However, existing LLM-based large multimodal models (e.g., Video-LLaMA, VideoChat) can only take in a limited number of frames for short video understanding. In this study, we mainly focus on designing an efficient and effective model for long-term video understanding. Instead of trying to process more frames simultaneously like most existing work, we propose to process videos in an online manner and store past video information in a memory bank. This allows our model to reference historical video content for long-term analysis without exceeding LLMs' context length constraints or GPU memory limits. Our memory bank can be seamlessly integrated into current multimodal LLMs in an off-the-shelf manner. We conduct extensive experiments on various video understanding tasks, such as long-video understanding, video question answering, and video captioning, and our model can achieve state-of-the-art performances across multiple datasets. },
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Han, Guangxing; Lim, Ser-Nam
Few-Shot Object Detection with Foundation Models Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Tags: CVPR
@conference{Han2024,
title = {Few-Shot Object Detection with Foundation Models},
author = {Guangxing Han and Ser-Nam Lim},
year = {2024},
date = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Jang, Young Kyun; Kim, Donghyun; Meng, Zihang; Huynh, Dat; Lim, Ser-Nam
Visual Delta Generator for Semi-Supervised Composed Image Retrieval Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Jang2024,
  title     = {Visual Delta Generator for Semi-Supervised Composed Image Retrieval},
  author    = {Young Kyun Jang and Donghyun Kim and Zihang Meng and Dat Huynh and Ser-Nam Lim},
  url       = {https://arxiv.org/pdf/2404.15516v1.pdf},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
  abstract  = {Composed Image Retrieval (CIR) is a task that retrieves images similar to a query, based on a provided textual modification. Current techniques rely on supervised learning for CIR models using labeled triplets of the reference image, text, target image. These specific triplets are not as commonly available as simple image-text pairs, limiting the widespread use of CIR and its scalability. On the other hand, zero-shot CIR can be relatively easily trained with image-caption pairs without considering the image-to-image relation, but this approach tends to yield lower accuracy. We propose a new semi-supervised CIR approach where we search for a reference and its related target images in auxiliary data and learn our large language model-based Visual Delta Generator (VDG) to generate text describing the visual difference (i.e., visual delta) between the two. VDG, equipped with fluent language knowledge and being model agnostic, can generate pseudo triplets to boost the performance of CIR models. Our approach significantly improves the existing supervised learning approaches and achieves state-of-the-art results on the CIR benchmarks.},
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {conference}
}
Yue, Kaiyu; Chen, Bor-Chun; Geiping, Jonas; Li, Hengduo; Goldstein, Tom; Lim, Ser-Nam
Object Recognition as Next Token Prediction Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Yue2024,
  title     = {Object Recognition as Next Token Prediction},
  author    = {Kaiyu Yue and Bor-Chun Chen and Jonas Geiping and Hengduo Li and Tom Goldstein and Ser-Nam Lim},
  url       = {https://arxiv.org/abs/2312.02142
https://github.com/kaiyuyue/nxtp},
  year      = {2024},
  date      = {2024-06-17},
  urldate   = {2024-06-17},
  publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
  abstract  = {We present an approach to pose object recognition as next token prediction. The idea is to apply a language decoder that auto-regressively predicts the text tokens from image embeddings to form labels. To ground this prediction process in auto-regression, we customize a non-causal attention mask for the decoder, incorporating two key features: modeling tokens from different labels to be independent, and treating image tokens as a prefix. This masking mechanism inspires an efficient method - one-shot sampling - to simultaneously sample tokens of multiple labels in parallel and rank generated labels by their probabilities during inference. To further enhance the efficiency, we propose a simple strategy to construct a compact decoder by simply discarding the intermediate blocks of a pretrained language model. This approach yields a decoder that matches the full model's performance while being notably more efficient.},
  keywords  = {CVPR},
  pubstate  = {published},
  tppubtype = {conference}
}
Li, Zhuoling; Xu, Xiaogang; Lim, Ser-Nam; Zhao, Hengshuang
UniMODE: Universal Monocular 3D Object Detection Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Li2024,
title = {UniMODE: Universal Monocular 3D Object Detection},
author = {Zhuoling Li and Xiaogang Xu and Ser-Nam Lim and Hengshuang Zhao},
url = {https://arxiv.org/abs/2402.18573},
year = {2024},
date = {2024-06-17},
urldate = {2024-06-17},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Realizing unified monocular 3D object detection, including both indoor and outdoor scenes, holds great importance in applications like robot navigation. However, involving various scenarios of data to train models poses challenges due to their significantly different characteristics, e.g., diverse geometry properties and heterogeneous domain distributions. To address these challenges, we build a detector based on the bird's-eye-view (BEV) detection paradigm, where the explicit feature projection is beneficial to addressing the geometry learning ambiguity when employing multiple scenarios of data to train detectors. Then, we split the classical BEV detection architecture into two stages and propose an uneven BEV grid design to handle the convergence instability caused by the aforementioned challenges. Moreover, we develop a sparse BEV feature projection strategy to reduce computational cost and a unified domain alignment method to handle heterogeneous domains. Combining these techniques, a unified detector UniMODE is derived, which surpasses the previous state-of-the-art on the challenging Omni3D dataset (a large-scale dataset including both indoor and outdoor scenes) by 4.9% AP_3D, revealing the first successful generalization of a BEV detector to unified 3D object detection.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
Park, Juhyuk; Wang, Ji; Guan, Webster; Gjesteby, Lars A.; Pollack, Dylan; Kamentsky, Lee; Evans, Nicholas B.; Stirman, Jeff; Gu, Xinyi; Zhao, Chuanxi; Marx, Slayton; Kim, Minyoung E.; Choi, Seo Woo; Snyder, Michael; Chavez, David; Su-Arcaro, Clover; Tian, Yuxuan; Park, Chang Sin; Zhang, Qiangge; Yu, Dae Hee; Moukheiber, Mira; Feng, Guoping; Yang, X. William; Keene, C. Dirk; Hof, Patrick R.; Ghosh, Satrajit S.; Frosch, Matthew P.; Brattain, Laura J.; Chung, Kwanghun
Integrated platform for multiscale molecular imaging and phenotyping of the human brain Journal Article
In: Science, vol. 384, iss. 6701, pp. eadh9979, 2024.
Abstract | Tags: | Links:
@article{Park2024,
title = {Integrated platform for multiscale molecular imaging and phenotyping of the human brain},
author = {Juhyuk Park and Ji Wang and Webster Guan and Lars A. Gjesteby and Dylan Pollack and Lee Kamentsky and Nicholas B. Evans and Jeff Stirman and Xinyi Gu and Chuanxi Zhao and Slayton Marx and Minyoung E. Kim and Seo Woo Choi and Michael Snyder and David Chavez and Clover Su-Arcaro and Yuxuan Tian and Chang Sin Park and Qiangge Zhang and Dae Hee Yu and Mira Moukheiber and Guoping Feng and X. William Yang and C. Dirk Keene and Patrick R. Hof and Satrajit S. Ghosh and Matthew P. Frosch and Laura J. Brattain and Kwanghun Chung},
url = {https://www.science.org/doi/10.1126/science.adh9979},
doi = {10.1126/science.adh9979},
year = {2024},
date = {2024-06-14},
journal = {Science},
volume = {384},
issue = {6701},
pages = {eadh9979},
abstract = {Understanding cellular architectures and their connectivity is essential for interrogating system function and dysfunction. However, we lack technologies for mapping the multiscale details of individual cells and their connectivity in the human organ–scale system. We developed a platform that simultaneously extracts spatial, molecular, morphological, and connectivity information of individual cells from the same human brain. The platform includes three core elements: a vibrating microtome for ultraprecision slicing of large-scale tissues without losing cellular connectivity (MEGAtome), a polymer hydrogel–based tissue processing technology for multiplexed multiscale imaging of human organ–scale tissues (mELAST), and a computational pipeline for reconstructing three-dimensional connectivity across multiple brain slabs (UNSLICE). We applied this platform for analyzing human Alzheimer’s disease pathology at multiple scales and demonstrating scalable neural connectivity mapping in the human brain.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Chakraborty, Souradip; Bedi, Amrit Singh; Koppel, Alec; Manocha, Dinesh; Wang, Huazheng; Wang, Mengdi; Huang, Furong
PARL: A Unified Framework for Policy Alignment in Reinforcement Learning Conference
International Conference on Learning Representations (ICLR), Vienna, Austria, 2024.
Abstract | Tags: ICLR | Links:
@conference{Chakraborty2024,
title = {PARL: A Unified Framework for Policy Alignment in Reinforcement Learning},
author = {Souradip Chakraborty and Amrit Singh Bedi and Alec Koppel and Dinesh Manocha and Huazheng Wang and Mengdi Wang and Furong Huang},
url = {https://ai.ucf.edu/wp-content/uploads/2024/01/2308.02585.pdf
https://arxiv.org/abs/2308.02585},
year = {2024},
date = {2024-05-07},
urldate = {2024-05-07},
publisher = {International Conference on Learning Representations (ICLR)},
address = {Vienna, Austria},
abstract = {We present a novel unified bilevel optimization-based framework, PARL, formulated to address the recently highlighted critical issue of policy alignment in reinforcement learning using utility or preference-based feedback. We identify a major gap within current algorithmic designs for solving policy alignment due to a lack of precise characterization of the dependence of the alignment objective on the data generated by policy trajectories. This shortfall contributes to the sub-optimal performance observed in contemporary algorithms. Our framework addressed these concerns by explicitly parameterizing the distribution of the upper alignment objective (reward design) by the lower optimal variable (optimal policy for the designed reward). Interestingly, from an optimization perspective, our formulation leads to a new class of stochastic bilevel problems where the stochasticity at the upper objective depends upon the lower-level variable. To demonstrate the efficacy of our formulation in resolving alignment issues in RL, we devised an algorithm named A-PARL to solve PARL problem, establishing sample complexity bounds of order O(1/T). Our empirical results substantiate that the proposed PARL can address the alignment concerns in RL by showing significant improvements (up to 63% in terms of required samples) for policy alignment in large-scale environments of the Deepmind control suite and Meta world tasks.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Shrivastava, Gaurav; Lim, Ser-Nam; Shrivastava, Abhinav
Video Decomposition Prior: Editing Videos Layer by Layer Conference
Twelfth International Conference on Learning Representations (ICLR), 2024.
Abstract | Tags: ICLR | Links:
@conference{Shrivastava2024,
title = {Video Decomposition Prior: Editing Videos Layer by Layer},
author = {Gaurav Shrivastava and Ser-Nam Lim and Abhinav Shrivastava},
url = {https://ai.ucf.edu/wp-content/uploads/2024/02/2017_video_decomposition_prior_edit.pdf
https://openreview.net/forum?id=nfMyERXNru},
year = {2024},
date = {2024-05-07},
urldate = {2024-05-07},
publisher = {Twelfth International Conference on Learning Representations (ICLR) 2024},
abstract = {In the evolving landscape of video editing methodologies, a majority of deep learning techniques are often reliant on extensive datasets of observed input and ground truth sequence pairs for optimal performance. Such reliance often falters when acquiring data becomes challenging, especially in tasks like video dehazing and relighting, where replicating identical motions and camera angles in both corrupted and ground truth sequences is complicated. Moreover, these conventional methodologies perform best when the test distribution closely mirrors the training distribution. Recognizing these challenges, this paper introduces a novel video decomposition prior `VDP' framework which derives inspiration from professional video editing practices. Our methodology does not mandate task-specific external data corpus collection, instead pivots to utilizing the motion and appearance of the input video. VDP framework decomposes a video sequence into a set of multiple RGB layers and associated opacity levels. These set of layers are then manipulated individually to obtain the desired results. We address tasks such as video object segmentation, dehazing, and relighting. Moreover, we introduce a novel logarithmic video decomposition formulation for video relighting tasks, setting a new benchmark over the existing methodologies. We evaluate our approach on standard video datasets like DAVIS, REVIDE, & SDSD and show qualitative results on a diverse array of internet videos.},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Hassan, Md. Mahadi; Knipper, Alex; Santu, Shubhra Kanti Karmaker
Introducing "Forecast Utterance" for Conversational Data Science Journal Article Forthcoming
In: Transactions on Machine Learning Research, Forthcoming.
Abstract | Tags: TMLR | Links:
@article{Hassan2024,
title = {Introducing ``Forecast Utterance'' for Conversational Data Science},
author = {Md. Mahadi Hassan and Alex Knipper and Shubhra Kanti Karmaker Santu},
url = {https://arxiv.org/abs/2309.03877},
year = {2024},
date = {2024-03-01},
urldate = {2024-03-01},
journal = {Transactions on Machine Learning Research},
abstract = {Envision an intelligent agent capable of assisting users in conducting forecasting tasks through intuitive, natural conversations, without requiring in-depth knowledge of the underlying machine learning (ML) processes. A significant challenge for the agent in this endeavor is to accurately comprehend the user's prediction goals and, consequently, formulate precise ML tasks. In this paper, we take a pioneering step towards this ambitious goal by introducing a new concept called Forecast Utterance and then focus on the automatic and accurate interpretation of users' prediction goals from these utterances. Specifically, we frame the task as a slot-filling problem, where each slot corresponds to a specific aspect of the goal prediction task. We then employ two zero-shot methods for solving the slot-filling task, namely: 1) Entity Extraction (EE), and 2) Question-Answering (QA) techniques. Our experiments, conducted with three meticulously crafted data sets, validate the viability of our ambitious goal and demonstrate the effectiveness of both EE and QA techniques in interpreting Forecast Utterances.},
keywords = {TMLR},
pubstate = {forthcoming},
tppubtype = {article}
}
Dutta, Aritra; Das, Srijan; Nielsen, Jacob; Chakraborty, Rajatsubhra; Shah, Mubarak
Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve Aerial Visual Perception? Conference
IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR), 2024.
Abstract | Tags: CVPR | Links:
@conference{Dutta2024,
title = {Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve Aerial Visual Perception?},
author = {Aritra Dutta and Srijan Das and Jacob Nielsen and Rajatsubhra Chakraborty and Mubarak Shah},
url = {https://mavrec.github.io/
https://mavrec.github.io/src/MAVRec_CVPR2024-12-1-13.pdf
https://drive.google.com/drive/folders/1X7M2EpYpBMA09j-TF8S5mrwTNBGnKhFd?usp=share_link
https://github.com/DVD-dataset/dvdv1-code
https://mavrec.github.io/src/MAVRec_CVPR2024-12-14-23.pdf},
year = {2024},
date = {2024-02-01},
urldate = {2024-02-01},
publisher = {IEEE/CVF Computer Vision and Pattern Recognition Conference (CVPR)},
abstract = {Despite the commercial abundance of UAVs, aerial data acquisition remains challenging, and the existing Asia and North America-centric open-source UAV datasets are small-scale or low-resolution and lack diversity in scene contextuality. Additionally, the color content of the scenes, solar-zenith angle, and population density of different geographies influence the data diversity. These two factors conjointly render suboptimal aerial-visual perception of the deep neural network (DNN) models trained primarily on the ground-view data, including the open-world foundational models.
To pave the way for a transformative era of aerial detection, we present Multiview Aerial Visual RECognition or MAVREC, a video dataset where we record synchronized scenes from different perspectives --- ground camera and drone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard 2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million annotated bounding boxes.~This makes MAVREC the largest ground and aerial-view dataset, and the fourth largest among all drone-based datasets across all modalities and tasks.~Through our extensive benchmarking on MAVREC, we recognize that augmenting object detectors with ground-view images from the corresponding geographical location is a superior pre-training strategy for aerial detection. Building on this strategy, we benchmark MAVREC with a curriculum-based semi-supervised object detection approach that leverages labeled (ground and aerial) and unlabeled (only aerial) images to enhance the aerial detection.},
keywords = {CVPR},
pubstate = {published},
tppubtype = {conference}
}
To pave the way for a transformative era of aerial detection, we present Multiview Aerial Visual RECognition or MAVREC, a video dataset where we record synchronized scenes from different perspectives --- ground camera and drone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard 2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million annotated bounding boxes.~This makes MAVREC the largest ground and aerial-view dataset, and the fourth largest among all drone-based datasets across all modalities and tasks.~Through our extensive benchmarking on MAVEREC, we recognize that augmenting object detectors with ground-view images from the corresponding geographical location is a superior pre-training strategy for aerial detection. Building on this strategy, we benchmark MAVREC with a curriculum-based semi-supervised object detection approach that leverages labeled (ground and aerial) and unlabeled (only aerial) images to enhance the aerial detection.
Bergou, El Houcine; Boucherouite, Soumia; Dutta, Aritra; Li, Xin; Ma, Anna
A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear Systems Journal Article
In: SIAM Journal on Matrix Analysis and Applications, 2024.
Abstract | Tags: SIAM | Links:
@article{Bergou2024,
title = {A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear Systems},
author = {El Houcine Bergou and Soumia Boucherouite and Aritra Dutta and Xin Li and Anna Ma},
url = {https://arxiv.org/abs/2308.16904},
year = {2024},
date = {2024-02-01},
journal = {SIAM Journal on Matrix Analysis and Applications},
abstract = {Large-scale linear systems, Ax=b, frequently arise in practice and demand effective iterative solvers. Often, these systems are noisy due to operational errors or faulty data-collection processes. In the past decade, the randomized Kaczmarz (RK) algorithm has been studied extensively as an efficient iterative solver for such systems. However, the convergence study of RK in the noisy regime is limited and considers measurement noise in the right-hand side vector, b. Unfortunately, in practice, that is not always the case; the coefficient matrix A can also be noisy. In this paper, we analyze the convergence of RK for noisy linear systems when the coefficient matrix, A, is corrupted with both additive and multiplicative noise, along with the noisy vector, b. In our analyses, the quantity $\tilde{R}=\|\tilde{A}^\dagger\|_2^2\|\tilde{A}\|_F^2$ influences the convergence of RK, where $\tilde{A}$ represents a noisy version of A. We claim that our analysis is robust and realistically applicable, as we do not require information about the noiseless coefficient matrix, A, and considering different conditions on noise, we can control the convergence of RK. We substantiate our theoretical findings by performing comprehensive numerical experiments.},
keywords = {SIAM},
pubstate = {published},
tppubtype = {article}
}
Shamsi, K.; Poursafaei, F.; Huang, S.; Ngo, B.; Coskunuzer, B.; Akcora, Cuneyt
Graphpulse: Topological representations for temporal graph property prediction Conference
Twelfth International Conference on Learning Representations (ICLR), 2024.
Abstract | Tags: ICLR | Links:
@conference{Shamsi2024b,
title = {Graphpulse: Topological representations for temporal graph property prediction},
author = {K. Shamsi and F. Poursafaei and S. Huang and B. Ngo and B. Coskunuzer and Cuneyt Akcora},
url = {https://ai.ucf.edu/wp-content/uploads/2024/02/2079_graphpulse_topological_represe.pdf
https://openreview.net/forum?id=DZqic2sPTY},
year = {2024},
date = {2024-01-16},
publisher = {Twelfth International Conference on Learning Representations (ICLR) 2024},
abstract = {Many real-world networks evolve over time, and predicting the evolution of such networks remains a challenging task. Graph Neural Networks (GNNs) have shown empirical success for learning on static graphs, but they lack the ability to effectively learn from nodes and edges with different timestamps. Consequently, the prediction of future properties in temporal graphs remains a relatively under-explored area. In this paper, we aim to bridge this gap by introducing a principled framework, named GraphPulse. The framework combines two important techniques for the analysis of temporal graphs within a Newtonian framework. First, we employ the Mapper method, a key tool in topological data analysis, to extract essential clustering information from graph nodes. Next, we harness the sequential modeling capabilities of Recurrent Neural Networks (RNNs) for temporal reasoning regarding the graph's evolution. Through extensive experimentation, we demonstrate that our model enhances the ROC-AUC metric by 10.2% in comparison to the top-performing state-of-the-art method across various temporal networks. We provide the implementation of GraphPulse at \url{https://anonymous.4open.science/r/Graph_Pulse}},
keywords = {ICLR},
pubstate = {published},
tppubtype = {conference}
}
Bedi, Amrit Singh; Parayil, Anjaly; Zhang, Junyu; Wang, Mengdi; Koppel, Alec
On the Sample Complexity and Metastability of Heavy-tailed Policy Search in Continuous Control Journal Article
In: Journal of Machine Learning Research (JMLR), 2024.
Abstract | Tags: JMLR | Links:
@article{Bedi2024,
title = {On the Sample Complexity and Metastability of Heavy-tailed Policy Search in Continuous Control},
author = {Amrit Singh Bedi and Anjaly Parayil and Junyu Zhang and Mengdi Wang and Alec Koppel},
url = {https://ai.ucf.edu/wp-content/uploads/2024/01/2106.08414.pdf
https://arxiv.org/abs/2106.08414
https://doi.org/10.48550/arXiv.2106.08414},
year = {2024},
date = {2024-01-02},
urldate = {2024-01-02},
journal = {Journal of Machine Learning Research (JMLR)},
abstract = {Reinforcement learning is a framework for interactive decision-making with incentives sequentially revealed across time without a system dynamics model. Due to its scaling to continuous spaces, we focus on policy search where one iteratively improves a parameterized policy with stochastic policy gradient (PG) updates. In tabular Markov Decision Problems (MDPs), under persistent exploration and suitable parameterization, global optimality may be obtained. By contrast, in continuous space, the non-convexity poses a pathological challenge as evidenced by existing convergence results being mostly limited to stationarity or arbitrary local extrema. To close this gap, we step towards persistent exploration in continuous space through policy parameterizations defined by distributions of heavier tails defined by tail-index parameter alpha, which increases the likelihood of jumping in state space. Doing so invalidates smoothness conditions of the score function common to PG. Thus, we establish how the convergence rate to stationarity depends on the policy's tail index alpha, a Holder continuity parameter, integrability conditions, and an exploration tolerance parameter introduced here for the first time. Further, we characterize the dependence of the set of local maxima on the tail index through an exit and transition time analysis of a suitably defined Markov chain, identifying that policies associated with Levy Processes of a heavier tail converge to wider peaks. This phenomenon yields improved stability to perturbations in supervised learning, which we corroborate also manifests in improved performance of policy search, especially when myopic and farsighted incentives are misaligned.},
keywords = {JMLR},
pubstate = {published},
tppubtype = {article}
}
Shamsi, Nina I.; Xu, Alex S.; Gjesteby, Lars A.; Brattain, Laura J.
Improved Topological Preservation in 3D Axon Segmentation and Centerline Detection using Geometric Assessment-driven Topological Smoothing (GATS) Conference
Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision , 2024.
Abstract | Tags: WACV | Links:
@conference{Shamsi2024,
title = {Improved Topological Preservation in 3D Axon Segmentation and Centerline Detection using Geometric Assessment-driven Topological Smoothing (GATS)},
author = {Nina I. Shamsi and Alex S. Xu and Lars A. Gjesteby and Laura J. Brattain},
url = {https://ai.ucf.edu/wp-content/uploads/2024/02/Shamsi_Improved_Topological_Preservation_in_3D_Axon_Segmentation_and_Centerline_Detection_WACV_2024_paper.pdf
https://arxiv.org/abs/2311.04116
https://doi.org/10.48550/arXiv.2311.04116},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
pages = {8005--8014},
abstract = {Automated axon tracing via fully supervised learning requires large amounts of 3D brain imagery, which is time consuming and laborious to obtain. It also requires expertise. Thus, there is a need for more efficient segmentation and centerline detection techniques to use in conjunction with automated annotation tools. Topology-preserving methods ensure that segmented components maintain geometric connectivity, which is especially meaningful for applications where volumetric data is used, and these methods often make use of morphological thinning algorithms as the thinned outputs can be useful for both segmentation and centerline detection of curvilinear structures. Current morphological thinning approaches used in conjunction with topology-preserving methods are prone to over-thinning and require manual configuration of hyperparameters. We propose an automated approach for morphological smoothing using geometric assessment of the radius of tubular structures in brain microscopy volumes, and apply average pooling to prevent over-thinning. We use this approach to formulate a loss function, which we call Geometric Assessment-driven Topological Smoothing loss, or GATS. Our approach increased segmentation and centerline detection evaluation metrics by 2%-5% across multiple datasets, and improved the Betti error rates by 9%. Our ablation study showed that geometric assessment of tubular structures achieved higher segmentation and centerline detection scores, and using average pooling for morphological smoothing in place of thinning algorithms reduced the Betti errors. We observed increased topological preservation during automated annotation of 3D axon volumes from models trained with GATS.},
keywords = {WACV},
pubstate = {published},
tppubtype = {conference}
}
2023
Bergou, El Houcine; Burlachenko, Konstantin; Dutta, Aritra; Richtárik, Peter
Personalized Federated Learning with Communication Compression Journal Article
In: Transactions on Machine Learning Research, 2023.
Abstract | Tags: | Links:
@article{Bergou2023,
title = {Personalized Federated Learning with Communication Compression},
author = {El Houcine Bergou and Konstantin Burlachenko and Aritra Dutta and Peter Richtárik},
url = {https://arxiv.org/abs/2209.05148},
year = {2023},
date = {2023-11-01},
journal = {Transactions on Machine Learning Research},
abstract = {In contrast to training traditional machine learning (ML) models in data centers, federated learning (FL) trains ML models over local datasets contained on resource-constrained heterogeneous edge devices. Existing FL algorithms aim to learn a single global model for all participating devices, which may not be helpful to all devices participating in the training due to the heterogeneity of the data across the devices. Recently, Hanzely and Richtárik (2020) proposed a new formulation for training personalized FL models aimed at balancing the trade-off between the traditional global model and the local models that could be trained by individual devices using their private data only. They derived a new algorithm, called Loopless Gradient Descent (L2GD), to solve it and showed that this algorithm leads to improved communication complexity guarantees in regimes when more personalization is required. In this paper, we equip their L2GD algorithm with a bidirectional compression mechanism to further reduce the communication bottleneck between the local devices and the server. Unlike other compression-based algorithms used in the FL-setting, our compressed L2GD algorithm operates on a probabilistic communication protocol, where communication does not happen on a fixed schedule. Moreover, our compressed L2GD algorithm maintains a similar convergence rate as vanilla SGD without compression. To empirically validate the efficiency of our algorithm, we perform diverse numerical experiments on both convex and non-convex problems and using various compression techniques.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}