This paper presents an automated pipeline for constructing a Vietnamese Visual Question Answering with Natural Language Explanations (VQA-NLE) dataset. The pipeline addresses the challenge of creating high-quality multimodal datasets for low-resource languages like Vietnamese, combining visual understanding with natural language reasoning.
% Conference paper in a Springer proceedings volume (series: Lecture Notes in
% Networks and Systems). In the title, {Vietnamese} and {VQA-NLE} are braced so
% that styles which sentence-case titles keep their capitalisation.
@inproceedings{duong2026vivqax,
  author    = {Duong, Truong-Binh and Tran, Hoang-Minh and Le-Nguyen, Binh-Nam and Duong, Dinh-Thang},
  title     = {An Automated Pipeline for Constructing a {Vietnamese} {VQA-NLE} Dataset},
  booktitle = {Proceedings of the Fifth International Conference on Intelligent Systems and Networks},
  series    = {Lecture Notes in Networks and Systems},
  publisher = {Springer Nature Singapore},
  year      = {2026},
  pages     = {164--173},
  isbn      = {978-981-95-1746-6},
  doi       = {10.1007/978-981-95-1746-6_18},
}