Search dblp for Publications

export results for "audio-visual feature"

 download as .bib file

% Journal article -- Inf. Process. Manag. 61(2), 2024.
@article{DBLP:journals/ipm/MotamediKSEBT24,
  author     = {Elham Motamedi and
                Danial Khosh Kholgh and
                Sorush Saghari and
                Mehdi Elahi and
                Francesco Barile and
                Marko Tkalcic},
  title      = {Predicting movies' eudaimonic and hedonic scores: {A} machine learning
                approach using metadata, audio and visual features},
  journal    = {Inf. Process. Manag.},
  volume     = {61},
  number     = {2},
  pages      = {103610},
  year       = {2024},
  doi        = {10.1016/J.IPM.2023.103610},
  url        = {https://doi.org/10.1016/j.ipm.2023.103610},
  timestamp  = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/ipm/MotamediKSEBT24.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- ACM Trans. Multim. Comput. Commun. Appl. 20(5), 2024.
% Title: QoE and WebRTC-based are brace-protected so that styles which
% sentence-case titles do not render them as "Qoe" / "webrtc-based".
@article{DBLP:journals/tomccap/BingolPFA24,
  author       = {G{\"{u}}lnaziye Bing{\"{o}}l and
                  Simone Porcu and
                  Alessandro Floris and
                  Luigi Atzori},
  title        = {{QoE} Estimation of {WebRTC-based} Audio-visual Conversations from Facial
                  and Speech Features},
  journal      = {{ACM} Trans. Multim. Comput. Commun. Appl.},
  volume       = {20},
  number       = {5},
  pages        = {130:1--130:23},
  year         = {2024},
  url          = {https://doi.org/10.1145/3638251},
  doi          = {10.1145/3638251},
  timestamp    = {Fri, 22 Mar 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/tomccap/BingolPFA24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- MMM 2024, LNCS 14555.
% doi/url hold the raw identifier (plain underscore, no LaTeX escape): styles
% that treat these fields as verbatim would otherwise emit a link containing
% a literal backslash. NOTE(review): a classic style that prints the doi as
% ordinary text needs the url or hyperref package loaded.
@inproceedings{DBLP:conf/mmm/TanGZ24,
  author       = {Sze An Peter Tan and
                  Guangyu Gao and
                  Jia Zhao},
  editor       = {Stevan Rudinac and
                  Alan Hanjalic and
                  Cynthia C. S. Liem and
                  Marcel Worring and
                  Bj{\"{o}}rn {\TH}{\'{o}}r J{\'{o}}nsson and
                  Bei Liu and
                  Yoko Yamakata},
  title        = {Audio-Visual Segmentation by Leveraging Multi-scaled Features Learning},
  booktitle    = {MultiMedia Modeling - 30th International Conference, {MMM} 2024, Amsterdam,
                  The Netherlands, January 29 - February 2, 2024, Proceedings, Part
                  {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {14555},
  pages        = {156--169},
  publisher    = {Springer},
  year         = {2024},
  url          = {https://doi.org/10.1007/978-3-031-53308-2_12},
  doi          = {10.1007/978-3-031-53308-2_12},
  timestamp    = {Sat, 16 Mar 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/mmm/TanGZ24.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2401.17796 (dblp CoRR record), 2024.
% Title: AV-HuBERT is brace-protected so sentence-casing styles keep its
% mixed capitalisation.
@article{DBLP:journals/corr/abs-2401-17796,
  author       = {Xueyuan Chen and
                  Yuejiao Wang and
                  Xixin Wu and
                  Disong Wang and
                  Zhiyong Wu and
                  Xunying Liu and
                  Helen Meng},
  title        = {Exploiting Audio-Visual Features with Pretrained {AV-HuBERT} for Multi-Modal
                  Dysarthric Speech Reconstruction},
  journal      = {CoRR},
  volume       = {abs/2401.17796},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2401.17796},
  doi          = {10.48550/ARXIV.2401.17796},
  eprinttype   = {arXiv},
  eprint       = {2401.17796},
  timestamp    = {Wed, 07 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2401-17796.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Journal article -- Neurocomputing 549, 2023.
@article{DBLP:journals/ijon/LiZ23,
  author     = {Yangke Li and
                Xinman Zhang},
  title      = {Lip landmark-based audio-visual speech enhancement with multimodal
                feature fusion network},
  journal    = {Neurocomputing},
  volume     = {549},
  pages      = {126432},
  year       = {2023},
  doi        = {10.1016/J.NEUCOM.2023.126432},
  url        = {https://doi.org/10.1016/j.neucom.2023.126432},
  timestamp  = {Tue, 07 May 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/ijon/LiZ23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- J. Intell. Fuzzy Syst. 45(5), 2023.
@article{DBLP:journals/jifs/ZhaoZZL23,
  author     = {Yiming Zhao and
                Hongdong Zhao and
                Xuezhi Zhang and
                Weina Liu},
  title      = {Vehicle classification based on audio-visual feature fusion with low-quality
                images and noise},
  journal    = {J. Intell. Fuzzy Syst.},
  volume     = {45},
  number     = {5},
  pages      = {8931--8944},
  year       = {2023},
  doi        = {10.3233/JIFS-232812},
  url        = {https://doi.org/10.3233/jifs-232812},
  timestamp  = {Mon, 18 Mar 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/jifs/ZhaoZZL23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- Multim. Tools Appl. 82(26), 2023.
@article{DBLP:journals/mta/KihalH23,
  author     = {Marouane Kihal and
                Lamia Hamza},
  title      = {Robust multimedia spam filtering based on visual, textual, and audio
                deep features and random forest},
  journal    = {Multim. Tools Appl.},
  volume     = {82},
  number     = {26},
  pages      = {40819--40837},
  year       = {2023},
  doi        = {10.1007/S11042-023-15170-X},
  url        = {https://doi.org/10.1007/s11042-023-15170-x},
  timestamp  = {Thu, 09 Nov 2023 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/mta/KihalH23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- Multim. Tools Appl. 82(23), 2023.
@article{DBLP:journals/mta/MistryBK23,
  author     = {Yogita D. Mistry and
                Gajanan K. Birajdar and
                Archana M. Khodke},
  title      = {Time-frequency visual representation and texture features for audio
                applications: a comprehensive review, recent trends, and challenges},
  journal    = {Multim. Tools Appl.},
  volume     = {82},
  number     = {23},
  pages      = {36143--36177},
  year       = {2023},
  doi        = {10.1007/S11042-023-14734-1},
  url        = {https://doi.org/10.1007/s11042-023-14734-1},
  timestamp  = {Sat, 14 Oct 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/mta/MistryBK23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- Sensors 23(21), 2023.
@article{DBLP:journals/sensors/LiFSLZ23,
  author     = {Guizhu Li and
                Min Fu and
                Mengnan Sun and
                Xuefeng Liu and
                Bing Zheng},
  title      = {A Facial Feature and Lip Movement Enhanced Audio-Visual Speech Separation
                Model},
  journal    = {Sensors},
  volume     = {23},
  number     = {21},
  pages      = {8770},
  year       = {2023},
  doi        = {10.3390/S23218770},
  url        = {https://doi.org/10.3390/s23218770},
  timestamp  = {Sat, 20 Jan 2024 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/sensors/LiFSLZ23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- APSIPA ASC 2023.
% Title: 3D is brace-protected so sentence-casing styles do not render "3d".
@inproceedings{DBLP:conf/apsipa/ChungHWZSL23,
  author       = {Yu{-}Ching Chung and
                  Ji{-}Yan Han and
                  Bo{-}Sin Wang and
                  Wei{-}Zhong Zheng and
                  Kung{-}Yao Shen and
                  Ying{-}Hui Lai},
  title        = {An Audio-Visual Speech Enhancement System Based on {3D} Image Features:
                  An Application in Hearing Aids},
  booktitle    = {Asia Pacific Signal and Information Processing Association Annual
                  Summit and Conference, {APSIPA} {ASC} 2023, Taipei, Taiwan, October
                  31 - Nov. 3, 2023},
  pages        = {1131--1137},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/APSIPAASC58517.2023.10317139},
  doi          = {10.1109/APSIPAASC58517.2023.10317139},
  timestamp    = {Sat, 02 Dec 2023 14:05:45 +0100},
  biburl       = {https://dblp.org/rec/conf/apsipa/ChungHWZSL23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- DAGM GCPR 2023, LNCS 14264.
% doi/url hold the raw identifier (plain underscore, no LaTeX escape): styles
% that treat these fields as verbatim would otherwise emit a link containing
% a literal backslash. NOTE(review): a classic style that prints the doi as
% ordinary text needs the url or hyperref package loaded.
@inproceedings{DBLP:conf/dagm/MerceaHKA23,
  author       = {Otniel{-}Bogdan Mercea and
                  Thomas Hummel and
                  A. Sophia Koepke and
                  Zeynep Akata},
  editor       = {Ullrich K{\"{o}}the and
                  Carsten Rother},
  title        = {Text-to-Feature Diffusion for Audio-Visual Few-Shot Learning},
  booktitle    = {Pattern Recognition - 45th {DAGM} German Conference, {DAGM} {GCPR}
                  2023, Heidelberg, Germany, September 19-22, 2023, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {14264},
  pages        = {491--507},
  publisher    = {Springer},
  year         = {2023},
  url          = {https://doi.org/10.1007/978-3-031-54605-1_32},
  doi          = {10.1007/978-3-031-54605-1_32},
  timestamp    = {Sat, 16 Mar 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/dagm/MerceaHKA23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- EMBC 2023.
@inproceedings{DBLP:conf/embc/SinghTKG23,
  author     = {Prerna Singh and
                Ayush Tripathi and
                Lalan Kumar and
                Tapan Kumar Gandhi},
  title      = {Brain Connectivity Features-based Age Group Classification using Temporal
                Asynchrony Audio-Visual Integration Task},
  booktitle  = {45th Annual International Conference of the {IEEE} Engineering in
                Medicine {\&} Biology Society, {EMBC} 2023, Sydney, Australia,
                July 24-27, 2023},
  pages      = {1--4},
  publisher  = {{IEEE}},
  year       = {2023},
  doi        = {10.1109/EMBC40787.2023.10341177},
  url        = {https://doi.org/10.1109/EMBC40787.2023.10341177},
  timestamp  = {Thu, 11 Jan 2024 15:01:18 +0100},
  biburl     = {https://dblp.org/rec/conf/embc/SinghTKG23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- EUSIPCO 2023.
@inproceedings{DBLP:conf/eusipco/MezzaSS23,
  author     = {Alessandro Ilic Mezza and
                Paolo Sani and
                Augusto Sarti},
  title      = {Automatic {TV} Genre Classification Based on Visually-Conditioned
                Deep Audio Features},
  booktitle  = {31st European Signal Processing Conference, {EUSIPCO} 2023, Helsinki,
                Finland, September 4-8, 2023},
  pages      = {166--170},
  publisher  = {{IEEE}},
  year       = {2023},
  doi        = {10.23919/EUSIPCO58844.2023.10289723},
  url        = {https://doi.org/10.23919/EUSIPCO58844.2023.10289723},
  timestamp  = {Mon, 06 Nov 2023 12:35:15 +0100},
  biburl     = {https://dblp.org/rec/conf/eusipco/MezzaSS23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- HIMI 2023 (part of HCII 2023), LNCS 14015.
% doi/url hold the raw identifier (plain underscore, no LaTeX escape): styles
% that treat these fields as verbatim would otherwise emit a link containing
% a literal backslash. NOTE(review): a classic style that prints the doi as
% ordinary text needs the url or hyperref package loaded.
@inproceedings{DBLP:conf/hci/SetoA23,
  author       = {Kazuki Seto and
                  Yumi Asahi},
  editor       = {Hirohiko Mori and
                  Yumi Asahi},
  title        = {Sound Logo to Increase {TV} Advertising Effectiveness Based on Audio-Visual
                  Features},
  booktitle    = {Human Interface and the Management of Information - Thematic Area,
                  {HIMI} 2023, Held as Part of the 25th {HCI} International Conference,
                  {HCII} 2023, Copenhagen, Denmark, July 23-28, 2023, Proceedings, Part
                  {I}},
  series       = {Lecture Notes in Computer Science},
  volume       = {14015},
  pages        = {136--151},
  publisher    = {Springer},
  year         = {2023},
  url          = {https://doi.org/10.1007/978-3-031-35132-7_10},
  doi          = {10.1007/978-3-031-35132-7_10},
  timestamp    = {Thu, 13 Jul 2023 10:09:58 +0200},
  biburl       = {https://dblp.org/rec/conf/hci/SetoA23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- ICASSP 2023.
@inproceedings{DBLP:conf/icassp/ChenZZSZL23,
  author     = {Hongbo Chen and
                Dongchen Zhu and
                Guanghui Zhang and
                Wenjun Shi and
                Xiaolin Zhang and
                Jiamao Li},
  title      = {{CM-CS:} Cross-Modal Common-Specific Feature Learning For Audio-Visual
                Video Parsing},
  booktitle  = {{IEEE} International Conference on Acoustics, Speech and Signal Processing
                {ICASSP} 2023, Rhodes Island, Greece, June 4-10, 2023},
  pages      = {1--5},
  publisher  = {{IEEE}},
  year       = {2023},
  doi        = {10.1109/ICASSP49357.2023.10097072},
  url        = {https://doi.org/10.1109/ICASSP49357.2023.10097072},
  timestamp  = {Sun, 05 Nov 2023 16:51:21 +0100},
  biburl     = {https://dblp.org/rec/conf/icassp/ChenZZSZL23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- ICASSP 2023.
@inproceedings{DBLP:conf/icassp/JiangCDWL23,
  author     = {Ya Jiang and
                Hang Chen and
                Jun Du and
                Qing Wang and
                Chin{-}Hui Lee},
  title      = {Incorporating Lip Features into Audio-Visual Multi-Speaker {DOA} Estimation
                by Gated Fusion},
  booktitle  = {{IEEE} International Conference on Acoustics, Speech and Signal Processing
                {ICASSP} 2023, Rhodes Island, Greece, June 4-10, 2023},
  pages      = {1--5},
  publisher  = {{IEEE}},
  year       = {2023},
  doi        = {10.1109/ICASSP49357.2023.10095549},
  url        = {https://doi.org/10.1109/ICASSP49357.2023.10095549},
  timestamp  = {Sun, 05 Nov 2023 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/conf/icassp/JiangCDWL23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- ICASSP 2023.
@inproceedings{DBLP:conf/icassp/XuWZYWGFD23,
  author     = {Haitao Xu and
                Liangfa Wei and
                Jie Zhang and
                Jianming Yang and
                Yannan Wang and
                Tian Gao and
                Xin Fang and
                Li{-}Rong Dai},
  title      = {A Multi-Scale Feature Aggregation Based Lightweight Network for Audio-Visual
                Speech Enhancement},
  booktitle  = {{IEEE} International Conference on Acoustics, Speech and Signal Processing
                {ICASSP} 2023, Rhodes Island, Greece, June 4-10, 2023},
  pages      = {1--5},
  publisher  = {{IEEE}},
  year       = {2023},
  doi        = {10.1109/ICASSP49357.2023.10096565},
  url        = {https://doi.org/10.1109/ICASSP49357.2023.10096565},
  timestamp  = {Tue, 23 Apr 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/icassp/XuWZYWGFD23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- ICMI 2023.
@inproceedings{DBLP:conf/icmi/LiLLZTZZ23,
  author     = {Sunan Li and
                Hailun Lian and
                Cheng Lu and
                Yan Zhao and
                Chuangao Tang and
                Yuan Zong and
                Wenming Zheng},
  editor     = {Elisabeth Andr{\'{e}} and
                Mohamed Chetouani and
                Dominique Vaufreydaz and
                Gale M. Lucas and
                Tanja Schultz and
                Louis{-}Philippe Morency and
                Alessandro Vinciarelli},
  title      = {Audio-Visual Group-based Emotion Recognition using Local and Global
                Feature Aggregation based Multi-Task Learning},
  booktitle  = {Proceedings of the 25th International Conference on Multimodal Interaction,
                {ICMI} 2023, Paris, France, October 9-13, 2023},
  pages      = {741--745},
  publisher  = {{ACM}},
  year       = {2023},
  doi        = {10.1145/3577190.3616544},
  url        = {https://doi.org/10.1145/3577190.3616544},
  timestamp  = {Mon, 05 Feb 2024 20:29:19 +0100},
  biburl     = {https://dblp.org/rec/conf/icmi/LiLLZTZZ23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- SPIE Medical Imaging 2023: Computer-Aided Diagnosis, vol. 12465.
% This record carries no pages field.
@inproceedings{DBLP:conf/micad/BhattacharyaP23,
  author     = {Moinak Bhattacharya and
                Prateek Prasanna},
  editor     = {Khan M. Iftekharuddin and
                Weijie Chen},
  title      = {Audio-visual feature fusion for improved thoracic disease classification},
  booktitle  = {Medical Imaging 2023: Computer-Aided Diagnosis, San Diego, CA, USA,
                February 19-23, 2023},
  series     = {{SPIE} Proceedings},
  volume     = {12465},
  publisher  = {{SPIE}},
  year       = {2023},
  doi        = {10.1117/12.2654571},
  url        = {https://doi.org/10.1117/12.2654571},
  timestamp  = {Tue, 19 Mar 2024 12:50:04 +0100},
  biburl     = {https://dblp.org/rec/conf/micad/BhattacharyaP23.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- IEEE SMC 2023.
% Title: the proper noun Mandarin is brace-protected so sentence-casing
% styles do not lowercase it.
@inproceedings{DBLP:conf/smc/WangYGLW23,
  author       = {Jinxin Wang and
                  Chao Yang and
                  Zhongwen Guo and
                  Xiaomei Li and
                  Weigang Wang},
  title        = {An End-to-End {Mandarin} Audio-Visual Speech Recognition Model with
                  a Feature Enhancement Module},
  booktitle    = {{IEEE} International Conference on Systems, Man, and Cybernetics,
                  {SMC} 2023, Honolulu, Oahu, HI, USA, October 1-4, 2023},
  pages        = {572--577},
  publisher    = {{IEEE}},
  year         = {2023},
  url          = {https://doi.org/10.1109/SMC53992.2023.10394108},
  doi          = {10.1109/SMC53992.2023.10394108},
  timestamp    = {Sun, 03 Mar 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/smc/WangYGLW23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- SPECOM 2023, LNCS 14339.
% doi/url hold the raw identifier (plain underscore, no LaTeX escape): styles
% that treat these fields as verbatim would otherwise emit a link containing
% a literal backslash. NOTE(review): a classic style that prints the doi as
% ordinary text needs the url or hyperref package loaded.
@inproceedings{DBLP:conf/specom/NandakishorP23,
  author       = {Salam Nandakishor and
                  Debadatta Pati},
  editor       = {Alexey Karpov and
                  K. Samudravijaya and
                  K. T. Deepak and
                  Rajesh M. Hegde and
                  Shyam S. Agrawal and
                  S. R. Mahadeva Prasanna},
  title        = {Improvement of Audio-Visual Keyword Spotting System Accuracy Using
                  Excitation Source Feature},
  booktitle    = {Speech and Computer - 25th International Conference, {SPECOM} 2023,
                  Dharwad, India, November 29 - December 2, 2023, Proceedings, Part
                  {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {14339},
  pages        = {344--356},
  publisher    = {Springer},
  year         = {2023},
  url          = {https://doi.org/10.1007/978-3-031-48312-7_28},
  doi          = {10.1007/978-3-031-48312-7_28},
  timestamp    = {Tue, 07 May 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/specom/NandakishorP23.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2304.06315 (dblp CoRR record), 2023.
% Same authors and title as DBLP:conf/embc/SinghTKG23 in this file --
% presumably the preprint of that paper; verify before citing both.
@article{DBLP:journals/corr/abs-2304-06315,
  author     = {Prerna Singh and
                Ayush Tripathi and
                Lalan Kumar and
                Tapan Kumar Gandhi},
  title      = {Brain Connectivity Features-based Age Group Classification using Temporal
                Asynchrony Audio-Visual Integration Task},
  journal    = {CoRR},
  volume     = {abs/2304.06315},
  year       = {2023},
  doi        = {10.48550/ARXIV.2304.06315},
  url        = {https://doi.org/10.48550/arXiv.2304.06315},
  eprinttype = {arXiv},
  eprint     = {2304.06315},
  timestamp  = {Thu, 20 Apr 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2304-06315.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2307.04760 (dblp CoRR record), 2023.
@article{DBLP:journals/corr/abs-2307-04760,
  author     = {Sagnik Majumder and
                Ziad Al{-}Halah and
                Kristen Grauman},
  title      = {Learning Spatial Features from Audio-Visual Correspondence in Egocentric
                Videos},
  journal    = {CoRR},
  volume     = {abs/2307.04760},
  year       = {2023},
  doi        = {10.48550/ARXIV.2307.04760},
  url        = {https://doi.org/10.48550/arXiv.2307.04760},
  eprinttype = {arXiv},
  eprint     = {2307.04760},
  timestamp  = {Mon, 24 Jul 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2307-04760.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2309.03869 (dblp CoRR record), 2023.
% Same authors and (up to capitalisation) the same title as
% DBLP:conf/dagm/MerceaHKA23 in this file -- presumably the preprint of that
% paper; verify before citing both.
@article{DBLP:journals/corr/abs-2309-03869,
  author     = {Otniel{-}Bogdan Mercea and
                Thomas Hummel and
                A. Sophia Koepke and
                Zeynep Akata},
  title      = {Text-to-feature diffusion for audio-visual few-shot learning},
  journal    = {CoRR},
  volume     = {abs/2309.03869},
  year       = {2023},
  doi        = {10.48550/ARXIV.2309.03869},
  url        = {https://doi.org/10.48550/arXiv.2309.03869},
  eprinttype = {arXiv},
  eprint     = {2309.03869},
  timestamp  = {Tue, 12 Sep 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2309-03869.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2309.08030 (dblp CoRR record), 2023.
% Title: the system name AV2Wav is brace-protected so sentence-casing styles
% do not render it as "Av2wav".
@article{DBLP:journals/corr/abs-2309-08030,
  author       = {Ju{-}Chieh Chou and
                  Chung{-}Ming Chien and
                  Karen Livescu},
  title        = {{AV2Wav}: Diffusion-Based Re-synthesis from Continuous Self-supervised
                  Features for Audio-Visual Speech Enhancement},
  journal      = {CoRR},
  volume       = {abs/2309.08030},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2309.08030},
  doi          = {10.48550/ARXIV.2309.08030},
  eprinttype   = {arXiv},
  eprint       = {2309.08030},
  timestamp    = {Tue, 26 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2309-08030.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2310.03456 (dblp CoRR record), 2023.
@article{DBLP:journals/corr/abs-2310-03456,
  author     = {Edward Fish and
                Jon Weinbren and
                Andrew Gilbert},
  title      = {Multi-Resolution Audio-Visual Feature Fusion for Temporal Action Localization},
  journal    = {CoRR},
  volume     = {abs/2310.03456},
  year       = {2023},
  doi        = {10.48550/ARXIV.2310.03456},
  url        = {https://doi.org/10.48550/arXiv.2310.03456},
  eprinttype = {arXiv},
  eprint     = {2310.03456},
  timestamp  = {Thu, 19 Oct 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2310-03456.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2310.03827 (dblp CoRR record), 2023.
@article{DBLP:journals/corr/abs-2310-03827,
  author     = {Sneha Muppalla and
                Shan Jia and
                Siwei Lyu},
  title      = {Integrating Audio-Visual Features for Multimodal Deepfake Detection},
  journal    = {CoRR},
  volume     = {abs/2310.03827},
  year       = {2023},
  doi        = {10.48550/ARXIV.2310.03827},
  url        = {https://doi.org/10.48550/arXiv.2310.03827},
  eprinttype = {arXiv},
  eprint     = {2310.03827},
  timestamp  = {Thu, 19 Oct 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2310-03827.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- Vis. Informatics 6(1), 2022.
% Title: the tool name AFExplorer is brace-protected so sentence-casing
% styles do not render it as "Afexplorer".
@article{DBLP:journals/vi/WangSWMZL22,
  author       = {Lei Wang and
                  Guodao Sun and
                  Yunchao Wang and
                  Ji Ma and
                  Xiaomin Zhao and
                  Ronghua Liang},
  title        = {{AFExplorer}: Visual analysis and interactive selection of audio features},
  journal      = {Vis. Informatics},
  volume       = {6},
  number       = {1},
  pages        = {47--55},
  year         = {2022},
  url          = {https://doi.org/10.1016/j.visinf.2022.02.003},
  doi          = {10.1016/J.VISINF.2022.02.003},
  timestamp    = {Thu, 02 Jun 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/vi/WangSWMZL22.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper -- IALP 2022.
@inproceedings{DBLP:conf/ialp/LuDLZZSWY22,
  author     = {Shangjun Lu and
                Xiaoxia Du and
                Juan Liu and
                Yu{-}Mei Zhang and
                Shaofeng Zhao and
                Rongfeng Su and
                Lan Wang and
                Nan Yan},
  editor     = {Rong Tong and
                Yanfeng Lu and
                Minghui Dong and
                Wengao Gong and
                Haizhou Li},
  title      = {A New Method for Predicting Severity Level of Dysarthric Speech Based
                on Joint Feature-Sample Selection using Audio-Visual Data},
  booktitle  = {International Conference on Asian Language Processing, {IALP} 2022,
                Singapore, October 27-28, 2022},
  pages      = {190--195},
  publisher  = {{IEEE}},
  year       = {2022},
  doi        = {10.1109/IALP57159.2022.9961300},
  url        = {https://doi.org/10.1109/IALP57159.2022.9961300},
  timestamp  = {Fri, 09 Dec 2022 16:21:50 +0100},
  biburl     = {https://dblp.org/rec/conf/ialp/LuDLZZSWY22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- Interspeech 2022.
@inproceedings{DBLP:conf/interspeech/HongKYR22,
  author     = {Joanna Hong and
                Minsu Kim and
                Daehun Yoo and
                Yong Man Ro},
  editor     = {Hanseok Ko and
                John H. L. Hansen},
  title      = {Visual Context-driven Audio Feature Enhancement for Robust End-to-End
                Audio-Visual Speech Recognition},
  booktitle  = {Interspeech 2022, 23rd Annual Conference of the International Speech
                Communication Association, Incheon, Korea, 18-22 September 2022},
  pages      = {2838--2842},
  publisher  = {{ISCA}},
  year       = {2022},
  doi        = {10.21437/INTERSPEECH.2022-11311},
  url        = {https://doi.org/10.21437/Interspeech.2022-11311},
  timestamp  = {Wed, 21 Jun 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/interspeech/HongKYR22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- Interspeech 2022.
@inproceedings{DBLP:conf/interspeech/WeiHYLD22,
  author     = {Jie Wei and
                Guanyu Hu and
                Xinyu Yang and
                Anh Tuan Luu and
                Yizhuo Dong},
  editor     = {Hanseok Ko and
                John H. L. Hansen},
  title      = {Audio-Visual Domain Adaptation Feature Fusion for Speech Emotion Recognition},
  booktitle  = {Interspeech 2022, 23rd Annual Conference of the International Speech
                Communication Association, Incheon, Korea, 18-22 September 2022},
  pages      = {1988--1992},
  publisher  = {{ISCA}},
  year       = {2022},
  doi        = {10.21437/INTERSPEECH.2022-703},
  url        = {https://doi.org/10.21437/Interspeech.2022-703},
  timestamp  = {Wed, 21 Jun 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/interspeech/WeiHYLD22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- MWSCAS 2022.
@inproceedings{DBLP:conf/mwscas/FanarasTAM22,
  author     = {Konstantinos Fanaras and
                Antonios Tragoudaras and
                Charalampos Antoniadis and
                Yehia Massoud},
  title      = {Audio-visual Speaker Diarization: Improved Voice Activity Detection
                with {CNN} based Feature Extraction},
  booktitle  = {65th {IEEE} International Midwest Symposium on Circuits and Systems,
                {MWSCAS} 2022, Fukuoka, Japan, August 7-10, 2022},
  pages      = {1--4},
  publisher  = {{IEEE}},
  year       = {2022},
  doi        = {10.1109/MWSCAS54063.2022.9859533},
  url        = {https://doi.org/10.1109/MWSCAS54063.2022.9859533},
  timestamp  = {Mon, 01 May 2023 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/conf/mwscas/FanarasTAM22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Conference paper -- SDF 2022.
@inproceedings{DBLP:conf/sdf/GeeromsAKKVM22,
  author     = {Warre Geeroms and
                Gianni Allebosch and
                Stijn Kindt and
                Loubna Kadri and
                Peter Veelaert and
                Nilesh Madhu},
  title      = {Audio-Visual Active Speaker Identification: {A} comparison of dense
                image-based features and sparse facial landmark-based features},
  booktitle  = {Sensor Data Fusion: Trends, Solutions, Applications, {SDF} 2022, Bonn,
                Germany, October 12-14, 2022},
  pages      = {1--6},
  publisher  = {{IEEE}},
  year       = {2022},
  doi        = {10.1109/SDF55338.2022.9931697},
  url        = {https://doi.org/10.1109/SDF55338.2022.9931697},
  timestamp  = {Fri, 18 Nov 2022 20:51:17 +0100},
  biburl     = {https://dblp.org/rec/conf/sdf/GeeromsAKKVM22.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2203.02655 (dblp CoRR record), 2022.
@article{DBLP:journals/corr/abs-2203-02655,
  author     = {Junwen Xiong and
                Peng Zhang and
                Lei Xie and
                Wei Huang and
                Yufei Zha and
                Yanning Zhang},
  title      = {Audio-visual speech separation based on joint feature representation
                with cross-modal attention},
  journal    = {CoRR},
  volume     = {abs/2203.02655},
  year       = {2022},
  doi        = {10.48550/ARXIV.2203.02655},
  url        = {https://doi.org/10.48550/arXiv.2203.02655},
  eprinttype = {arXiv},
  eprint     = {2203.02655},
  timestamp  = {Fri, 10 Mar 2023 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2203-02655.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% arXiv preprint 2203.15183 (dblp CoRR record), 2022.
% Title: the model name Wav2vec is brace-protected so sentence-casing styles
% keep its capitalisation.
@article{DBLP:journals/corr/abs-2203-15183,
  author       = {Jialu Li and
                  Mark Hasegawa{-}Johnson and
                  Nancy L. McElwain},
  title        = {Visualizations of Complex Sequences of Family-Infant Vocalizations
                  Using Bag-of-Audio-Words Approach Based on {Wav2vec} 2.0 Features},
  journal      = {CoRR},
  volume       = {abs/2203.15183},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2203.15183},
  doi          = {10.48550/ARXIV.2203.15183},
  eprinttype   = {arXiv},
  eprint       = {2203.15183},
  timestamp    = {Tue, 05 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2203-15183.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2207.06020 (dblp CoRR record), 2022.
% Same authors and title as DBLP:conf/interspeech/HongKYR22 in this file --
% presumably the preprint of that paper; verify before citing both.
@article{DBLP:journals/corr/abs-2207-06020,
  author     = {Joanna Hong and
                Minsu Kim and
                Daehun Yoo and
                Yong Man Ro},
  title      = {Visual Context-driven Audio Feature Enhancement for Robust End-to-End
                Audio-Visual Speech Recognition},
  journal    = {CoRR},
  volume     = {abs/2207.06020},
  year       = {2022},
  doi        = {10.48550/ARXIV.2207.06020},
  url        = {https://doi.org/10.48550/arXiv.2207.06020},
  eprinttype = {arXiv},
  eprint     = {2207.06020},
  timestamp  = {Sun, 02 Oct 2022 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2207-06020.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
% Journal article -- IEEE Access 9, 2021.
@article{DBLP:journals/access/IshikawaHS21,
  author     = {Reina Ishikawa and
                Ryo Hachiuma and
                Hideo Saito},
  title      = {Self-Supervised Audio-Visual Feature Learning for Single-Modal Incremental
                Terrain Type Clustering},
  journal    = {{IEEE} Access},
  volume     = {9},
  pages      = {64346--64357},
  year       = {2021},
  doi        = {10.1109/ACCESS.2021.3075582},
  url        = {https://doi.org/10.1109/ACCESS.2021.3075582},
  timestamp  = {Sun, 16 May 2021 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/access/IshikawaHS21.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/eetct/MehtaKK21,
  author       = {Pooja Mehta and
                  Sahil Kaswan and
                  Jaspreet Kaur},
  title        = {An Enhanced {ANN-HMM} based classification of video recordings with
                  the aid of audio-visual feature extraction},
  journal      = {{EAI} Endorsed Trans. Creative Technol.},
  volume       = {8},
  number       = {28},
  pages        = {e1},
  year         = {2021},
  url          = {https://doi.org/10.4108/eai.31-3-2021.169172},
  doi          = {10.4108/EAI.31-3-2021.169172},
  timestamp    = {Mon, 25 Oct 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/eetct/MehtaKK21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/entropy/WilkesVM21,
  author       = {Ben Wilkes and
                  Igor Vatolkin and
                  Heinrich M{\"{u}}ller},
  title        = {Statistical and Visual Analysis of Audio, Text, and Image Features
                  for Multi-Modal Music Genre Recognition},
  journal      = {Entropy},
  volume       = {23},
  number       = {11},
  pages        = {1502},
  year         = {2021},
  url          = {https://doi.org/10.3390/e23111502},
  doi          = {10.3390/E23111502},
  timestamp    = {Wed, 15 Dec 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/entropy/WilkesVM21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/fi/IsobeTHGN21,
  author       = {Shinnosuke Isobe and
                  Satoshi Tamura and
                  Satoru Hayamizu and
                  Yuuto Gotoh and
                  Masaki Nose},
  title        = {Multi-Angle Lipreading with Angle Classification-Based Feature Extraction
                  and Its Application to Audio-Visual Speech Recognition},
  journal      = {Future Internet},
  volume       = {13},
  number       = {7},
  pages        = {182},
  year         = {2021},
  url          = {https://doi.org/10.3390/fi13070182},
  doi          = {10.3390/FI13070182},
  timestamp    = {Thu, 12 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/fi/IsobeTHGN21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijcini/HandaAK21,
  author       = {Anand Handa and
                  Rashi Agarwal and
                  Narendra Kohli},
  title        = {Audio-Visual Emotion Recognition System Using Multi-Modal Features},
  journal      = {Int. J. Cogn. Informatics Nat. Intell.},
  volume       = {15},
  number       = {4},
  pages        = {1--14},
  year         = {2021},
  url          = {https://doi.org/10.4018/ijcini.20211001.oa34},
  doi          = {10.4018/IJCINI.20211001.OA34},
  timestamp    = {Tue, 28 Feb 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/ijcini/HandaAK21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/sensors/QuanMN21,
  author       = {Jingyu Quan and
                  Yoshihiro Miyake and
                  Takayuki Nozawa},
  title        = {Incorporating Interpersonal Synchronization Features for Automatic
                  Emotion Recognition from Visual and Audio Data during Communication},
  journal      = {Sensors},
  volume       = {21},
  number       = {16},
  pages        = {5317},
  year         = {2021},
  url          = {https://doi.org/10.3390/s21165317},
  doi          = {10.3390/S21165317},
  timestamp    = {Wed, 01 Sep 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/sensors/QuanMN21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/sivp/DebnathR21,
  author       = {Saswati Debnath and
                  Pinki Roy},
  title        = {Appearance and shape-based hybrid visual feature extraction: toward
                  audio-visual automatic speech recognition},
  journal      = {Signal Image Video Process.},
  volume       = {15},
  number       = {1},
  pages        = {25--32},
  year         = {2021},
  url          = {https://doi.org/10.1007/s11760-020-01717-0},
  doi          = {10.1007/S11760-020-01717-0},
  timestamp    = {Thu, 14 Oct 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/sivp/DebnathR21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/speech/FarhoudiS21,
  author       = {Zeinab Farhoudi and
                  Saeed Setayeshi},
  title        = {Fusion of deep learning features with mixture of brain emotional learning
                  for audio-visual emotion recognition},
  journal      = {Speech Commun.},
  volume       = {127},
  pages        = {92--103},
  year         = {2021},
  url          = {https://doi.org/10.1016/j.specom.2020.12.001},
  doi          = {10.1016/J.SPECOM.2020.12.001},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/speech/FarhoudiS21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cascon/ZhouGMSRY21,
  author       = {Jingmin Zhou and
                  Adam Gariba and
                  Vida Movahedi and
                  Mariah Martin Shein and
                  Andre Rosa and
                  Ruiqi Yu},
  editor       = {Vio Onut and
                  Farhana H. Zulkernine},
  title        = {Multi-label video categorization using visual and audio transcript
                  features},
  booktitle    = {{CASCON} '21: Proceedings of the 31st Annual International Conference
                  on Computer Science and Software Engineering, Toronto, Ontario, Canada,
                  November 22 - 25, 2021},
  pages        = {23--32},
  publisher    = {{ACM}},
  year         = {2021},
  url          = {https://dl.acm.org/doi/10.5555/3507788.3507793},
  doi          = {10.5555/3507788.3507793},
  timestamp    = {Fri, 29 Jul 2022 16:56:38 +0200},
  biburl       = {https://dblp.org/rec/conf/cascon/ZhouGMSRY21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cis/Wang21,
  author       = {Yufei Wang},
  title        = {Efficient Audio-Visual Speaker Recognition Via Deep Multi-Modal Feature
                  Fusion},
  booktitle    = {17th International Conference on Computational Intelligence and Security
                  {CIS} 2021, Chengdu, China, November 19-22, 2021},
  pages        = {99--103},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/CIS54983.2021.00029},
  doi          = {10.1109/CIS54983.2021.00029},
  timestamp    = {Wed, 16 Feb 2022 17:26:48 +0100},
  biburl       = {https://dblp.org/rec/conf/cis/Wang21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/ShetuCH21,
  author       = {Shrishti Saha Shetu and
                  Soumitro Chakrabarty and
                  Emanu{\"{e}}l Anco Peter Habets},
  title        = {An Empirical Study of Visual Features for {DNN} Based Audio-Visual
                  Speech Enhancement in Multi-Talker Environments},
  booktitle    = {{IEEE} International Conference on Acoustics, Speech and Signal Processing,
                  {ICASSP} 2021, Toronto, ON, Canada, June 6-11, 2021},
  pages        = {8418--8422},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/ICASSP39728.2021.9414000},
  doi          = {10.1109/ICASSP39728.2021.9414000},
  timestamp    = {Fri, 09 Jul 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/ShetuCH21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ijcnn/ChengTQJ21,
  author       = {Huijie Cheng and
                  Yun Tie and
                  Lin Qi and
                  Cong Jin},
  title        = {Context-Aware Based Visual-Audio Feature Fusion for Emotion Recognition},
  booktitle    = {International Joint Conference on Neural Networks, {IJCNN} 2021, Shenzhen,
                  China, July 18-22, 2021},
  pages        = {1--8},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/IJCNN52387.2021.9533473},
  doi          = {10.1109/IJCNN52387.2021.9533473},
  timestamp    = {Thu, 17 Mar 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ijcnn/ChengTQJ21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ijcnn/ZhangXSHQX21,
  author       = {Peng Zhang and
                  Jiaming Xu and
                  Jing Shi and
                  Yunzhe Hao and
                  Lei Qin and
                  Bo Xu},
  title        = {Audio-Visual Speech Separation with Visual Features Enhanced by Adversarial
                  Training},
  booktitle    = {International Joint Conference on Neural Networks, {IJCNN} 2021, Shenzhen,
                  China, July 18-22, 2021},
  pages        = {1--8},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/IJCNN52387.2021.9533660},
  doi          = {10.1109/IJCNN52387.2021.9533660},
  timestamp    = {Wed, 29 Sep 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/ijcnn/ZhangXSHQX21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mm/TaoPDQS021,
  author       = {Ruijie Tao and
                  Zexu Pan and
                  Rohan Kumar Das and
                  Xinyuan Qian and
                  Mike Zheng Shou and
                  Haizhou Li},
  editor       = {Heng Tao Shen and
                  Yueting Zhuang and
                  John R. Smith and
                  Yang Yang and
                  Pablo C{\'{e}}sar and
                  Florian Metze and
                  Balakrishnan Prabhakaran},
  title        = {Is Someone Speaking?: Exploring Long-term Temporal Features for Audio-visual
                  Active Speaker Detection},
  booktitle    = {{MM} '21: {ACM} Multimedia Conference, Virtual Event, China, October
                  20 - 24, 2021},
  pages        = {3927--3935},
  publisher    = {{ACM}},
  year         = {2021},
  url          = {https://doi.org/10.1145/3474085.3475587},
  doi          = {10.1145/3474085.3475587},
  timestamp    = {Mon, 22 Apr 2024 21:24:20 +0200},
  biburl       = {https://dblp.org/rec/conf/mm/TaoPDQS021.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/visapp/HuMOM21,
  author       = {Feiyan Hu and
                  Eva Mohedano and
                  Noel E. O'Connor and
                  Kevin McGuinness},
  editor       = {Giovanni Maria Farinella and
                  Petia Radeva and
                  Jos{\'{e}} Braz and
                  Kadi Bouatouch},
  title        = {Temporal Bilinear Encoding Network of Audio-visual Features at Low
                  Sampling Rates},
  booktitle    = {Proceedings of the 16th International Joint Conference on Computer
                  Vision, Imaging and Computer Graphics Theory and Applications, {VISIGRAPP}
                  2021, Volume 5: {VISAPP}, Online Streaming, February 8-10, 2021},
  pages        = {637--644},
  publisher    = {{SCITEPRESS}},
  year         = {2021},
  url          = {https://doi.org/10.5220/0010337306370644},
  doi          = {10.5220/0010337306370644},
  timestamp    = {Tue, 06 Jun 2023 14:58:00 +0200},
  biburl       = {https://dblp.org/rec/conf/visapp/HuMOM21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wacv/MazumderSPN21,
  author       = {Pratik Mazumder and
                  Pravendra Singh and
                  Kranti Kumar Parida and
                  Vinay P. Namboodiri},
  title        = {{AVGZSLNet}: Audio-Visual Generalized Zero-Shot Learning by Reconstructing
                  Label Features from Multi-Modal Embeddings},
  booktitle    = {{IEEE} Winter Conference on Applications of Computer Vision, {WACV}
                  2021, Waikoloa, HI, USA, January 3-8, 2021},
  pages        = {3089--3098},
  publisher    = {{IEEE}},
  year         = {2021},
  url          = {https://doi.org/10.1109/WACV48630.2021.00313},
  doi          = {10.1109/WACV48630.2021.00313},
  timestamp    = {Mon, 03 Jan 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/wacv/MazumderSPN21.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2101-05975,
  author       = {Xinmeng Xu and
                  Dongxiang Xu and
                  Jie Jia and
                  Yang Wang and
                  Binbin Chen},
  title        = {{MFFCN:} Multi-layer Feature Fusion Convolution Network for Audio-visual
                  Speech Enhancement},
  journal      = {CoRR},
  volume       = {abs/2101.05975},
  year         = {2021},
  url          = {https://arxiv.org/abs/2101.05975},
  eprinttype   = {arXiv},
  eprint       = {2101.05975},
  timestamp    = {Fri, 20 Jan 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2101-05975.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2101-06268,
  author       = {Xinmeng Xu and
                  Yang Wang and
                  Dongxiang Xu and
                  Cong Zhang and
                  Yiyuan Peng and
                  Jie Jia and
                  Binbin Chen},
  title        = {Attentional Multi-layer Feature Fusion Convolution Network for Audio-visual
                  Speech Enhancement},
  journal      = {CoRR},
  volume       = {abs/2101.06268},
  year         = {2021},
  url          = {https://arxiv.org/abs/2101.06268},
  eprinttype   = {arXiv},
  eprint       = {2101.06268},
  timestamp    = {Fri, 20 Jan 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2101-06268.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2103-14189,
  author       = {Taylan K. Sen and
                  Gazi Naven and
                  Luke Gerstner and
                  Daryl Bagley and
                  Raiyan Abdul Baten and
                  Wasifur Rahman and
                  Md. Kamrul Hasan and
                  Kurtis Glenn Haut and
                  Abdullah Al Mamun and
                  Samiha Samrose and
                  Anne Solbu and
                  R. Eric Barnes and
                  Mark G. Frank and
                  Ehsan Hoque},
  title        = {{DBATES:} DataBase of Audio features, Text, and visual Expressions
                  in competitive debate Speeches},
  journal      = {CoRR},
  volume       = {abs/2103.14189},
  year         = {2021},
  url          = {https://arxiv.org/abs/2103.14189},
  eprinttype   = {arXiv},
  eprint       = {2103.14189},
  timestamp    = {Thu, 17 Nov 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2103-14189.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2107-06592,
  author       = {Ruijie Tao and
                  Zexu Pan and
                  Rohan Kumar Das and
                  Xinyuan Qian and
                  Mike Zheng Shou and
                  Haizhou Li},
  title        = {Is Someone Speaking? {Exploring} Long-term Temporal Features for Audio-visual
                  Active Speaker Detection},
  journal      = {CoRR},
  volume       = {abs/2107.06592},
  year         = {2021},
  url          = {https://arxiv.org/abs/2107.06592},
  eprinttype   = {arXiv},
  eprint       = {2107.06592},
  timestamp    = {Thu, 22 Jul 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2107-06592.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/computers/FeradovMG20,
  author       = {Firgan Feradov and
                  Iosif Mporas and
                  Todor Ganchev},
  title        = {Evaluation of Features in Detection of Dislike Responses to Audio-Visual
                  Stimuli from {EEG} Signals},
  journal      = {Comput.},
  volume       = {9},
  number       = {2},
  pages        = {33},
  year         = {2020},
  url          = {https://doi.org/10.3390/computers9020033},
  doi          = {10.3390/COMPUTERS9020033},
  timestamp    = {Sat, 05 Sep 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/computers/FeradovMG20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijon/HaoCLWX20,
  author       = {Man Hao and
                  Weihua Cao and
                  Zhentao Liu and
                  Min Wu and
                  Peng Xiao},
  title        = {Visual-audio emotion recognition based on multi-task and ensemble
                  learning with multiple features},
  journal      = {Neurocomputing},
  volume       = {391},
  pages        = {42--51},
  year         = {2020},
  url          = {https://doi.org/10.1016/j.neucom.2020.01.048},
  doi          = {10.1016/J.NEUCOM.2020.01.048},
  timestamp    = {Fri, 22 May 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ijon/HaoCLWX20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijon/WangWH20a,
  author       = {Zhan Wang and
                  Lizhi Wang and
                  Hua Huang},
  title        = {Joint low rank embedded multiple features learning for audio-visual
                  emotion recognition},
  journal      = {Neurocomputing},
  volume       = {388},
  pages        = {324--333},
  year         = {2020},
  url          = {https://doi.org/10.1016/j.neucom.2020.01.017},
  doi          = {10.1016/J.NEUCOM.2020.01.017},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/ijon/WangWH20a.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/npl/KhanSAT20,
  author       = {Abdullah Aman Khan and
                  Jie Shao and
                  Waqar Ali and
                  Saifullah Tumrani},
  title        = {Content-Aware Summarization of Broadcast Sports Videos: An Audio-Visual
                  Feature Extraction Approach},
  journal      = {Neural Process. Lett.},
  volume       = {52},
  number       = {3},
  pages        = {1945--1968},
  year         = {2020},
  url          = {https://doi.org/10.1007/s11063-020-10200-3},
  doi          = {10.1007/S11063-020-10200-3},
  timestamp    = {Tue, 28 Mar 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/npl/KhanSAT20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/sensors/JamilFRUBFM20,
  author       = {Sonain Jamil and
                  Fawad and
                  MuhibUr Rahman and
                  Amin Ullah and
                  Salman Badnava and
                  Masoud Forsat and
                  Seyed Sajad Mirjavadi},
  title        = {Malicious {UAV} Detection Using Integrated Audio and Visual Features
                  for Public Safety Applications},
  journal      = {Sensors},
  volume       = {20},
  number       = {14},
  pages        = {3923},
  year         = {2020},
  url          = {https://doi.org/10.3390/s20143923},
  doi          = {10.3390/S20143923},
  timestamp    = {Thu, 13 Aug 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/sensors/JamilFRUBFM20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/taslp/SuLWY20,
  author       = {Rongfeng Su and
                  Xunying Liu and
                  Lan Wang and
                  Jingzhou Yang},
  title        = {Cross-Domain Deep Visual Feature Generation for {Mandarin} Audio-Visual
                  Speech Recognition},
  journal      = {{IEEE} {ACM} Trans. Audio Speech Lang. Process.},
  volume       = {28},
  pages        = {185--197},
  year         = {2020},
  url          = {https://doi.org/10.1109/TASLP.2019.2950602},
  doi          = {10.1109/TASLP.2019.2950602},
  timestamp    = {Fri, 13 Mar 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/taslp/SuLWY20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/fgr/HormannMKR20,
  author       = {Stefan H{\"{o}}rmann and
                  Abdul Moiz and
                  Martin Knoche and
                  Gerhard Rigoll},
  title        = {Attention Fusion for Audio-Visual Person Verification Using Multi-Scale
                  Features},
  booktitle    = {15th {IEEE} International Conference on Automatic Face and Gesture
                  Recognition, {FG} 2020, Buenos Aires, Argentina, November 16-20, 2020},
  pages        = {281--285},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/FG47880.2020.00074},
  doi          = {10.1109/FG47880.2020.00074},
  timestamp    = {Tue, 18 Jul 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/fgr/HormannMKR20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/LeeL20,
  author       = {Hansol Lee and
                  Gyemin Lee},
  title        = {Hierarchical Model For Long-Length Video Summarization With Adversarially
                  Enhanced Audio/Visual Features},
  booktitle    = {{IEEE} International Conference on Image Processing, {ICIP} 2020,
                  Abu Dhabi, United Arab Emirates, October 25-28, 2020},
  pages        = {723--727},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/ICIP40778.2020.9190636},
  doi          = {10.1109/ICIP40778.2020.9190636},
  timestamp    = {Tue, 07 May 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icip/LeeL20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/LiuCS20,
  author       = {Hong Liu and
                  Zhengyan Chen and
                  Wei Shi},
  title        = {Robust Audio-Visual {Mandarin} Speech Recognition Based On Adaptive
                  Decision Fusion And Tone Features},
  booktitle    = {{IEEE} International Conference on Image Processing, {ICIP} 2020,
                  Abu Dhabi, United Arab Emirates, October 25-28, 2020},
  pages        = {1381--1385},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/ICIP40778.2020.9190894},
  doi          = {10.1109/ICIP40778.2020.9190894},
  timestamp    = {Wed, 08 Dec 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/LiuCS20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpr/IshikawaHKS20,
  author       = {Reina Ishikawa and
                  Ryo Hachiuma and
                  Akiyoshi Kurobe and
                  Hideo Saito},
  title        = {Single-modal Incremental Terrain Clustering from Self-Supervised Audio-Visual
                  Feature Learning},
  booktitle    = {25th International Conference on Pattern Recognition, {ICPR} 2020,
                  Virtual Event / Milan, Italy, January 10-15, 2021},
  pages        = {9399--9406},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/ICPR48806.2021.9412638},
  doi          = {10.1109/ICPR48806.2021.9412638},
  timestamp    = {Fri, 07 May 2021 08:42:33 +0200},
  biburl       = {https://dblp.org/rec/conf/icpr/IshikawaHKS20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpr/LiuXY20,
  author       = {Hong Liu and
                  Wanlu Xu and
                  Bing Yang},
  title        = {Audio-Visual Speech Recognition Using {A} Two-Step Feature Fusion
                  Strategy},
  booktitle    = {25th International Conference on Pattern Recognition, {ICPR} 2020,
                  Virtual Event / Milan, Italy, January 10-15, 2021},
  pages        = {1896--1903},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/ICPR48806.2021.9412454},
  doi          = {10.1109/ICPR48806.2021.9412454},
  timestamp    = {Tue, 14 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icpr/LiuXY20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mapr/VoNDNDNNL020,
  author       = {Hung{-}Quoc Vo and
                  Dung{-}Minh Nguyen and
                  Tien Do and
                  Vinh{-}Tiep Nguyen and
                  Vu{-}Minh{-}Hieu Dang and
                  Nhat{-}Duy Nguyen and
                  Thanh Duc Ngo and
                  Duy{-}Dinh Le and
                  Shin'ichi Satoh},
  title        = {Searching For Desired Person Doing Desired Action based on Visual
                  and Audio Feature in Large Scale Video Database},
  booktitle    = {International Conference on Multimedia Analysis and Pattern Recognition,
                  {MAPR} 2020, Hanoi, Vietnam, October 8-9, 2020},
  pages        = {1--6},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/MAPR49794.2020.9237781},
  doi          = {10.1109/MAPR49794.2020.9237781},
  timestamp    = {Wed, 03 Nov 2021 16:47:39 +0100},
  biburl       = {https://dblp.org/rec/conf/mapr/VoNDNDNNL020.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mmm/GuoZYFH020,
  author       = {Xiaona Guo and
                  Wei Zhong and
                  Long Ye and
                  Li Fang and
                  Yan Heng and
                  Qin Zhang},
  editor       = {Yong Man Ro and
                  Wen{-}Huang Cheng and
                  Junmo Kim and
                  Wei{-}Ta Chu and
                  Peng Cui and
                  Jung{-}Woo Choi and
                  Min{-}Chun Hu and
                  Wesley De Neve},
  title        = {Global Affective Video Content Regression Based on Complementary Audio-Visual
                  Features},
  booktitle    = {MultiMedia Modeling - 26th International Conference, {MMM} 2020, Daejeon,
                  South Korea, January 5-8, 2020, Proceedings, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {11962},
  pages        = {540--550},
  publisher    = {Springer},
  year         = {2020},
  url          = {https://doi.org/10.1007/978-3-030-37734-2_44},
  doi          = {10.1007/978-3-030-37734-2_44},
  timestamp    = {Mon, 09 Nov 2020 15:46:42 +0100},
  biburl       = {https://dblp.org/rec/conf/mmm/GuoZYFH020.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/qomex/MartinezHF20,
  author       = {Helard Becerra Martinez and
                  Andrew Hines and
                  Myl{\`{e}}ne C. Q. Farias},
  title        = {How Deep is Your Encoder: An Analysis of Features Descriptors for
                  an Autoencoder-Based Audio-Visual Quality Metric},
  booktitle    = {Twelfth International Conference on Quality of Multimedia Experience,
                  {QoMEX} 2020, Athlone, Ireland, May 26-28, 2020},
  pages        = {1--6},
  publisher    = {{IEEE}},
  year         = {2020},
  url          = {https://doi.org/10.1109/QoMEX48832.2020.9123142},
  doi          = {10.1109/QOMEX48832.2020.9123142},
  timestamp    = {Thu, 14 Oct 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/qomex/MartinezHF20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/visapp/ChebbiJ20,
  author       = {Safa Chebbi and
                  Sofia Ben Jebara},
  editor       = {Giovanni Maria Farinella and
                  Petia Radeva and
                  Jos{\'{e}} Braz},
  title        = {An Audio-Visual based Feature Level Fusion Approach Applied to Deception
                  Detection},
  booktitle    = {Proceedings of the 15th International Joint Conference on Computer
                  Vision, Imaging and Computer Graphics Theory and Applications, {VISIGRAPP}
                  2020, Volume 4: {VISAPP}, Valletta, Malta, February 27-29, 2020},
  pages        = {197--205},
  publisher    = {{SCITEPRESS}},
  year         = {2020},
  url          = {https://doi.org/10.5220/0008896201970205},
  doi          = {10.5220/0008896201970205},
  timestamp    = {Thu, 16 Apr 2020 15:04:56 +0200},
  biburl       = {https://dblp.org/rec/conf/visapp/ChebbiJ20.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2003-11100,
  author       = {Helard Becerra Martinez and
                  Andrew Hines and
                  Myl{\`{e}}ne C. Q. Farias},
  title        = {How deep is your encoder: an analysis of features descriptors for
                  an autoencoder-based audio-visual quality metric},
  journal      = {CoRR},
  volume       = {abs/2003.11100},
  year         = {2020},
  url          = {https://arxiv.org/abs/2003.11100},
  eprinttype   = {arXiv},
  eprint       = {2003.11100},
  timestamp    = {Wed, 01 Apr 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2003-11100.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2004-12031,
  author       = {Zakaria Aldeneh and
                  Anushree Prasanna Kumar and
                  Barry{-}John Theobald and
                  Erik Marchi and
                  Sachin Kajarekar and
                  Devang Naik and
                  Ahmed Hussen Abdelaziz},
  title        = {Self-supervised Learning of Visual Speech Features with Audiovisual
                  Speech Enhancement},
  journal      = {CoRR},
  volume       = {abs/2004.12031},
  year         = {2020},
  url          = {https://arxiv.org/abs/2004.12031},
  eprinttype   = {arXiv},
  eprint       = {2004.12031},
  timestamp    = {Tue, 28 Apr 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2004-12031.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2005-13402,
  author       = {Pratik Mazumder and
                  Pravendra Singh and
                  Kranti Kumar Parida and
                  Vinay P. Namboodiri},
  title        = {{AVGZSLNet}: Audio-Visual Generalized Zero-Shot Learning by Reconstructing
                  Label Features from Multi-Modal Embeddings},
  journal      = {CoRR},
  volume       = {abs/2005.13402},
  year         = {2020},
  url          = {https://arxiv.org/abs/2005.13402},
  eprinttype   = {arXiv},
  eprint       = {2005.13402},
  timestamp    = {Thu, 28 May 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2005-13402.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2011-04359,
  author       = {Shrishti Saha Shetu and
                  Soumitro Chakrabarty and
                  Emanu{\"{e}}l Anco Peter Habets},
  title        = {An Empirical Study of Visual Features for {DNN} based Audio-Visual
                  Speech Enhancement in Multi-talker Environments},
  journal      = {CoRR},
  volume       = {abs/2011.04359},
  year         = {2020},
  url          = {https://arxiv.org/abs/2011.04359},
  eprinttype   = {arXiv},
  eprint       = {2011.04359},
  timestamp    = {Thu, 12 Nov 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2011-04359.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2012-10283,
  author     = {Hu, Feiyan and
                Mohedano, Eva and
                O'Connor, Noel E. and
                McGuinness, Kevin},
  title      = {Temporal Bilinear Encoding Network of Audio-Visual Features at Low Sampling Rates},
  journal    = {CoRR},
  volume     = {abs/2012.10283},
  year       = {2020},
  url        = {https://arxiv.org/abs/2012.10283},
  eprinttype = {arXiv},
  eprint     = {2012.10283},
  timestamp  = {Mon, 04 Jan 2021 00:00:00 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2012-10283.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/dsp/SaudiKA19,
  author    = {Saudi, Ali S. and
               Khalil, Mahmoud I. and
               Abbas, Hazem M.},
  title     = {Improved features and dynamic stream weight adaption for robust Audio-Visual Speech Recognition framework},
  journal   = {Digit. Signal Process.},
  volume    = {89},
  pages     = {17--29},
  year      = {2019},
  url       = {https://doi.org/10.1016/j.dsp.2019.02.016},
  doi       = {10.1016/J.DSP.2019.02.016},
  timestamp = {Mon, 26 Jun 2023 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/dsp/SaudiKA19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/ieicet/NaranchimegZA19,
  author    = {Naranchimeg, Bold and
               Zhang, Chao and
               Akashi, Takuya},
  title     = {Cross-Domain Deep Feature Combination for Bird Species Classification with Audio-Visual Data},
  journal   = {{IEICE} Trans. Inf. Syst.},
  volume    = {102-D},
  number    = {10},
  pages     = {2033--2042},
  year      = {2019},
  url       = {https://doi.org/10.1587/transinf.2018EDP7383},
  doi       = {10.1587/TRANSINF.2018EDP7383},
  timestamp = {Thu, 23 Jun 2022 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/ieicet/NaranchimegZA19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/iet-ipr/JavedIMMA19,
  author    = {Javed, Ali and
               Irtaza, Aun and
               Malik, Hafiz and
               Mahmood, Muhammad Tariq and
               Shah, Syed Muhammad Adnan},
  title     = {Multimodal framework based on audio-visual features for summarisation of cricket videos},
  journal   = {{IET} Image Process.},
  volume    = {13},
  number    = {4},
  pages     = {615--622},
  year      = {2019},
  url       = {https://doi.org/10.1049/iet-ipr.2018.5589},
  doi       = {10.1049/IET-IPR.2018.5589},
  timestamp = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/iet-ipr/JavedIMMA19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/mta/DhirajBG19,
  author    = {Dhiraj and
               Biswas, Rohit and
               Ghattamaraju, Nischay},
  title     = {An effective analysis of deep learning based approaches for audio based feature extraction and its visualization},
  journal   = {Multim. Tools Appl.},
  volume    = {78},
  number    = {17},
  pages     = {23949--23972},
  year      = {2019},
  url       = {https://doi.org/10.1007/s11042-018-6706-x},
  doi       = {10.1007/S11042-018-6706-X},
  timestamp = {Mon, 11 May 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/mta/DhirajBG19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/globalsip/HaiderPAL19,
  author    = {Haider, Fasih and
               Pollak, Senja and
               Albert, Pierre and
               Luz, Saturnino},
  title     = {Extracting Audio-Visual Features for Emotion Recognition Through Active Feature Selection},
  booktitle = {2019 {IEEE} Global Conference on Signal and Information Processing, GlobalSIP 2019, Ottawa, ON, Canada, November 11-14, 2019},
  pages     = {1--5},
  publisher = {{IEEE}},
  year      = {2019},
  url       = {https://doi.org/10.1109/GlobalSIP45357.2019.8969360},
  doi       = {10.1109/GLOBALSIP45357.2019.8969360},
  timestamp = {Wed, 07 Dec 2022 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/globalsip/HaiderPAL19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/icassp/HoriA0WHCMCLDEB19,
  author    = {Hori, Chiori and
               AlAmri, Huda and
               Wang, Jue and
               Wichern, Gordon and
               Hori, Takaaki and
               Cherian, Anoop and
               Marks, Tim K. and
               Cartillier, Vincent and
               Lopes, Raphael Gontijo and
               Das, Abhishek and
               Essa, Irfan and
               Batra, Dhruv and
               Parikh, Devi},
  title     = {End-to-end Audio Visual Scene-aware Dialog Using Multimodal Attention-based Video Features},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing, {ICASSP} 2019, Brighton, United Kingdom, May 12-17, 2019},
  pages     = {2352--2356},
  publisher = {{IEEE}},
  year      = {2019},
  url       = {https://doi.org/10.1109/ICASSP.2019.8682583},
  doi       = {10.1109/ICASSP.2019.8682583},
  timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
  biburl    = {https://dblp.org/rec/conf/icassp/HoriA0WHCMCLDEB19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/iccvw/LeeL19,
  author       = {Hansol Lee and
                  Gyemin Lee},
  title        = {Summarizing Long-Length Videos with {GAN}-Enhanced Audio/Visual Features},
  booktitle    = {2019 {IEEE/CVF} International Conference on Computer Vision Workshops,
                  {ICCV} Workshops 2019, Seoul, Korea (South), October 27-28, 2019},
  pages        = {3727--3731},
  publisher    = {{IEEE}},
  year         = {2019},
  url          = {https://doi.org/10.1109/ICCVW.2019.00462},
  doi          = {10.1109/ICCVW.2019.00462},
  timestamp    = {Tue, 07 May 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/iccvw/LeeL19.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iftc/GuoZYF019,
  author    = {Guo, Xiaona and
               Zhong, Wei and
               Ye, Long and
               Fang, Li and
               Zhang, Qin},
  editor    = {Zhai, Guangtao and
               Zhou, Jun and
               Yang, Hua and
               An, Ping and
               Yang, Xiaokang},
  title     = {Affective Video Content Analysis Based on Two Compact Audio-Visual Features},
  booktitle = {Digital {TV} and Wireless Multimedia Communication - 16th International Forum, {IFTC} 2019, Shanghai, China, September 19-20, 2019, Revised Selected Papers},
  series    = {Communications in Computer and Information Science},
  volume    = {1181},
  pages     = {355--364},
  publisher = {Springer},
  year      = {2019},
  url       = {https://doi.org/10.1007/978-981-15-3341-9\_29},
  doi       = {10.1007/978-981-15-3341-9\_29},
  timestamp = {Wed, 11 Mar 2020 14:23:07 +0100},
  biburl    = {https://dblp.org/rec/conf/iftc/GuoZYF019.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/nips/KumarOSHN19,
  author    = {Kumar, Shachi H. and
               Okur, Eda and
               Sahay, Saurav and
               Huang, Jonathan and
               Nachman, Lama},
  title     = {Leveraging Topics and Audio Features with Multimodal Attention for Audio Visual Scene-Aware Dialog},
  booktitle = {Visually Grounded Interaction and Language (ViGIL), NeurIPS 2019 Workshop, Vancouver, Canada, December 13, 2019},
  year      = {2019},
  url       = {https://vigilworkshop.github.io/static/papers/39.pdf},
  timestamp = {Thu, 12 Mar 2020 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/nips/KumarOSHN19.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
% Same authors and title as DBLP:conf/nips/KumarOSHN19 (also in this file);
% presumably the arXiv version of that workshop paper.
@article{DBLP:journals/corr/abs-1912-10131,
  author       = {Shachi H. Kumar and
                  Eda Okur and
                  Saurav Sahay and
                  Jonathan Huang and
                  Lama Nachman},
  title        = {Leveraging Topics and Audio Features with Multimodal Attention for
                  Audio Visual Scene-Aware Dialog},
  journal      = {CoRR},
  volume       = {abs/1912.10131},
  year         = {2019},
  url          = {https://arxiv.org/abs/1912.10131},
  eprinttype   = {arXiv},
  eprint       = {1912.10131},
  timestamp    = {Fri, 03 Jan 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1912-10131.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1912-10132,
  author       = {Shachi H. Kumar and
                  Eda Okur and
                  Saurav Sahay and
                  Jonathan Huang and
                  Lama Nachman},
  title        = {Exploring Context, Attention and Audio Features for Audio Visual Scene-Aware
                  Dialog},
  journal      = {CoRR},
  volume       = {abs/1912.10132},
  year         = {2019},
  url          = {https://arxiv.org/abs/1912.10132},
  eprinttype   = {arXiv},
  eprint       = {1912.10132},
  timestamp    = {Fri, 03 Jan 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1912-10132.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/dsp/RahmaniAS18,
  author    = {Rahmani, Mohammad Hasan and
               Almasganj, Farshad and
               Seyyedsalehi, Seyyed Ali},
  title     = {Audio-visual feature fusion via deep neural networks for automatic speech recognition},
  journal   = {Digit. Signal Process.},
  volume    = {82},
  pages     = {54--63},
  year      = {2018},
  url       = {https://doi.org/10.1016/j.dsp.2018.06.004},
  doi       = {10.1016/J.DSP.2018.06.004},
  timestamp = {Sat, 30 Sep 2023 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/dsp/RahmaniAS18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/tcsv/ZhangZHGT18,
  author    = {Zhang, Shiqing and
               Zhang, Shiliang and
               Huang, Tiejun and
               Gao, Wen and
               Tian, Qi},
  title     = {Learning Affective Features With a Hybrid Deep Model for Audio-Visual Emotion Recognition},
  journal   = {{IEEE} Trans. Circuits Syst. Video Technol.},
  volume    = {28},
  number    = {10},
  pages     = {3030--3043},
  year      = {2018},
  url       = {https://doi.org/10.1109/TCSVT.2017.2719043},
  doi       = {10.1109/TCSVT.2017.2719043},
  timestamp = {Thu, 02 Dec 2021 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/tcsv/ZhangZHGT18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/tmm/BeyanCBM18,
  author    = {Beyan, Cigdem and
               Capozzi, Francesca and
               Becchio, Cristina and
               Murino, Vittorio},
  title     = {Prediction of the Leadership Style of an Emergent Leader Using Audio and Visual Nonverbal Features},
  journal   = {{IEEE} Trans. Multim.},
  volume    = {20},
  number    = {2},
  pages     = {441--456},
  year      = {2018},
  url       = {https://doi.org/10.1109/TMM.2017.2740062},
  doi       = {10.1109/TMM.2017.2740062},
  timestamp = {Thu, 01 Oct 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/tmm/BeyanCBM18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/eccv/OwensE18,
  author    = {Owens, Andrew and
               Efros, Alexei A.},
  editor    = {Ferrari, Vittorio and
               Hebert, Martial and
               Sminchisescu, Cristian and
               Weiss, Yair},
  title     = {Audio-Visual Scene Analysis with Self-Supervised Multisensory Features},
  booktitle = {Computer Vision - {ECCV} 2018 - 15th European Conference, Munich, Germany, September 8-14, 2018, Proceedings, Part {VI}},
  series    = {Lecture Notes in Computer Science},
  volume    = {11210},
  pages     = {639--658},
  publisher = {Springer},
  year      = {2018},
  url       = {https://doi.org/10.1007/978-3-030-01231-1\_39},
  doi       = {10.1007/978-3-030-01231-1\_39},
  timestamp = {Sun, 02 Oct 2022 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/eccv/OwensE18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/gcce/HaruyamaTOH18,
  author    = {Haruyama, Tomoki and
               Takahashi, Sho and
               Ogawa, Takahiro and
               Haseyama, Miki},
  title     = {Estimation of Important Scenes in Soccer Videos Based on Collaborative Use of Audio-Visual {CNN} Features},
  booktitle = {{IEEE} 7th Global Conference on Consumer Electronics, {GCCE} 2018, Nara, Japan, October 9-12, 2018},
  pages     = {710--711},
  publisher = {{IEEE}},
  year      = {2018},
  url       = {https://doi.org/10.1109/GCCE.2018.8574727},
  doi       = {10.1109/GCCE.2018.8574727},
  timestamp = {Wed, 12 Jan 2022 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/gcce/HaruyamaTOH18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/huc/MatsudaFTAYM18,
  author       = {Yuki Matsuda and
                  Dmitrii Fedotov and
                  Yuta Takahashi and
                  Yutaka Arakawa and
                  Keiichi Yasumoto and
                  Wolfgang Minker},
  title        = {{EmoTour}: Multimodal Emotion Recognition using Physiological and Audio-Visual
                  Features},
  booktitle    = {Proceedings of the 2018 {ACM} International Joint Conference and 2018
                  International Symposium on Pervasive and Ubiquitous Computing and
                  Wearable Computers, UbiComp/ISWC 2018 Adjunct, Singapore, October
                  08-12, 2018},
  pages        = {946--951},
  publisher    = {{ACM}},
  year         = {2018},
  url          = {https://doi.org/10.1145/3267305.3267687},
  doi          = {10.1145/3267305.3267687},
  timestamp    = {Mon, 28 Aug 2023 21:17:48 +0200},
  biburl       = {https://dblp.org/rec/conf/huc/MatsudaFTAYM18.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/AkhtarBF18,
  author    = {Akhtar, Zahid and
               Bedoya, Stefany and
               Falk, Tiago H.},
  title     = {Improved Audio-Visual Laughter Detection Via Multi-Scale Multi-Resolution Image Texture Features and Classifier Fusion},
  booktitle = {2018 {IEEE} International Conference on Acoustics, Speech and Signal Processing, {ICASSP} 2018, Calgary, AB, Canada, April 15-20, 2018},
  pages     = {3106--3110},
  publisher = {{IEEE}},
  year      = {2018},
  url       = {https://doi.org/10.1109/ICASSP.2018.8461611},
  doi       = {10.1109/ICASSP.2018.8461611},
  timestamp = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/icassp/AkhtarBF18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/interspeech/SuLW18,
  author    = {Su, Rongfeng and
               Liu, Xunying and
               Wang, Lan},
  editor    = {Yegnanarayana, B.},
  title     = {Semi-supervised Cross-domain Visual Feature Learning for Audio-Visual Broadcast Speech Transcription},
  booktitle = {Interspeech 2018, 19th Annual Conference of the International Speech Communication Association, Hyderabad, India, 2-6 September 2018},
  pages     = {3509--3513},
  publisher = {{ISCA}},
  year      = {2018},
  url       = {https://doi.org/10.21437/Interspeech.2018-1063},
  doi       = {10.21437/INTERSPEECH.2018-1063},
  timestamp = {Fri, 21 May 2021 08:16:43 +0200},
  biburl    = {https://dblp.org/rec/conf/interspeech/SuLW18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/interspeech/TamuraHEHT18,
  author    = {Tamura, Satoshi and
               Horio, Kento and
               Endo, Hajime and
               Hayamizu, Satoru and
               Toda, Tomoki},
  editor    = {Yegnanarayana, B.},
  title     = {Audio-visual Voice Conversion Using Deep Canonical Correlation Analysis for Deep Bottleneck Features},
  booktitle = {Interspeech 2018, 19th Annual Conference of the International Speech Communication Association, Hyderabad, India, 2-6 September 2018},
  pages     = {2469--2473},
  publisher = {{ISCA}},
  year      = {2018},
  url       = {https://doi.org/10.21437/Interspeech.2018-2286},
  doi       = {10.21437/INTERSPEECH.2018-2286},
  timestamp = {Fri, 09 Apr 2021 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/interspeech/TamuraHEHT18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/prcv/HouWW18,
  author    = {Hou, Congcong and
               Wu, Xiaoyu and
               Wang, Ge},
  editor    = {Lai, Jian{-}Huang and
               Liu, Cheng{-}Lin and
               Chen, Xilin and
               Zhou, Jie and
               Tan, Tieniu and
               Zheng, Nanning and
               Zha, Hongbin},
  title     = {End-to-End Bloody Video Recognition by Audio-Visual Feature Fusion},
  booktitle = {Pattern Recognition and Computer Vision - First Chinese Conference, {PRCV} 2018, Guangzhou, China, November 23-26, 2018, Proceedings, Part {I}},
  series    = {Lecture Notes in Computer Science},
  volume    = {11256},
  pages     = {501--510},
  publisher = {Springer},
  year      = {2018},
  url       = {https://doi.org/10.1007/978-3-030-03398-9\_43},
  doi       = {10.1007/978-3-030-03398-9\_43},
  timestamp = {Fri, 03 Dec 2021 12:20:53 +0100},
  biburl    = {https://dblp.org/rec/conf/prcv/HouWW18.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
% Same authors and title as DBLP:conf/eccv/OwensE18 (also in this file);
% presumably the arXiv version of that paper.
@article{DBLP:journals/corr/abs-1804-03641,
  author       = {Andrew Owens and
                  Alexei A. Efros},
  title        = {Audio-Visual Scene Analysis with Self-Supervised Multisensory Features},
  journal      = {CoRR},
  volume       = {abs/1804.03641},
  year         = {2018},
  url          = {https://arxiv.org/abs/1804.03641},
  eprinttype   = {arXiv},
  eprint       = {1804.03641},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1804-03641.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1805-00625,
  author       = {Didan Deng and
                  Yuqian Zhou and
                  Jimin Pi and
                  Bertram E. Shi},
  title        = {Multimodal Utterance-level Affect Analysis using Visual, Audio and
                  Text Features},
  journal      = {CoRR},
  volume       = {abs/1805.00625},
  year         = {2018},
  url          = {https://arxiv.org/abs/1805.00625},
  eprinttype   = {arXiv},
  eprint       = {1805.00625},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1805-00625.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Same authors as DBLP:conf/icassp/HoriA0WHCMCLDEB19 (also in this file) and the
% same title up to capitalisation; presumably the arXiv version of that paper.
@article{DBLP:journals/corr/abs-1806-08409,
  author       = {Chiori Hori and
                  Huda AlAmri and
                  Jue Wang and
                  Gordon Wichern and
                  Takaaki Hori and
                  Anoop Cherian and
                  Tim K. Marks and
                  Vincent Cartillier and
                  Raphael Gontijo Lopes and
                  Abhishek Das and
                  Irfan Essa and
                  Dhruv Batra and
                  Devi Parikh},
  title        = {End-to-End Audio Visual Scene-Aware Dialog using Multimodal Attention-Based
                  Video Features},
  journal      = {CoRR},
  volume       = {abs/1806.08409},
  year         = {2018},
  url          = {https://arxiv.org/abs/1806.08409},
  eprinttype   = {arXiv},
  eprint       = {1806.08409},
  timestamp    = {Thu, 31 Jan 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1806-08409.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1807-00612,
  author       = {Mehmet Ali Arabaci and
                  Fatih {\"{O}}zkan and
                  Elif S{\"{u}}rer and
                  Peter Jancovic and
                  Alptekin Temizel},
  title        = {Multi-modal Egocentric Activity Recognition using Audio-Visual Features},
  journal      = {CoRR},
  volume       = {abs/1807.00612},
  year         = {2018},
  url          = {https://arxiv.org/abs/1807.00612},
  eprinttype   = {arXiv},
  eprint       = {1807.00612},
  timestamp    = {Mon, 17 May 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1807-00612.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Same authors as DBLP:journals/ieicet/NaranchimegZA19 (also in this file) and the
% same title up to capitalisation; presumably the arXiv version of that article.
@article{DBLP:journals/corr/abs-1811-10199,
  author       = {Bold Naranchimeg and
                  Chao Zhang and
                  Takuya Akashi},
  title        = {Cross-domain Deep Feature Combination for Bird Species Classification
                  with Audio-visual Data},
  journal      = {CoRR},
  volume       = {abs/1811.10199},
  year         = {2018},
  url          = {https://arxiv.org/abs/1811.10199},
  eprinttype   = {arXiv},
  eprint       = {1811.10199},
  timestamp    = {Tue, 14 Jan 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1811-10199.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1812-08407,
  author       = {Shachi H. Kumar and
                  Eda Okur and
                  Saurav Sahay and
                  Juan Jose Alvarado Leanos and
                  Jonathan Huang and
                  Lama Nachman},
  title        = {Context, Attention and Audio Feature Explorations for Audio Visual
                  Scene-Aware Dialog},
  journal      = {CoRR},
  volume       = {abs/1812.08407},
  year         = {2018},
  url          = {https://arxiv.org/abs/1812.08407},
  eprinttype   = {arXiv},
  eprint       = {1812.08407},
  timestamp    = {Wed, 02 Jan 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1812-08407.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/computation/PapakostasSGSSM17,
  author    = {Papakostas, Michalis and
               Spyrou, Evaggelos and
               Giannakopoulos, Theodoros and
               Siantikos, Giorgos and
               Sgouropoulos, Dimitrios and
               Mylonas, Phivos and
               Makedon, Fillia},
  title     = {Deep Visual Attributes vs. Hand-Crafted Audio Features on Multidomain Speech Emotion Recognition},
  journal   = {Comput.},
  volume    = {5},
  number    = {2},
  pages     = {26},
  year      = {2017},
  url       = {https://doi.org/10.3390/computation5020026},
  doi       = {10.3390/COMPUTATION5020026},
  timestamp = {Thu, 01 Oct 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/computation/PapakostasSGSSM17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/jrm/NakadaiK17,
  author    = {Nakadai, Kazuhiro and
               Koiwa, Tomoaki},
  title     = {Psychologically-Inspired Audio-Visual Speech Recognition Using Coarse Speech Recognition and Missing Feature Theory},
  journal   = {J. Robotics Mechatronics},
  volume    = {29},
  number    = {1},
  pages     = {105--113},
  year      = {2017},
  url       = {https://doi.org/10.20965/jrm.2017.p0105},
  doi       = {10.20965/JRM.2017.P0105},
  timestamp = {Wed, 01 Apr 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/journals/jrm/NakadaiK17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/mta/GharavianBS17,
  author    = {Gharavian, Davood and
               Bejani, Mahdi and
               Sheikhan, Mansour},
  title     = {Audio-visual emotion recognition using {FCBF} feature selection method and particle swarm optimization for fuzzy {ARTMAP} neural networks},
  journal   = {Multim. Tools Appl.},
  volume    = {76},
  number    = {2},
  pages     = {2331--2352},
  year      = {2017},
  url       = {https://doi.org/10.1007/s11042-015-3180-6},
  doi       = {10.1007/S11042-015-3180-6},
  timestamp = {Mon, 26 Oct 2020 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/mta/GharavianBS17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
% "Jr." is a name suffix, so the fourth author uses BibTeX's "Last, Jr., First"
% form; written as "First Last Jr." the surname would be parsed as "Jr.".
@article{DBLP:journals/prl/NanniCLSB17,
  author       = {Loris Nanni and
                  Yandre M. G. Costa and
                  Diego Rafael Lucio and
                  Silla, Jr., Carlos Nascimento and
                  Sheryl Brahnam},
  title        = {Combining visual and acoustic features for audio classification tasks},
  journal      = {Pattern Recognit. Lett.},
  volume       = {88},
  pages        = {49--56},
  year         = {2017},
  url          = {https://doi.org/10.1016/j.patrec.2017.01.013},
  doi          = {10.1016/J.PATREC.2017.01.013},
  timestamp    = {Sat, 22 Feb 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/prl/NanniCLSB17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/speech/SuiTB17,
  author    = {Sui, Chao and
               Togneri, Roberto and
               Bennamoun, Mohammed},
  title     = {A cascade gray-stereo visual feature extraction method for visual and audio-visual speech recognition},
  journal   = {Speech Commun.},
  volume    = {90},
  pages     = {26--38},
  year      = {2017},
  url       = {https://doi.org/10.1016/j.specom.2017.01.005},
  doi       = {10.1016/J.SPECOM.2017.01.005},
  timestamp = {Sat, 22 Feb 2020 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/speech/SuiTB17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/ccbr/LiuLFZD17,
  author    = {Liu, Yu{-}Hang and
               Liu, Xin and
               Fan, Wentao and
               Zhong, Bineng and
               Du, Ji{-}Xiang},
  editor    = {Zhou, Jie and
               Wang, Yunhong and
               Sun, Zhenan and
               Xu, Yong and
               Shen, Linlin and
               Feng, Jianjiang and
               Shan, Shiguang and
               Qiao, Yu and
               Guo, Zhenhua and
               Yu, Shiqi},
  title     = {Efficient Audio-Visual Speaker Recognition via Deep Heterogeneous Feature Fusion},
  booktitle = {Biometric Recognition - 12th Chinese Conference, {CCBR} 2017, Shenzhen, China, October 28-29, 2017, Proceedings},
  series    = {Lecture Notes in Computer Science},
  volume    = {10568},
  pages     = {575--583},
  publisher = {Springer},
  year      = {2017},
  url       = {https://doi.org/10.1007/978-3-319-69923-3\_62},
  doi       = {10.1007/978-3-319-69923-3\_62},
  timestamp = {Tue, 04 Oct 2022 18:09:04 +0200},
  biburl    = {https://dblp.org/rec/conf/ccbr/LiuLFZD17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/ciss/MendatWRNA17,
  author       = {Daniel R. Mendat and
                  James E. West and
                  Sudarshan Ramenahalli and
                  Ernst Niebur and
                  Andreas G. Andreou},
  title        = {Audio-Visual beamforming with the {Eigenmike} microphone array an omni-camera
                  and cognitive auditory features},
  booktitle    = {51st Annual Conference on Information Sciences and Systems, {CISS}
                  2017, Baltimore, MD, USA, March 22-24, 2017},
  pages        = {1--4},
  publisher    = {{IEEE}},
  year         = {2017},
  url          = {https://doi.org/10.1109/CISS.2017.7926180},
  doi          = {10.1109/CISS.2017.7926180},
  timestamp    = {Sun, 25 Jul 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/ciss/MendatWRNA17.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/vcip/BokshiTBH17,
  author    = {Bokshi, Mangona and
               Tao, Fei and
               Busso, Carlos and
               Hansen, John H. L.},
  title     = {Assessment and classification of singing quality based on audio-visual features},
  booktitle = {2017 {IEEE} Visual Communications and Image Processing, {VCIP} 2017, St. Petersburg, FL, USA, December 10-13, 2017},
  pages     = {1--4},
  publisher = {{IEEE}},
  year      = {2017},
  url       = {https://doi.org/10.1109/VCIP.2017.8305078},
  doi       = {10.1109/VCIP.2017.8305078},
  timestamp = {Thu, 14 Oct 2021 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/vcip/BokshiTBH17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/ijst/BiswasSC16,
  author    = {Biswas, Astik and
               Sahu, Prakash Kumar and
               Chandra, Mahesh},
  title     = {Multiple cameras audio visual speech recognition using active appearance model visual features in car environment},
  journal   = {Int. J. Speech Technol.},
  volume    = {19},
  number    = {1},
  pages     = {159--171},
  year      = {2016},
  url       = {https://doi.org/10.1007/s10772-016-9332-x},
  doi       = {10.1007/S10772-016-9332-X},
  timestamp = {Mon, 26 Oct 2020 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/ijst/BiswasSC16.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}
@inproceedings{DBLP:conf/eccv/SubramaniamPMBM16,
  author       = {Arulkumar Subramaniam and
                  Vismay Patel and
                  Ashish Mishra and
                  Prashanth Balasubramanian and
                  Anurag Mittal},
  editor       = {Gang Hua and
                  Herv{\'{e}} J{\'{e}}gou},
  title        = {Bi-modal First Impressions Recognition Using Temporally Ordered Deep
                  Audio and Stochastic Visual Features},
  booktitle    = {Computer Vision - {ECCV} 2016 Workshops - Amsterdam, The Netherlands,
                  October 8-10 and 15-16, 2016, Proceedings, Part {III}},
  series       = {Lecture Notes in Computer Science},
  volume       = {9915},
  pages        = {337--348},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-49409-8\_27},
  doi          = {10.1007/978-3-319-49409-8\_27},
  timestamp    = {Sun, 02 Jun 2019 21:17:49 +0200},
  biburl       = {https://dblp.org/rec/conf/eccv/SubramaniamPMBM16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icwsm/PereiraPPBD16,
  author       = {Pereira, Mois{\'{e}}s Henrique Ramos and
                  P{\'{a}}dua, Fl{\'{a}}vio Luis Cardeal and
                  Pereira, Adriano C{\'{e}}sar Machado and
                  Benevenuto, Fabr{\'{\i}}cio and
                  Dalip, Daniel Hasan},
  title        = {Fusing Audio, Textual, and Visual Features for Sentiment Analysis
                  of News Videos},
  booktitle    = {Proceedings of the Tenth International Conference on Web and Social
                  Media, Cologne, Germany, May 17-20, 2016},
  pages        = {659--662},
  publisher    = {{AAAI} Press},
  year         = {2016},
  url          = {http://www.aaai.org/ocs/index.php/ICWSM/ICWSM16/paper/view/13144},
  timestamp    = {Fri, 05 Feb 2021 11:07:46 +0100},
  biburl       = {https://dblp.org/rec/conf/icwsm/PereiraPPBD16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/Heckmann16,
  author       = {Heckmann, Martin},
  editor       = {Morgan, Nelson},
  title        = {Feature-Level Decision Fusion for Audio-Visual Word Prominence Detection},
  booktitle    = {Interspeech 2016, 17th Annual Conference of the International Speech
                  Communication Association, San Francisco, CA, USA, September 8-12,
                  2016},
  pages        = {575--579},
  publisher    = {{ISCA}},
  year         = {2016},
  url          = {https://doi.org/10.21437/Interspeech.2016-163},
  doi          = {10.21437/INTERSPEECH.2016-163},
  timestamp    = {Mon, 26 Jun 2023 16:43:56 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/Heckmann16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/TakashimaATAMON16,
  author       = {Takashima, Yuki and
                  Aihara, Ryo and
                  Takiguchi, Tetsuya and
                  Ariki, Yasuo and
                  Mitani, Nobuyuki and
                  Omori, Kiyohiro and
                  Nakazono, Kaoru},
  editor       = {Morgan, Nelson},
  title        = {Audio-Visual Speech Recognition Using Bimodal-Trained Bottleneck Features
                  for a Person with Severe Hearing Loss},
  booktitle    = {Interspeech 2016, 17th Annual Conference of the International Speech
                  Communication Association, San Francisco, CA, USA, September 8-12,
                  2016},
  pages        = {277--281},
  publisher    = {{ISCA}},
  year         = {2016},
  url          = {https://doi.org/10.21437/Interspeech.2016-721},
  doi          = {10.21437/INTERSPEECH.2016-721},
  timestamp    = {Fri, 29 Jan 2021 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/interspeech/TakashimaATAMON16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mldm/MuhammadD16,
  author       = {Muhammad, Atta and
                  Daudpota, Sher Muhammad},
  editor       = {Perner, Petra},
  title        = {Content Based Identification of Talk Show Videos Using Audio Visual
                  Features},
  booktitle    = {Machine Learning and Data Mining in Pattern Recognition - 12th International
                  Conference, {MLDM} 2016, New York, NY, USA, July 16-21, 2016, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {9729},
  pages        = {267--283},
  publisher    = {Springer},
  year         = {2016},
  url          = {https://doi.org/10.1007/978-3-319-41920-6\_20},
  doi          = {10.1007/978-3-319-41920-6\_20},
  timestamp    = {Wed, 07 Dec 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/mldm/MuhammadD16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sccc/LucioMC16,
  author       = {Lucio, Diego Rafael and
                  Yandre Maldonado e Gomes da Costa},
  editor       = {Cubillos, Claudio and
                  Astudillo, Hern{\'{a}}n},
  title        = {Bird species classification using visual and acoustic features extracted
                  from audio signal},
  booktitle    = {35th International Conference of the Chilean Computer Science Society,
                  {SCCC} 2016, Valpara{\'{\i}}so, Chile, October 10-14, 2016},
  pages        = {1--12},
  publisher    = {{IEEE}},
  year         = {2016},
  url          = {https://doi.org/10.1109/SCCC.2016.7836063},
  doi          = {10.1109/SCCC.2016.7836063},
  timestamp    = {Wed, 16 Oct 2019 14:14:56 +0200},
  biburl       = {https://dblp.org/rec/conf/sccc/LucioMC16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sigcse/DokeP16,
  author       = {Doke, Abhay and
                  Pedanekar, Niranjan},
  editor       = {Alphonce, Carl and
                  Tims, Jodi L. and
                  Caspersen, Michael E. and
                  Edwards, Stephen H.},
  title        = {Lights, Camera, but No Action: Exploring Affective Audio-Visual Features
                  of Educational Videos (Abstract Only)},
  booktitle    = {Proceedings of the 47th {ACM} Technical Symposium on Computing Science
                  Education, {SIGCSE} 2016, Memphis, TN, USA, March 02 - 05, 2016},
  pages        = {686},
  publisher    = {{ACM}},
  year         = {2016},
  url          = {https://doi.org/10.1145/2839509.2850535},
  doi          = {10.1145/2839509.2850535},
  timestamp    = {Mon, 13 Dec 2021 09:32:31 +0100},
  biburl       = {https://dblp.org/rec/conf/sigcse/DokeP16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/tsp/PalecekC16,
  author       = {Palecek, Karel and
                  Chaloupka, Josef},
  title        = {Depth-based features in audio-visual speech recognition},
  booktitle    = {39th International Conference on Telecommunications and Signal Processing,
                  {TSP} 2016, Vienna, Austria, June 27-29, 2016},
  pages        = {303--306},
  publisher    = {{IEEE}},
  year         = {2016},
  url          = {https://doi.org/10.1109/TSP.2016.7760884},
  doi          = {10.1109/TSP.2016.7760884},
  timestamp    = {Mon, 05 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/tsp/PalecekC16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/PereiraPPBD16,
  author       = {Pereira, Mois{\'{e}}s H. R. and
                  P{\'{a}}dua, Fl{\'{a}}vio L. C. and
                  Pereira, Adriano C. M. and
                  Benevenuto, Fabr{\'{\i}}cio and
                  Dalip, Daniel Hasan},
  title        = {Fusing Audio, Textual and Visual Features for Sentiment Analysis of
                  News Videos},
  journal      = {CoRR},
  volume       = {abs/1604.02612},
  year         = {2016},
  url          = {http://arxiv.org/abs/1604.02612},
  eprinttype   = {arXiv},
  eprint       = {1604.02612},
  timestamp    = {Mon, 12 Aug 2019 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/PereiraPPBD16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/SubramaniamPMBM16,
  author       = {Subramaniam, Arulkumar and
                  Patel, Vismay and
                  Mishra, Ashish and
                  Balasubramanian, Prashanth and
                  Mittal, Anurag},
  title        = {Bi-modal First Impressions Recognition using Temporally Ordered Deep
                  Audio and Stochastic Visual Features},
  journal      = {CoRR},
  volume       = {abs/1610.10048},
  year         = {2016},
  url          = {http://arxiv.org/abs/1610.10048},
  eprinttype   = {arXiv},
  eprint       = {1610.10048},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/SubramaniamPMBM16.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the proper noun "Zernike" is
% brace-protected so sentence-casing styles do not lowercase it.
@article{DBLP:journals/ijst/BordeVMY15,
  author       = {Prashant L. Borde and
                  Amarsinh Varpe and
                  Ramesh R. Manza and
                  Pravin L. Yannawar},
  title        = {Recognition of isolated words using {Zernike} and {MFCC} features for
                  audio visual speech recognition},
  journal      = {Int. J. Speech Technol.},
  volume       = {18},
  number       = {2},
  pages        = {167--175},
  year         = {2015},
  url          = {https://doi.org/10.1007/s10772-014-9257-1},
  doi          = {10.1007/S10772-014-9257-1},
  timestamp    = {Mon, 08 Jun 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ijst/BordeVMY15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the acronym "FMRI" is brace-protected
% so sentence-casing styles do not lowercase it.
@article{DBLP:journals/taffco/HanJH0L15,
  author       = {Junwei Han and
                  Xiang Ji and
                  Xintao Hu and
                  Lei Guo and
                  Tianming Liu},
  title        = {Arousal Recognition Using Audio-Visual Features and {FMRI}-Based Brain
                  Response},
  journal      = {{IEEE} Trans. Affect. Comput.},
  volume       = {6},
  number       = {4},
  pages        = {337--347},
  year         = {2015},
  url          = {https://doi.org/10.1109/TAFFC.2015.2411280},
  doi          = {10.1109/TAFFC.2015.2411280},
  timestamp    = {Mon, 19 Feb 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/taffco/HanJH0L15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/apsipa/TamuraNKOITH15,
  author       = {Tamura, Satoshi and
                  Ninomiya, Hiroshi and
                  Kitaoka, Norihide and
                  Osuga, Shin and
                  Iribe, Yurie and
                  Takeda, Kazuya and
                  Hayamizu, Satoru},
  title        = {Audio-visual speech recognition using deep bottleneck features and
                  high-performance lipreading},
  booktitle    = {Asia-Pacific Signal and Information Processing Association Annual
                  Summit and Conference, {APSIPA} 2015, Hong Kong, December 16-19, 2015},
  pages        = {575--582},
  publisher    = {{IEEE}},
  year         = {2015},
  url          = {https://doi.org/10.1109/APSIPA.2015.7415335},
  doi          = {10.1109/APSIPA.2015.7415335},
  timestamp    = {Wed, 16 Oct 2019 14:14:55 +0200},
  biburl       = {https://dblp.org/rec/conf/apsipa/TamuraNKOITH15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/caip/CastroMG15,
  author       = {Castro, Francisco M. and
                  Mar{\'{\i}}n{-}Jim{\'{e}}nez, Manuel J. and
                  Guil, Nicol{\'{a}}s},
  editor       = {Azzopardi, George and
                  Petkov, Nicolai},
  title        = {Empirical Study of Audio-Visual Features Fusion for Gait Recognition},
  booktitle    = {Computer Analysis of Images and Patterns - 16th International Conference,
                  {CAIP} 2015, Valletta, Malta, September 2-4, 2015 Proceedings, Part
                  {I}},
  series       = {Lecture Notes in Computer Science},
  volume       = {9256},
  pages        = {727--739},
  publisher    = {Springer},
  year         = {2015},
  url          = {https://doi.org/10.1007/978-3-319-23192-1\_61},
  doi          = {10.1007/978-3-319-23192-1\_61},
  timestamp    = {Tue, 14 May 2019 10:00:53 +0200},
  biburl       = {https://dblp.org/rec/conf/caip/CastroMG15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cores/ForczmanskiM15,
  author       = {Forczmanski, Pawel and
                  Maka, Tomasz},
  editor       = {Burduk, Robert and
                  Jackowski, Konrad and
                  Kurzynski, Marek and
                  Wozniak, Michal and
                  Zolnierek, Andrzej},
  title        = {Investigating Combinations of Visual Audio Features and Distance Metrics
                  in the Problem of Audio Classification},
  booktitle    = {Proceedings of the 9th International Conference on Computer Recognition
                  Systems {CORES} 2015, Wroclaw, Poland, 25-27 May 2015},
  series       = {Advances in Intelligent Systems and Computing},
  volume       = {403},
  pages        = {733--744},
  publisher    = {Springer},
  year         = {2015},
  url          = {https://doi.org/10.1007/978-3-319-26227-7\_69},
  doi          = {10.1007/978-3-319-26227-7\_69},
  timestamp    = {Sat, 09 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/cores/ForczmanskiM15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: publisher added so this LNCS record
% carries the same fields as the other LNCS proceedings entries in this file.
@inproceedings{DBLP:conf/ecir/SchindlerR15,
  author       = {Alexander Schindler and
                  Andreas Rauber},
  editor       = {Allan Hanbury and
                  Gabriella Kazai and
                  Andreas Rauber and
                  Norbert Fuhr},
  title        = {An Audio-Visual Approach to Music Genre Classification through Affective
                  Color Features},
  booktitle    = {Advances in Information Retrieval - 37th European Conference on {IR}
                  Research, {ECIR} 2015, Vienna, Austria, March 29 - April 2, 2015.
                  Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {9022},
  pages        = {61--67},
  publisher    = {Springer},
  year         = {2015},
  url          = {https://doi.org/10.1007/978-3-319-16354-3\_8},
  doi          = {10.1007/978-3-319-16354-3\_8},
  timestamp    = {Tue, 14 May 2019 10:00:37 +0200},
  biburl       = {https://dblp.org/rec/conf/ecir/SchindlerR15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icdsp/TahirCMDDMT15,
  author       = {Tahir, Yasir and
                  Chakraborty, Debsubhra and
                  Maszczyk, Tomasz and
                  Dauwels, Shoko and
                  Dauwels, Justin and
                  Magnenat{-}Thalmann, Nadia and
                  Thalmann, Daniel},
  title        = {Real-time sociometrics from audio-visual features for two-person dialogs},
  booktitle    = {2015 {IEEE} International Conference on Digital Signal Processing,
                  {DSP} 2015, Singapore, July 21-24, 2015},
  pages        = {823--827},
  publisher    = {{IEEE}},
  year         = {2015},
  url          = {https://doi.org/10.1109/ICDSP.2015.7251991},
  doi          = {10.1109/ICDSP.2015.7251991},
  timestamp    = {Sun, 02 Oct 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icdsp/TahirCMDDMT15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icinco/SidorovSIM15,
  author       = {Sidorov, Maxim and
                  Sopov, Evgenii and
                  Ivanov, Ilia and
                  Minker, Wolfgang},
  editor       = {Filipe, Joaquim and
                  Madani, Kurosh and
                  Gusikhin, Oleg Yu. and
                  Sasiadek, Jurek Z.},
  title        = {Feature and Decision Level Audio-visual Data Fusion in Emotion Recognition
                  Problem},
  booktitle    = {{ICINCO} 2015 - Proceedings of the 12th International Conference on
                  Informatics in Control, Automation and Robotics, Volume 2, Colmar,
                  Alsace, France, 21-23 July, 2015},
  pages        = {246--251},
  publisher    = {SciTePress},
  year         = {2015},
  url          = {https://doi.org/10.5220/0005527002460251},
  doi          = {10.5220/0005527002460251},
  timestamp    = {Mon, 09 Aug 2021 17:01:45 +0200},
  biburl       = {https://dblp.org/rec/conf/icinco/SidorovSIM15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: first author written in "Last, First"
% form so the surname is parsed as the compound "Le Cornu" rather than "Cornu".
% NOTE(review): compound surname assumed from common citation practice for
% this author -- confirm against the paper before relying on it.
@inproceedings{DBLP:conf/interspeech/CornuM15,
  author       = {Le Cornu, Thomas and
                  Ben Milner},
  title        = {Reconstructing intelligible audio speech from visual speech features},
  booktitle    = {{INTERSPEECH} 2015, 16th Annual Conference of the International Speech
                  Communication Association, Dresden, Germany, September 6-10, 2015},
  pages        = {3355--3359},
  publisher    = {{ISCA}},
  year         = {2015},
  url          = {https://doi.org/10.21437/Interspeech.2015-139},
  doi          = {10.21437/INTERSPEECH.2015-139},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/CornuM15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/NinomiyaKTIT15,
  author       = {Ninomiya, Hiroshi and
                  Kitaoka, Norihide and
                  Tamura, Satoshi and
                  Iribe, Yurie and
                  Takeda, Kazuya},
  title        = {Integration of deep bottleneck features for audio-visual speech recognition},
  booktitle    = {{INTERSPEECH} 2015, 16th Annual Conference of the International Speech
                  Communication Association, Dresden, Germany, September 6-10, 2015},
  pages        = {563--567},
  publisher    = {{ISCA}},
  year         = {2015},
  url          = {https://doi.org/10.21437/Interspeech.2015-204},
  doi          = {10.21437/INTERSPEECH.2015-204},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/NinomiyaKTIT15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the benchmark name "MediaEval" is
% brace-protected in the title so sentence-casing styles keep its capitals.
@inproceedings{DBLP:conf/mediaeval/NishiIS15,
  author       = {Fumito Nishi and
                  Nakamasa Inoue and
                  Koichi Shinoda},
  editor       = {Martha A. Larson and
                  Bogdan Ionescu and
                  Mats Sj{\"{o}}berg and
                  Xavier Anguera and
                  Johann Poignant and
                  Michael Riegler and
                  Maria Eskevich and
                  Claudia Hauff and
                  Richard F. E. Sutcliffe and
                  Gareth J. F. Jones and
                  Yi{-}Hsuan Yang and
                  Mohammad Soleymani and
                  Symeon Papadopoulos},
  title        = {Combining Audio Features and Visual I-Vector @ {MediaEval} 2015 Multimodal
                  Person Discovery in Broadcast {TV}},
  booktitle    = {Working Notes Proceedings of the MediaEval 2015 Workshop, Wurzen,
                  Germany, September 14-15, 2015},
  series       = {{CEUR} Workshop Proceedings},
  volume       = {1436},
  publisher    = {CEUR-WS.org},
  year         = {2015},
  url          = {https://ceur-ws.org/Vol-1436/Paper39.pdf},
  timestamp    = {Fri, 10 Mar 2023 16:22:12 +0100},
  biburl       = {https://dblp.org/rec/conf/mediaeval/NishiIS15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the dataset name "VidTIMIT" is
% brace-protected; sentence-casing styles would otherwise emit "Vidtimit".
@inproceedings{DBLP:conf/retis/BiswasSBC15,
  author       = {Astik Biswas and
                  Prakash Kumar Sahu and
                  Anirban Bhowmick and
                  Mahesh Chandra},
  title        = {{VidTIMIT} audio visual phoneme recognition using {AAM} visual features
                  and human auditory motivated acoustic wavelet features},
  booktitle    = {2nd {IEEE} International Conference on Recent Trends in Information
                  Systems, ReTIS 2015, Kolkata, India, July 9-11, 2015},
  pages        = {428--433},
  publisher    = {{IEEE}},
  year         = {2015},
  url          = {https://doi.org/10.1109/ReTIS.2015.7232917},
  doi          = {10.1109/RETIS.2015.7232917},
  timestamp    = {Sun, 25 Oct 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/retis/BiswasSBC15.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the proper noun "Markov" is
% brace-protected so sentence-casing styles do not lowercase it.
@article{DBLP:journals/acisc/IslamS14,
  author       = {Md. Rabiul Islam and
                  Md. Abdus Sobhan},
  title        = {Feature Fusion Based Audio-Visual Speaker Identification Using Hidden
                  {Markov} Model under Different Lighting Variations},
  journal      = {Appl. Comput. Intell. Soft Comput.},
  volume       = {2014},
  pages        = {831830:1--831830:7},
  year         = {2014},
  url          = {https://doi.org/10.1155/2014/831830},
  doi          = {10.1155/2014/831830},
  timestamp    = {Wed, 22 Jul 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/acisc/IslamS14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/fgcs/LiuYXT14,
  author       = {Liu, Yizhi and
                  Yang, Ying and
                  Xie, Hongtao and
                  Tang, Sheng},
  title        = {Fusing audio vocabulary with visual features for pornographic video
                  detection},
  journal      = {Future Gener. Comput. Syst.},
  volume       = {31},
  pages        = {69--76},
  year         = {2014},
  url          = {https://doi.org/10.1016/j.future.2012.08.012},
  doi          = {10.1016/J.FUTURE.2012.08.012},
  timestamp    = {Wed, 19 Feb 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/fgcs/LiuYXT14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/isci/Rinaldi14,
  author       = {Rinaldi, Antonio M.},
  title        = {A multimedia ontology model based on linguistic properties and audio-visual
                  features},
  journal      = {Inf. Sci.},
  volume       = {277},
  pages        = {234--246},
  year         = {2014},
  url          = {https://doi.org/10.1016/j.ins.2014.02.017},
  doi          = {10.1016/J.INS.2014.02.017},
  timestamp    = {Tue, 06 Jun 2017 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/isci/Rinaldi14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cse/LiXLZB14,
  author       = {Li, Chen and
                  Xie, Yuxiang and
                  Luan, Xidao and
                  Zhang, Kaichao and
                  Bai, Liang},
  editor       = {Liu, Xingang and
                  Didier El Baz and
                  Hsu, Ching{-}Hsien and
                  Kang, Kai and
                  Chen, Weifeng},
  title        = {Automatic Movie Summarization Based on the Visual-Audio Features},
  booktitle    = {17th {IEEE} International Conference on Computational Science and
                  Engineering, {CSE} 2014, Chengdu, China, December 19-21, 2014},
  pages        = {1758--1761},
  publisher    = {{IEEE} Computer Society},
  year         = {2014},
  url          = {https://doi.org/10.1109/CSE.2014.322},
  doi          = {10.1109/CSE.2014.322},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/cse/LiXLZB14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/euvip/MilaniCTTA14,
  author       = {Milani, Simone and
                  Cuccovillo, Luca and
                  Tagliasacchi, Marco and
                  Tubaro, Stefano and
                  Aichroth, Patrick},
  title        = {Video camera identification using audio-visual features},
  booktitle    = {5th European Workshop on Visual Information Processing, {EUVIP} 2014,
                  Villetaneuse, Paris, France, December 10-12, 2014},
  pages        = {1--6},
  publisher    = {{IEEE}},
  year         = {2014},
  url          = {https://doi.org/10.1109/EUVIP.2014.7018382},
  doi          = {10.1109/EUVIP.2014.7018382},
  timestamp    = {Sun, 02 Oct 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/euvip/MilaniCTTA14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/SawadaTTH14,
  author       = {Sawada, Kohei and
                  Takehara, Masanori and
                  Tamura, Satoshi and
                  Hayamizu, Satoru},
  title        = {Audio-visual voice conversion using noise-robust features},
  booktitle    = {{IEEE} International Conference on Acoustics, Speech and Signal Processing,
                  {ICASSP} 2014, Florence, Italy, May 4-9, 2014},
  pages        = {7899--7903},
  publisher    = {{IEEE}},
  year         = {2014},
  url          = {https://doi.org/10.1109/ICASSP.2014.6855138},
  doi          = {10.1109/ICASSP.2014.6855138},
  timestamp    = {Wed, 16 Oct 2019 14:14:52 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/SawadaTTH14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpram/KacheleGZMS14,
  author       = {K{\"{a}}chele, Markus and
                  Glodek, Michael and
                  Zharkov, Dimitri and
                  Meudt, Sascha and
                  Schwenker, Friedhelm},
  editor       = {Maria De Marsico and
                  Tabbone, Antoine and
                  Fred, Ana L. N.},
  title        = {Fusion of Audio-visual Features using Hierarchical Classifier Systems
                  for the Recognition of Affective States and the State of Depression},
  booktitle    = {{ICPRAM} 2014 - Proceedings of the 3rd International Conference on
                  Pattern Recognition Applications and Methods, ESEO, Angers, Loire
                  Valley, France, 6-8 March, 2014},
  pages        = {671--678},
  publisher    = {SciTePress},
  year         = {2014},
  url          = {https://doi.org/10.5220/0004828606710678},
  doi          = {10.5220/0004828606710678},
  timestamp    = {Tue, 23 May 2017 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icpram/KacheleGZMS14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/intcompsymp/ZengC14,
  author       = {Zeng, Yi{-}Chong and
                  Chang, Wen{-}Tsung},
  editor       = {Chu, William Cheng{-}Chung and
                  Chao, Han{-}Chieh and
                  Yang, Stephen Jenn{-}Hwa},
  title        = {Fast Seriation of Multiple Homogeneous-content Videos Using Audio-visual
                  Features},
  booktitle    = {Intelligent Systems and Applications - Proceedings of the International
                  Computer Symposium {(ICS)} held at Taichung, Taiwan, December 12-14,
                  2014},
  series       = {Frontiers in Artificial Intelligence and Applications},
  volume       = {274},
  pages        = {1157--1166},
  publisher    = {{IOS} Press},
  year         = {2014},
  url          = {https://doi.org/10.3233/978-1-61499-484-8-1157},
  doi          = {10.3233/978-1-61499-484-8-1157},
  timestamp    = {Sun, 21 May 2017 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/intcompsymp/ZengC14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviations from the raw dblp export:
%  - "Youtube" is brace-protected so sentence-casing styles keep the capital.
%  - The first author is written in "Last, First" form so the compound surname
%    is kept together instead of being parsed as surname "Martinez" only.
%    NOTE(review): the compound surname is inferred from the citation key of
%    this record -- confirm against the paper.
@inproceedings{DBLP:conf/interspeech/Fernandez-Martinez14,
  author       = {Fern{\'{a}}ndez Mart{\'{\i}}nez, Fernando and
                  Alejandro Hern{\'{a}}ndez{-}Garc{\'{\i}}a and
                  Ascensi{\'{o}}n Gallardo{-}Antol{\'{\i}}n and
                  Fernando D{\'{\i}}az{-}de{-}Mar{\'{\i}}a},
  title        = {Combining audio-visual features for viewers' perception classification
                  of {Youtube} car commercials},
  booktitle    = {2nd International Workshop on Speech, Language and Audio in Multimedia,
                  {SLAM} 2014, Penang, Malaysia, September 11-12, 2014},
  pages        = {14--18},
  publisher    = {{ISCA}},
  year         = {2014},
  url          = {http://www.isca-speech.org/archive/slam\_2014/slm4\_014.html},
  timestamp    = {Tue, 11 Jul 2023 11:45:03 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/Fernandez-Martinez14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/isccsp/IbrahimM14,
  author       = {Ibrahim, Mohd Zamri and
                  Mulvaney, David J.},
  title        = {A lip geometry approach for feature-fusion based audio-visual speech
                  recognition},
  booktitle    = {6th International Symposium on Communications, Control and Signal
                  Processing, {ISCCSP} 2014, Athens, Greece, May 21-23, 2014},
  pages        = {644--647},
  publisher    = {{IEEE}},
  year         = {2014},
  url          = {https://doi.org/10.1109/ISCCSP.2014.6877957},
  doi          = {10.1109/ISCCSP.2014.6877957},
  timestamp    = {Sun, 17 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/isccsp/IbrahimM14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the raw dblp export: the acronyms "INAOE-BUAP" and "AVEC"
% are brace-protected in the title so sentence-casing styles keep them
% upper-case.
@inproceedings{DBLP:conf/mm/EspinosaEPMAR14,
  author       = {Humberto P{\'{e}}rez Espinosa and
                  Hugo Jair Escalante and
                  Luis Villase{\~{n}}or Pineda and
                  Manuel Montes{-}y{-}G{\'{o}}mez and
                  David Pinto Avenda{\~{n}}o and
                  Ver{\'{o}}nica Reyes{-}Meza},
  editor       = {Michel F. Valstar and
                  Bj{\"{o}}rn W. Schuller and
                  Jarek Krajewski and
                  Roddy Cowie and
                  Maja Pantic},
  title        = {Fusing Affective Dimensions and Audio-Visual Features from Segmented
                  Video for Depression Recognition: {INAOE-BUAP}'s Participation at {AVEC}'14
                  Challenge},
  booktitle    = {Proceedings of the 4th International Workshop on Audio/Visual Emotion
                  Challenge, {AVEC} '14, Orlando, Florida, USA, November 7, 2014},
  pages        = {49--55},
  publisher    = {{ACM}},
  year         = {2014},
  url          = {https://doi.org/10.1145/2661806.2661815},
  doi          = {10.1145/2661806.2661815},
  timestamp    = {Sat, 09 Apr 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/mm/EspinosaEPMAR14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nime/Tsiros14,
  author       = {Tsiros, Augoustinos},
  editor       = {Caramiaux, Baptiste and
                  Tahiroglu, Koray and
                  Fiebrink, Rebecca and
                  Tanaka, Atau},
  title        = {Evaluating the Perceived Similarity Between Audio-Visual Features
                  Using Corpus-Based Concatenative Synthesis},
  booktitle    = {14th International Conference on New Interfaces for Musical Expression,
                  {NIME} 2014, London, United Kingdom, June 30 - July 4, 2014},
  pages        = {421--426},
  publisher    = {nime.org},
  year         = {2014},
  url          = {https://doi.org/10.5281/zenodo.1178965},
  doi          = {10.5281/ZENODO.1178965},
  timestamp    = {Tue, 04 Apr 2023 16:52:05 +0200},
  biburl       = {https://dblp.org/rec/conf/nime/Tsiros14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/noms/RainerT14,
  author       = {Rainer, Benjamin and
                  Timmerer, Christian},
  title        = {A subjective evaluation using crowdsourcing of Adaptive Media Playout
                  utilizing audio-visual content features},
  booktitle    = {2014 {IEEE} Network Operations and Management Symposium, {NOMS} 2014,
                  Krakow, Poland, May 5-9, 2014},
  pages        = {1--7},
  publisher    = {{IEEE}},
  year         = {2014},
  url          = {https://doi.org/10.1109/NOMS.2014.6838406},
  doi          = {10.1109/NOMS.2014.6838406},
  timestamp    = {Wed, 16 Oct 2019 14:14:54 +0200},
  biburl       = {https://dblp.org/rec/conf/noms/RainerT14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviations from the raw dblp export: the proper noun "Zernike" is
% brace-protected so sentence-casing styles do not lowercase it, and the
% eprinttype field is re-aligned with the other fields.
@article{DBLP:journals/corr/BordeaVMY14,
  author       = {Prashant L. Borde and
                  Amarsinh Varpe and
                  Ramesh R. Manza and
                  Pravin L. Yannawar},
  title        = {Recognition of Isolated Words using {Zernike} and {MFCC} features for
                  Audio Visual Speech Recognition},
  journal      = {CoRR},
  volume       = {abs/1407.1165},
  year         = {2014},
  url          = {http://arxiv.org/abs/1407.1165},
  eprinttype   = {arXiv},
  eprint       = {1407.1165},
  timestamp    = {Mon, 13 Aug 2018 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/BordeaVMY14.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avsp/BarbulescuHBR13,
  author = {Barbulescu, Adela and Hueber, Thomas and Bailly, G{\'{e}}rard and Ronfard, R{\'{e}}mi},
  editor = {Ouni, Slim and Berthommier, Fr{\'{e}}d{\'{e}}ric and Jesse, Alexandra},
  title = {Audio-visual speaker conversion using prosody features},
  booktitle = {Auditory-Visual Speech Processing, {AVSP} 2013, Annecy, France, August 29 - September 1, 2013},
  pages = {11--16},
  publisher = {{ISCA}},
  year = {2013},
  url = {http://www.isca-speech.org/archive/avsp13/av13\_011.html},
  timestamp = {Tue, 16 Nov 2021 11:36:14 +0100},
  biburl = {https://dblp.org/rec/conf/avsp/BarbulescuHBR13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avsp/ShenTH13,
  author = {Shen, Peng and Tamura, Satoshi and Hayamizu, Satoru},
  editor = {Ouni, Slim and Berthommier, Fr{\'{e}}d{\'{e}}ric and Jesse, Alexandra},
  title = {Audio-visual interaction in sparse representation features for noise robust audio-visual speech recognition},
  booktitle = {Auditory-Visual Speech Processing, {AVSP} 2013, Annecy, France, August 29 - September 1, 2013},
  pages = {43--48},
  publisher = {{ISCA}},
  year = {2013},
  url = {http://www.isca-speech.org/archive/avsp13/av13\_043.html},
  timestamp = {Tue, 16 Nov 2021 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/avsp/ShenTH13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cvpr/WangZH13,
  author = {Wang, Tao and Zhu, Zhigang and Hammoud, Riad I.},
  title = {Audio-Visual Feature Fusion for Vehicles Classification in a Surveillance System},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} Workshops 2013, Portland, OR, USA, June 23-28, 2013},
  pages = {381--386},
  publisher = {{IEEE} Computer Society},
  year = {2013},
  url = {https://doi.org/10.1109/CVPRW.2013.64},
  doi = {10.1109/CVPRW.2013.64},
  timestamp = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/cvpr/WangZH13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/dev/FirminoT13,
  author = {Firmino, Emiliano and Te{\'{o}}filo, Mauro},
  editor = {Tucker, William D. and Bagula, Antoine B. and Martonosi, Margaret and Raman, Bhaskaran},
  title = {Visually impaired navigation assistant for emerging market using tactile floor, feature phone and audio descriptions},
  booktitle = {Annual Symposium on Computing for Development, {ACM} DEV-4, Cape Town, South Africa - December 06 - 07, 2013},
  pages = {20:1--20:2},
  publisher = {{ACM}},
  year = {2013},
  url = {https://doi.org/10.1145/2537052.2537072},
  doi = {10.1145/2537052.2537072},
  timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/dev/FirminoT13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/SayedelahlAK13,
  author = {Sayedelahl, Aya and Araujo, Rodrigo and Kamel, Mohamed S.},
  title = {Audio-visual feature-decision level fusion for spontaneous emotion estimation in speech conversations},
  booktitle = {2013 {IEEE} International Conference on Multimedia and Expo Workshops, San Jose, CA, USA, July 15-19, 2013},
  pages = {1--6},
  publisher = {{IEEE} Computer Society},
  year = {2013},
  url = {https://doi.org/10.1109/ICMEW.2013.6618372},
  doi = {10.1109/ICMEW.2013.6618372},
  timestamp = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/icmcs/SayedelahlAK13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/KhanM13,
  author = {Khan, Faheem and Milner, Ben},
  editor = {Bimbot, Fr{\'{e}}d{\'{e}}ric and Cerisara, Christophe and Fougeron, C{\'{e}}cile and
            Gravier, Guillaume and Lamel, Lori and Pellegrino, Fran{\c{c}}ois and Perrier, Pascal},
  title = {Speaker separation using visual speech features and single-channel audio},
  booktitle = {{INTERSPEECH} 2013, 14th Annual Conference of the International Speech Communication Association, Lyon, France, August 25-29, 2013},
  pages = {3264--3268},
  publisher = {{ISCA}},
  year = {2013},
  url = {https://doi.org/10.21437/Interspeech.2013-723},
  doi = {10.21437/INTERSPEECH.2013-723},
  timestamp = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/interspeech/KhanM13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Title: McGurk is a proper noun; its capital is restored and brace-protected against sentence casing.
@inproceedings{DBLP:conf/interspeech/TiippanaTVV13,
  author       = {Kaisa Tiippana and
                  Mikko Tiainen and
                  Lari Vainio and
                  Martti Vainio},
  editor       = {Fr{\'{e}}d{\'{e}}ric Bimbot and
                  Christophe Cerisara and
                  C{\'{e}}cile Fougeron and
                  Guillaume Gravier and
                  Lori Lamel and
                  Fran{\c{c}}ois Pellegrino and
                  Pascal Perrier},
  title        = {Acoustic and visual phonetic features in the {McGurk} effect - an audiovisual
                  speech illusion},
  booktitle    = {{INTERSPEECH} 2013, 14th Annual Conference of the International Speech
                  Communication Association, Lyon, France, August 25-29, 2013},
  pages        = {1634--1638},
  publisher    = {{ISCA}},
  year         = {2013},
  url          = {https://doi.org/10.21437/Interspeech.2013-424},
  doi          = {10.21437/INTERSPEECH.2013-424},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/TiippanaTVV13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iscas/ZhangZX13,
  author = {Zhang, Bo and Zou, Jiancheng and Xu, Bo},
  title = {Context-dependent audio-visual and temporal features fusion for {TV} commercial detection},
  booktitle = {2013 {IEEE} International Symposium on Circuits and Systems (ISCAS2013), Beijing, China, May 19-23, 2013},
  pages = {5--8},
  publisher = {{IEEE}},
  year = {2013},
  url = {https://doi.org/10.1109/ISCAS.2013.6571768},
  doi = {10.1109/ISCAS.2013.6571768},
  timestamp = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl = {https://dblp.org/rec/conf/iscas/ZhangZX13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Authors: the compound surnames De Neve and Van de Walle are written in comma form
% (the latter braced) so BibTeX does not split them into first-name / von / last parts.
@inproceedings{DBLP:conf/mir/GodinNW13,
  author       = {Fr{\'{e}}deric Godin and
                  De Neve, Wesley and
                  {Van de Walle}, Rik},
  editor       = {Ramesh C. Jain and
                  Balakrishnan Prabhakaran and
                  Marcel Worring and
                  John R. Smith and
                  Tat{-}Seng Chua},
  title        = {Towards fusion of collective knowledge and audio-visual content features
                  for annotating broadcast video},
  booktitle    = {International Conference on Multimedia Retrieval, ICMR'13, Dallas,
                  TX, USA, April 16-19, 2013},
  pages        = {329--332},
  publisher    = {{ACM}},
  year         = {2013},
  url          = {https://doi.org/10.1145/2461466.2461530},
  doi          = {10.1145/2461466.2461530},
  timestamp    = {Mon, 22 Apr 2024 21:24:20 +0200},
  biburl       = {https://dblp.org/rec/conf/mir/GodinNW13.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mm/RudovicPP13,
  author = {Rudovic, Ognjen and Petridis, Stavros and Pantic, Maja},
  editor = {Jaimes, Alejandro and Sebe, Nicu and Boujemaa, Nozha and Gatica{-}Perez, Daniel and
            Shamma, David A. and Worring, Marcel and Zimmermann, Roger},
  title = {Bimodal log-linear regression for fusion of audio and visual features},
  booktitle = {{ACM} Multimedia Conference, {MM} '13, Barcelona, Spain, October 21-25, 2013},
  pages = {789--792},
  publisher = {{ACM}},
  year = {2013},
  url = {https://doi.org/10.1145/2502081.2502207},
  doi = {10.1145/2502081.2502207},
  timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/mm/RudovicPP13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mmm/ZhangFX13,
  author = {Zhang, Bo and Feng, Bailan and Xu, Bo},
  editor = {Li, Shipeng and El{-}Saddik, Abdulmotaleb and Wang, Meng and Mei, Tao and
            Sebe, Nicu and Yan, Shuicheng and Hong, Richang and Gurrin, Cathal},
  title = {Fusion of Audio-Visual Features and Statistical Property for Commercial Segmentation},
  booktitle = {Advances in Multimedia Modeling, 19th International Conference, {MMM} 2013, Huangshan, China, January 7-9, 2013, Proceedings, Part {I}},
  series = {Lecture Notes in Computer Science},
  volume = {7732},
  pages = {250--260},
  publisher = {Springer},
  year = {2013},
  url = {https://doi.org/10.1007/978-3-642-35725-1\_23},
  doi = {10.1007/978-3-642-35725-1\_23},
  timestamp = {Mon, 23 Nov 2020 15:58:17 +0100},
  biburl = {https://dblp.org/rec/conf/mmm/ZhangFX13.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Journal corrected: volume 20 (2012) with a 10.1109/TASL DOI belongs to the
% IEEE Transactions on Audio, Speech, and Language Processing, not its pre-2006 predecessor title.
@article{DBLP:journals/taslp/NewmanC12,
  author       = {Jacob L. Newman and
                  Stephen J. Cox},
  title        = {Language Identification Using Visual Features},
  journal      = {{IEEE} Trans. Audio Speech Lang. Process.},
  volume       = {20},
  number       = {7},
  pages        = {1936--1947},
  year         = {2012},
  url          = {https://doi.org/10.1109/TASL.2012.2191956},
  doi          = {10.1109/TASL.2012.2191956},
  timestamp    = {Sun, 17 May 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/taslp/NewmanC12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Booktitle: conference acronym corrected from ACSCC to ACSSC, matching the DOI and the record key.
@inproceedings{DBLP:conf/acssc/SuiTHB12,
  author       = {Chao Sui and
                  Roberto Togneri and
                  Serajul Haque and
                  Mohammed Bennamoun},
  editor       = {Michael B. Matthews},
  title        = {Discrimination comparison between audio and visual features},
  booktitle    = {Conference Record of the Forty Sixth Asilomar Conference on Signals,
                  Systems and Computers, {ACSSC} 2012, Pacific Grove, CA, USA, November
                  4-7, 2012},
  pages        = {1609--1612},
  publisher    = {{IEEE}},
  year         = {2012},
  url          = {https://doi.org/10.1109/ACSSC.2012.6489302},
  doi          = {10.1109/ACSSC.2012.6489302},
  timestamp    = {Sat, 19 Oct 2019 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/acssc/SuiTHB12.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/apsipa/ShenTH12,
  author = {Shen, Peng and Tamura, Satoshi and Hayamizu, Satoru},
  title = {Feature reconstruction using sparse imputation for noise robust audio-visual speech recognition},
  booktitle = {Asia-Pacific Signal and Information Processing Association Annual Summit and Conference, {APSIPA} 2012, Hollywood, CA, USA, December 3-6, 2012},
  pages = {1--4},
  publisher = {{IEEE}},
  year = {2012},
  url = {https://ieeexplore.ieee.org/document/6411773/},
  timestamp = {Sun, 08 Aug 2021 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/apsipa/ShenTH12.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iconip/AlmaadeedAA12,
  author = {Almaadeed, Noor and Aggoun, Amar and Amira, Abbes},
  editor = {Huang, Tingwen and Zeng, Zhigang and Li, Chuandong and Leung, Chi{-}Sing},
  title = {Audio-Visual Feature Fusion for Speaker Identification},
  booktitle = {Neural Information Processing - 19th International Conference, {ICONIP} 2012, Doha, Qatar, November 12-15, 2012, Proceedings, Part {I}},
  series = {Lecture Notes in Computer Science},
  volume = {7663},
  pages = {56--67},
  publisher = {Springer},
  year = {2012},
  url = {https://doi.org/10.1007/978-3-642-34475-6\_8},
  doi = {10.1007/978-3-642-34475-6\_8},
  timestamp = {Tue, 30 Jun 2020 11:04:50 +0200},
  biburl = {https://dblp.org/rec/conf/iconip/AlmaadeedAA12.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/slt/KashiwagiSMH12,
  author = {Kashiwagi, Yosuke and Suzuki, Masayuki and Minematsu, Nobuaki and Hirose, Keikichi},
  title = {Audio-visual feature integration based on piecewise linear transformation for noise robust automatic speech recognition},
  booktitle = {2012 {IEEE} Spoken Language Technology Workshop (SLT), Miami, FL, USA, December 2-5, 2012},
  pages = {149--152},
  publisher = {{IEEE}},
  year = {2012},
  url = {https://doi.org/10.1109/SLT.2012.6424213},
  doi = {10.1109/SLT.2012.6424213},
  timestamp = {Wed, 16 Oct 2019 14:14:53 +0200},
  biburl = {https://dblp.org/rec/conf/slt/KashiwagiSMH12.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Title: the mixed-case term iTV is brace-protected so sentence-casing styles do not rewrite it as itv.
@inproceedings{DBLP:conf/centeris/OliveiraAA11,
  author       = {Rita Oliveira and
                  Jorge Ferraz de Abreu and
                  Ana Margarida Pisco Almeida},
  editor       = {Maria Manuela Cruz{-}Cunha and
                  Jo{\~{a}}o Varaj{\~{a}}o and
                  Philip Powell and
                  Ricardo Martinho},
  title        = {An {iTV} Audio Description Service: Suggesting Requirements and Features
                  for Visually Impaired Users},
  booktitle    = {ENTERprise Information Systems - International Conference, {CENTERIS}
                  2011, Vilamoura, Portugal, October 5-7, 2011, Proceedings, Part {III}},
  series       = {Communications in Computer and Information Science},
  volume       = {221},
  pages        = {59--68},
  publisher    = {Springer},
  year         = {2011},
  url          = {https://doi.org/10.1007/978-3-642-24352-3\_7},
  doi          = {10.1007/978-3-642-24352-3\_7},
  timestamp    = {Tue, 23 Apr 2024 23:18:35 +0200},
  biburl       = {https://dblp.org/rec/conf/centeris/OliveiraAA11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/cikm/IslamABR11,
  author = {Islam, Muhammad Asiful and Ahmed, Faisal and Borodin, Yevgen and Ramakrishnan, I. V.},
  editor = {Macdonald, Craig and Ounis, Iadh and Ruthven, Ian},
  title = {Tightly coupling visual and linguistic features for enriching audio-based web browsing experience},
  booktitle = {Proceedings of the 20th {ACM} Conference on Information and Knowledge Management, {CIKM} 2011, Glasgow, United Kingdom, October 24-28, 2011},
  pages = {2085--2088},
  publisher = {{ACM}},
  year = {2011},
  url = {https://doi.org/10.1145/2063576.2063896},
  doi = {10.1145/2063576.2063896},
  timestamp = {Tue, 06 Nov 2018 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/cikm/IslamABR11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eusipco/LiuNWJC11,
  author = {Liu, Qingju and Naqvi, Syed Mohsen and Wang, Wenwu and Jackson, Philip J. B. and Chambers, Jonathon A.},
  title = {Robust feature selection for scaling ambiguity reduction in audio-visual convolutive {BSS}},
  booktitle = {Proceedings of the 19th European Signal Processing Conference, {EUSIPCO} 2011, Barcelona, Spain, August 29 - Sept. 2, 2011},
  pages = {1060--1064},
  publisher = {{IEEE}},
  year = {2011},
  url = {https://ieeexplore.ieee.org/document/7074127/},
  timestamp = {Thu, 05 Jan 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/eusipco/LiuNWJC11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/ChandrasekharSR11,
  author = {Chandrasekhar, Vijay and Sargin, Mehmet Emre and Ross, David A.},
  title = {Automatic Language Identification in music videos with low level audio and visual features},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2011, May 22-27, 2011, Prague Congress Center, Prague, Czech Republic},
  pages = {5724--5727},
  publisher = {{IEEE}},
  year = {2011},
  url = {https://doi.org/10.1109/ICASSP.2011.5947660},
  doi = {10.1109/ICASSP.2011.5947660},
  timestamp = {Tue, 23 Jul 2019 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/icassp/ChandrasekharSR11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/maics/SchlittenhartWSI11,
  author = {Schlittenhart, Isaac and Winters, Jason and Springer, Kyle and Inoue, Atsushi},
  editor = {Visa, Sofia and Inoue, Atsushi and Ralescu, Anca L.},
  title = {Toward Robust Features for Remote Audio-Visual Classroom},
  booktitle = {Proceedings of The 22nd Midwest Artificial Intelligence and Cognitive Science Conference 2011, Cincinnati, Ohio, USA, April 16-17, 2011},
  series = {{CEUR} Workshop Proceedings},
  volume = {710},
  pages = {202--207},
  publisher = {CEUR-WS.org},
  year = {2011},
  url = {https://ceur-ws.org/Vol-710/paper38.pdf},
  timestamp = {Fri, 10 Mar 2023 16:22:19 +0100},
  biburl = {https://dblp.org/rec/conf/maics/SchlittenhartWSI11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Title: the mixed-case name MediaEval is brace-protected so sentence-casing styles do not rewrite it as Mediaeval.
@inproceedings{DBLP:conf/mediaeval/AcarSA11,
  author       = {Esra Acar and
                  Stephan Spiegel and
                  Sahin Albayrak},
  editor       = {Martha A. Larson and
                  Adam Rae and
                  Claire{-}H{\'{e}}l{\`{e}}ne Demarty and
                  Christoph Kofler and
                  Florian Metze and
                  Rapha{\"{e}}l Troncy and
                  Vasileios Mezaris and
                  Gareth J. F. Jones},
  title        = {{MediaEval} 2011 Affect Task: Violent Scene Detection combining audio
                  and visual Features with {SVM}},
  booktitle    = {Working Notes Proceedings of the MediaEval 2011 Workshop, Santa Croce
                  in Fossabanda, Pisa, Italy, September 1-2, 2011},
  series       = {{CEUR} Workshop Proceedings},
  volume       = {807},
  publisher    = {CEUR-WS.org},
  year         = {2011},
  url          = {https://ceur-ws.org/Vol-807/acar\_TUB\_Violence\_me11wn.pdf},
  timestamp    = {Fri, 10 Mar 2023 16:22:12 +0100},
  biburl       = {https://dblp.org/rec/conf/mediaeval/AcarSA11.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/psivt/KomaiAT11,
  author = {Komai, Yuto and Ariki, Yasuo and Takiguchi, Tetsuya},
  editor = {Ho, Yo{-}Sung},
  title = {Audio-Visual Speech Recognition Based on {AAM} Parameter and Phoneme Analysis of Visual Feature},
  booktitle = {Advances in Image and Video Technology - 5th Pacific Rim Symposium, {PSIVT} 2011, Gwangju, South Korea, November 20-23, 2011, Proceedings, Part {I}},
  series = {Lecture Notes in Computer Science},
  volume = {7087},
  pages = {97--108},
  publisher = {Springer},
  year = {2011},
  url = {https://doi.org/10.1007/978-3-642-25367-6\_9},
  doi = {10.1007/978-3-642-25367-6\_9},
  timestamp = {Tue, 14 May 2019 10:00:47 +0200},
  biburl = {https://dblp.org/rec/conf/psivt/KomaiAT11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/trecvid/PereraOLKBLMLMH11,
  author = {Perera, A. G. Amitha and Oh, Sangmin and Leotta, Matthew J. and Kim, Ilseo and
            Byun, Byungki and Lee, Chin{-}Hui and McCloskey, Scott and Liu, Jingchen and
            Miller, Ben and Huang, Zhi Feng and Vahdat, Arash and Yang, Weilong and
            Mori, Greg and Tang, Kevin and Koller, Daphne and Fei{-}Fei, Li and
            Li, Kang and Chen, Gang and Corso, Jason J. and Fu, Yun and
            Srihari, Rohini K.},
  editor = {Over, Paul and Awad, George and Fiscus, Jonathan G. and Antonishek, Brian and
            Michel, Martial and Smeaton, Alan F. and Kraaij, Wessel and Qu{\'{e}}not, Georges},
  title = {{GENIE} {TRECVID} 2011 Multimedia Event Detection: Late-Fusion Approaches to Combine Multiple Audio-Visual features},
  booktitle = {2011 {TREC} Video Retrieval Evaluation, {TRECVID} 2011, Gaithersburg, MD, USA, December 5-7, 2011},
  publisher = {National Institute of Standards and Technology {(NIST)}},
  year = {2011},
  url = {https://www-nlpir.nist.gov/projects/tvpubs/tv11.papers/genie.pdf},
  timestamp = {Tue, 10 Jan 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/trecvid/PereraOLKBLMLMH11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/trustcom/LiuWZT11,
  author = {Liu, Yizhi and Wang, Xiangdong and Zhang, Yongdong and Tang, Sheng},
  title = {Fusing Audio-Words with Visual Features for Pornographic Video Detection},
  booktitle = {{IEEE} 10th International Conference on Trust, Security and Privacy in Computing and Communications, TrustCom 2011, Changsha, China, 16-18 November, 2011},
  pages = {1488--1493},
  publisher = {{IEEE} Computer Society},
  year = {2011},
  url = {https://doi.org/10.1109/TrustCom.2011.205},
  doi = {10.1109/TRUSTCOM.2011.205},
  timestamp = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/trustcom/LiuWZT11.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/csl/DeanS10,
  author = {Dean, David and Sridharan, Sridha},
  title = {Dynamic visual features for audio-visual speaker verification},
  journal = {Comput. Speech Lang.},
  volume = {24},
  number = {2},
  pages = {136--149},
  year = {2010},
  url = {https://doi.org/10.1016/j.csl.2009.03.007},
  doi = {10.1016/J.CSL.2009.03.007},
  timestamp = {Thu, 20 Feb 2020 00:00:00 +0100},
  biburl = {https://dblp.org/rec/journals/csl/DeanS10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ijsc/ShahHN10,
  author = {Shah, Dhaval and Han, Kyu Jeong and Narayanan, Shrikanth S.},
  title = {Robust Multimodal Person Recognition Using Low-Complexity Audio-Visual Feature Fusion Approaches},
  journal = {Int. J. Semantic Comput.},
  volume = {4},
  number = {2},
  pages = {155--179},
  year = {2010},
  url = {https://doi.org/10.1142/S1793351X10000985},
  doi = {10.1142/S1793351X10000985},
  timestamp = {Fri, 03 Jul 2020 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/ijsc/ShahHN10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tmm/ShresthaBWS10,
  author = {Shrestha, Prarthana and Barbieri, Mauro and Weda, Hans and Sekulovski, Dragan},
  title = {Synchronization of Multiple Camera Videos Using Audio-Visual Features},
  journal = {{IEEE} Trans. Multim.},
  volume = {12},
  number = {1},
  pages = {79--92},
  year = {2010},
  url = {https://doi.org/10.1109/TMM.2009.2036285},
  doi = {10.1109/TMM.2009.2036285},
  timestamp = {Thu, 01 Oct 2020 01:00:00 +0200},
  biburl = {https://dblp.org/rec/journals/tmm/ShresthaBWS10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/vlsisp/MuneesawangGA10,
  author = {Muneesawang, Paisarn and Guan, Ling and Amin, Tahir},
  title = {A New Learning Algorithm for the Fusion of Adaptive Audio-Visual Features for the Retrieval and Classification of Movie Clips},
  journal = {J. Signal Process. Syst.},
  volume = {59},
  number = {2},
  pages = {177--188},
  year = {2010},
  url = {https://doi.org/10.1007/s11265-008-0290-7},
  doi = {10.1007/S11265-008-0290-7},
  timestamp = {Thu, 12 Mar 2020 00:00:00 +0100},
  biburl = {https://dblp.org/rec/journals/vlsisp/MuneesawangGA10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/PetridisAP10,
  author = {Petridis, Stavros and Asghar, Ali and Pantic, Maja},
  title = {Classifying laughter and speech using audio-visual feature prediction},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2010, 14-19 March 2010, Sheraton Dallas Hotel, Dallas, Texas, {USA}},
  pages = {5254--5257},
  publisher = {{IEEE}},
  year = {2010},
  url = {https://doi.org/10.1109/ICASSP.2010.5494992},
  doi = {10.1109/ICASSP.2010.5494992},
  timestamp = {Fri, 19 May 2017 01:00:00 +0200},
  biburl = {https://dblp.org/rec/conf/icassp/PetridisAP10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/BoyarAASCA10,
  author = {Boyar, Mujdat and Alan, {\"{O}}zg{\"{u}}r and Akpinar, Samet and Sabuncu, Orkunt and
            {\c{C}}i{\c{c}}ekli, Nihan K. and Alpaslan, Ferda Nur},
  title = {Event boundary detection using audio-visual features and web-casting texts with imprecise time information},
  booktitle = {Proceedings of the 2010 {IEEE} International Conference on Multimedia and Expo, {ICME} 2010, 19-23 July 2010, Singapore},
  pages = {578--583},
  publisher = {{IEEE} Computer Society},
  year = {2010},
  url = {https://doi.org/10.1109/ICME.2010.5583864},
  doi = {10.1109/ICME.2010.5583864},
  timestamp = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl = {https://dblp.org/rec/conf/icmcs/BoyarAASCA10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Title: the acronym GMM is brace-protected so sentence-casing styles do not lowercase it.
@inproceedings{DBLP:conf/interspeech/ButkoN10,
  author       = {Taras Butko and
                  Climent Nadeu},
  editor       = {Takao Kobayashi and
                  Keikichi Hirose and
                  Satoshi Nakamura},
  title        = {A fast one-pass-training feature selection technique for {GMM}-based
                  acoustic event detection with audio-visual data},
  booktitle    = {{INTERSPEECH} 2010, 11th Annual Conference of the International Speech
                  Communication Association, Makuhari, Chiba, Japan, September 26-30,
                  2010},
  pages        = {2338--2341},
  publisher    = {{ISCA}},
  year         = {2010},
  url          = {https://doi.org/10.21437/Interspeech.2010-640},
  doi          = {10.21437/INTERSPEECH.2010-640},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/ButkoN10.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iscslp/JiangWWSV10,
  author = {Jiang, Dongmei and Wu, Peng and Wang, Fengna and Sahli, Hichem and Verhelst, Werner},
  title = {Audio visual speech recognition based on multi-stream {DBN} models with Articulatory Features},
  booktitle = {7th International Symposium on Chinese Spoken Language Processing, {ISCSLP} 2010, November 29 2010-December 3, 2010, Tainan {\&} Sun Moon Lake, Taiwan},
  pages = {190--193},
  publisher = {{IEEE}},
  year = {2010},
  url = {https://doi.org/10.1109/ISCSLP.2010.5684915},
  doi = {10.1109/ISCSLP.2010.5684915},
  timestamp = {Wed, 16 Oct 2019 14:14:48 +0200},
  biburl = {https://dblp.org/rec/conf/iscslp/JiangWWSV10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/sac/RoyM10,
  author = {Roy, Anindya and Marcel, S{\'{e}}bastien},
  editor = {Shin, Sung Y. and Ossowski, Sascha and Schumacher, Michael and Palakal, Mathew J. and Hung, Chih{-}Cheng},
  title = {Visual processing-inspired fern-audio features for noise-robust speaker verification},
  booktitle = {Proceedings of the 2010 {ACM} Symposium on Applied Computing (SAC), Sierre, Switzerland, March 22-26, 2010},
  pages = {1491--1495},
  publisher = {{ACM}},
  year = {2010},
  url = {https://doi.org/10.1145/1774088.1774407},
  doi = {10.1145/1774088.1774407},
  timestamp = {Sun, 02 Jun 2019 21:18:37 +0200},
  biburl = {https://dblp.org/rec/conf/sac/RoyM10.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@phdthesis{DBLP:phd/ch/Gurban09,
  author       = {Mihai Gurban},
  title        = {Multimodal feature extraction and fusion for audio-visual speech recognition},
  school       = {EPFL, Switzerland},
  year         = {2009},
  url          = {https://doi.org/10.5075/epfl-thesis-4292},
  doi          = {10.5075/EPFL-THESIS-4292},
  timestamp    = {Fri, 29 Jul 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/phd/ch/Gurban09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tsp/GurbanT09,
  author       = {Mihai Gurban and
                  Jean{-}Philippe Thiran},
  title        = {Information theoretic feature extraction for audio-visual speech recognition},
  journal      = {{IEEE} Trans. Signal Process.},
  volume       = {57},
  number       = {12},
  pages        = {4765--4776},
  year         = {2009},
  url          = {https://doi.org/10.1109/TSP.2009.2026513},
  doi          = {10.1109/TSP.2009.2026513},
  timestamp    = {Tue, 10 Mar 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/tsp/GurbanT09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icig/JiangLRSV09,
  author       = {Dongmei Jiang and
                  Peizhen Liu and
                  Ilse Ravyse and
                  Hichem Sahli and
                  Werner Verhelst},
  title        = {Video Realistic Mouth Animation Based on an Audio Visual {DBN} Model
                  with Articulatory Features and Constrained Asynchrony},
  booktitle    = {Proceedings of the Fifth International Conference on Image and Graphics,
                  {ICIG} 2009, Xi'an, Shaanxi, China, 20-23 September 2009},
  pages        = {658--662},
  publisher    = {{IEEE} Computer Society},
  year         = {2009},
  url          = {https://doi.org/10.1109/ICIG.2009.51},
  doi          = {10.1109/ICIG.2009.51},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icig/JiangLRSV09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icig/ZhuYL09,
  author       = {Songhao Zhu and
                  Junchi Yan and
                  Yuncai Liu},
  title        = {Improving Semantic Scene Categorization by Exploiting Audio-Visual
                  Features},
  booktitle    = {Proceedings of the Fifth International Conference on Image and Graphics,
                  {ICIG} 2009, Xi'an, Shaanxi, China, 20-23 September 2009},
  pages        = {435--440},
  publisher    = {{IEEE} Computer Society},
  year         = {2009},
  url          = {https://doi.org/10.1109/ICIG.2009.17},
  doi          = {10.1109/ICIG.2009.17},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icig/ZhuYL09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/WuZX09,
  author       = {Guanyong Wu and
                  Jie Zhu and
                  Haihua Xu},
  title        = {A hybrid visual feature extraction method for audio-visual speech
                  recognition},
  booktitle    = {Proceedings of the International Conference on Image Processing, {ICIP}
                  2009, 7-10 November 2009, Cairo, Egypt},
  pages        = {1829--1832},
  publisher    = {{IEEE}},
  year         = {2009},
  url          = {https://doi.org/10.1109/ICIP.2009.5413573},
  doi          = {10.1109/ICIP.2009.5413573},
  timestamp    = {Thu, 19 Dec 2019 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/WuZX09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/AlmajaiM09,
  author       = {Ibrahim Almajai and
                  Ben Milner},
  title        = {Enhancing audio speech using visual speech features},
  booktitle    = {{INTERSPEECH} 2009, 10th Annual Conference of the International Speech
                  Communication Association, Brighton, United Kingdom, September 6-10,
                  2009},
  pages        = {1959--1962},
  publisher    = {{ISCA}},
  year         = {2009},
  url          = {https://doi.org/10.21437/Interspeech.2009-576},
  doi          = {10.21437/INTERSPEECH.2009-576},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/AlmajaiM09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mmsp/DasB09,
  author       = {Amitava Das and
                  Vaibhav Bedia},
  title        = {Audio-visual person authentication with multiple face-profiles and
                  compressed-feature-dynamics signatures of spoken passwords},
  booktitle    = {2009 {IEEE} International Workshop on Multimedia Signal Processing,
                  {MMSP} '09, Rio de Janeiro, Brazil, October 5-7, 2009},
  pages        = {1--6},
  publisher    = {{IEEE}},
  year         = {2009},
  url          = {https://doi.org/10.1109/MMSP.2009.5293273},
  doi          = {10.1109/MMSP.2009.5293273},
  timestamp    = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl       = {https://dblp.org/rec/conf/mmsp/DasB09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/smc/PaoLWL09,
  author       = {Tsang{-}Long Pao and
                  Wen{-}Yuan Liao and
                  Tsan{-}Nung Wu and
                  Ching{-}Yi Lin},
  title        = {Automatic Visual Feature Extraction for {Mandarin} Audio-Visual Speech
                  Recognition},
  booktitle    = {Proceedings of the {IEEE} International Conference on Systems, Man
                  and Cybernetics, San Antonio, TX, USA, 11-14 October 2009},
  pages        = {2936--2940},
  publisher    = {{IEEE}},
  year         = {2009},
  url          = {https://doi.org/10.1109/ICSMC.2009.5346011},
  doi          = {10.1109/ICSMC.2009.5346011},
  timestamp    = {Wed, 16 Oct 2019 14:14:51 +0200},
  biburl       = {https://dblp.org/rec/conf/smc/PaoLWL09.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ieicet/DendaNY08,
  author       = {Yuki Denda and
                  Takanobu Nishiura and
                  Yoichi Yamashita},
  title        = {Omnidirectional Audio-Visual Talker Localization Based on Dynamic
                  Fusion of Audio-Visual Features Using Validity and Reliability Criteria},
  journal      = {{IEICE} Trans. Inf. Syst.},
  volume       = {91-D},
  number       = {3},
  pages        = {598--606},
  year         = {2008},
  url          = {https://doi.org/10.1093/ietisy/e91-d.3.598},
  doi          = {10.1093/IETISY/E91-D.3.598},
  timestamp    = {Sat, 11 Apr 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/ieicet/DendaNY08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avsp/HaqJE08,
  author       = {Sanaul Haq and
                  Philip J. B. Jackson and
                  James D. Edge},
  editor       = {Roland G{\"{o}}cke and
                  Patrick Lucey and
                  Simon Lucey},
  title        = {Audio-visual feature selection and reduction for emotion classification},
  booktitle    = {International Conference on Auditory-Visual Speech Processing 2008,
                  Moreton Island, Queensland, Australia, September 26-29, 2008},
  pages        = {185--190},
  publisher    = {{ISCA}},
  year         = {2008},
  url          = {http://www.isca-speech.org/archive\_open/avsp08/av08\_185.html},
  timestamp    = {Wed, 10 Feb 2021 22:00:47 +0100},
  biburl       = {https://dblp.org/rec/conf/avsp/HaqJE08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eusipco/AlmajaiM08,
  author       = {Ibrahim Almajai and
                  Ben P. Milner},
  title        = {Using audio-visual features for robust voice activity detection in
                  clean and noisy speech},
  booktitle    = {2008 16th European Signal Processing Conference, {EUSIPCO} 2008, Lausanne,
                  Switzerland, August 25-29, 2008},
  pages        = {1--5},
  publisher    = {{IEEE}},
  year         = {2008},
  url          = {https://ieeexplore.ieee.org/document/7080692/},
  timestamp    = {Mon, 09 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eusipco/AlmajaiM08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eusipco/DemirEYT08,
  author       = {Yasemin Demir and
                  Engin Erzin and
                  Y{\"{u}}cel Yemez and
                  A. Murat Tekalp},
  title        = {Evaluation of audio features for audio-visual analysis of dance figures},
  booktitle    = {2008 16th European Signal Processing Conference, {EUSIPCO} 2008, Lausanne,
                  Switzerland, August 25-29, 2008},
  pages        = {1--4},
  publisher    = {{IEEE}},
  year         = {2008},
  url          = {https://ieeexplore.ieee.org/document/7080401/},
  timestamp    = {Mon, 09 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eusipco/DemirEYT08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/TerrySK08,
  author       = {Louis H. Terry and
                  Derek J. Shiell and
                  Aggelos K. Katsaggelos},
  title        = {Feature space video stream consistency estimation for dynamic stream
                  weighting in audio-visual speech recognition},
  booktitle    = {Proceedings of the International Conference on Image Processing, {ICIP}
                  2008, October 12-15, 2008, San Diego, California, {USA}},
  pages        = {1316--1319},
  publisher    = {{IEEE}},
  year         = {2008},
  url          = {https://doi.org/10.1109/ICIP.2008.4712005},
  doi          = {10.1109/ICIP.2008.4712005},
  timestamp    = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/TerrySK08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icvgip/DasMT08,
  author       = {Amitava Das and
                  Ohil K. Manyam and
                  Makarand Tapaswi},
  title        = {Audio-Visual Person Authentication with Multiple Visualized-Speech
                  Features and Multiple Face Profiles},
  booktitle    = {Sixth Indian Conference on Computer Vision, Graphics {\&} Image
                  Processing, {ICVGIP} 2008, Bhubaneswar, India, 16-19 December 2008},
  pages        = {39--46},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/ICVGIP.2008.106},
  doi          = {10.1109/ICVGIP.2008.106},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icvgip/DasMT08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ism/MowerMN08,
  author       = {Emily Mower and
                  Maja J. Mataric and
                  Shrikanth S. Narayanan},
  title        = {Selection of Emotionally Salient Audio-Visual Features for Modeling
                  Human Evaluations of Synthetic Character Emotion Displays},
  booktitle    = {Tenth {IEEE} International Symposium on Multimedia (ISM2008), December
                  15-17, 2008, Berkeley, California, {USA}},
  pages        = {190--195},
  publisher    = {{IEEE} Computer Society},
  year         = {2008},
  url          = {https://doi.org/10.1109/ISM.2008.78},
  doi          = {10.1109/ISM.2008.78},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/ism/MowerMN08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@incollection{DBLP:series/asc/Chaloupka08,
  author       = {Josef Chaloupka},
  editor       = {Marek Kurzynski and
                  Edward Puchala and
                  Michal Wozniak and
                  Andrzej Zolnierek},
  title        = {Extraction of the Visual Features from the Audio-Visual Speech Signal
                  and the Utilization of These Features for the Speaker Identification},
  booktitle    = {Computer Recognition Systems 2},
  series       = {Advances in Soft Computing},
  volume       = {45},
  pages        = {413--420},
  publisher    = {Springer},
  year         = {2008},
  url          = {https://doi.org/10.1007/978-3-540-75175-5\_52},
  doi          = {10.1007/978-3-540-75175-5\_52},
  timestamp    = {Thu, 07 Nov 2019 15:14:22 +0100},
  biburl       = {https://dblp.org/rec/series/asc/Chaloupka08.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/jmui/ChituRWW07,
  author       = {Alin Gavril Chitu and
                  L{\'{e}}on J. M. Rothkrantz and
                  Pascal Wiggers and
                  Jacek C. Wojdel},
  title        = {Comparison between different feature extraction techniques for audio-visual
                  speech recognition},
  journal      = {J. Multimodal User Interfaces},
  volume       = {1},
  number       = {1},
  pages        = {7--20},
  year         = {2007},
  url          = {https://doi.org/10.1007/BF02884428},
  doi          = {10.1007/BF02884428},
  timestamp    = {Sun, 28 May 2017 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/jmui/ChituRWW07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tce/OtsukaSKD07,
  author       = {Isao Otsuka and
                  Hidetsugu Suginohara and
                  Yoshiaki Kusunoki and
                  Ajay Divakaran},
  title        = {Detection of music segment boundaries using audio-visual features
                  for a personal video recorder},
  journal      = {{IEEE} Trans. Consumer Electron.},
  volume       = {53},
  number       = {1},
  pages        = {150--154},
  year         = {2007},
  url          = {https://doi.org/10.1109/TCE.2007.339517},
  doi          = {10.1109/TCE.2007.339517},
  timestamp    = {Thu, 09 Jul 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/tce/OtsukaSKD07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aiprf/HuangZM07,
  author       = {Lin Huang and
                  Hanqi Zhuang and
                  Salvatore D. Morgera},
  editor       = {Dimitris A. Karras and
                  Chunping Li and
                  Zoran Majkic and
                  S. R. Mahadeva Prasanna},
  title        = {Audio-visual Based Person Recognition with Fusion at Feature Level},
  booktitle    = {International Conference on Artificial Intelligence and Pattern Recognition,
                  AIPR-07, Orlando, Florida, USA, July 9-12, 2007},
  pages        = {249--254},
  publisher    = {{ISRST}},
  year         = {2007},
  timestamp    = {Wed, 07 Aug 2019 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/aiprf/HuangZM07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avsp/GanMY07,
  author       = {Tian Gan and
                  Wolfgang Menzel and
                  Shiqiang Yang},
  editor       = {Jean Vroomen and
                  Marc Swerts and
                  Emiel Krahmer},
  title        = {An audio-visual speech recognition framework based on articulatory
                  features},
  booktitle    = {Auditory-Visual Speech Processing 2007, {AVSP} 2007, Hilvarenbeek,
                  The Netherlands, August 31 - September 3, 2007},
  pages        = {1},
  publisher    = {{ISCA}},
  year         = {2007},
  url          = {http://www.isca-speech.org/archive\_open/avsp07/av07\_P01.html},
  timestamp    = {Wed, 10 Feb 2021 22:00:49 +0100},
  biburl       = {https://dblp.org/rec/conf/avsp/GanMY07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avss/KlausnerTLER07,
  author       = {Andreas Klausner and
                  Allan Tengg and
                  Christian Leistner and
                  Stefan Erb and
                  Bernhard Rinner},
  title        = {An audio-visual sensor fusion approach for feature based vehicle identification},
  booktitle    = {Fourth {IEEE} International Conference on Advanced Video and Signal
                  Based Surveillance, {AVSS} 2007, 5-7 September, 2007, Queen Mary,
                  University of London, London, United Kingdom},
  pages        = {111--116},
  publisher    = {{IEEE} Computer Society},
  year         = {2007},
  url          = {https://doi.org/10.1109/AVSS.2007.4425295},
  doi          = {10.1109/AVSS.2007.4425295},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/avss/KlausnerTLER07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/clear/BernardinGS07,
  author       = {Keni Bernardin and
                  Tobias Gehrig and
                  Rainer Stiefelhagen},
  editor       = {Rainer Stiefelhagen and
                  Rachel Bowers and
                  Jonathan G. Fiscus},
  title        = {Multi-level Particle Filter Fusion of Features and Cues for Audio-Visual
                  Person Tracking},
  booktitle    = {Multimodal Technologies for Perception of Humans, International Evaluation
                  Workshops {CLEAR} 2007 and {RT} 2007, Baltimore, MD, USA, May 8-11,
                  2007, Revised Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {4625},
  pages        = {70--81},
  publisher    = {Springer},
  year         = {2007},
  url          = {https://doi.org/10.1007/978-3-540-68585-2\_5},
  doi          = {10.1007/978-3-540-68585-2\_5},
  timestamp    = {Tue, 14 May 2019 10:00:42 +0200},
  biburl       = {https://dblp.org/rec/conf/clear/BernardinGS07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eusipco/CarbonerasGT07,
  author       = {Andres Valles Carboneras and
                  Mihai Gurban and
                  Jean{-}Philippe Thiran},
  title        = {Low-dimensional motion features for audio-visual speech recognition},
  booktitle    = {15th European Signal Processing Conference, {EUSIPCO} 2007, Poznan,
                  Poland, September 3-7, 2007},
  pages        = {297--301},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://ieeexplore.ieee.org/document/7098812/},
  timestamp    = {Mon, 09 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eusipco/CarbonerasGT07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/LivescuCHKBBKLYBDWFMS07,
  author       = {Karen Livescu and
                  {\"{O}}zg{\"{u}}r {\c{C}}etin and
                  Mark Hasegawa{-}Johnson and
                  Simon King and
                  Chris D. Bartels and
                  Nash M. Borges and
                  Arthur Kantor and
                  Partha Lal and
                  Lisa Yung and
                  Ari Bezman and
                  Stephen Dawson{-}Haggerty and
                  Bronwyn Woods and
                  Joe Frankel and
                  Mathew Magimai{-}Doss and
                  Kate Saenko},
  title        = {Articulatory Feature-Based Methods for Acoustic and Audio-Visual Speech
                  Recognition: Summary from the 2006 {JHU} Summer workshop},
  booktitle    = {Proceedings of the {IEEE} International Conference on Acoustics, Speech,
                  and Signal Processing, {ICASSP} 2007, Honolulu, Hawaii, USA, April
                  15-20, 2007},
  pages        = {621--624},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://doi.org/10.1109/ICASSP.2007.366989},
  doi          = {10.1109/ICASSP.2007.366989},
  timestamp    = {Mon, 29 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icassp/LivescuCHKBBKLYBDWFMS07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/DendaNY07,
  author       = {Yuki Denda and
                  Takanobu Nishiura and
                  Yoichi Yamashita},
  title        = {Omnidirectional audio-visual talker localizer with dynamic feature
                  fusion based on validity and reliability criteria},
  booktitle    = {{INTERSPEECH} 2007, 8th Annual Conference of the International Speech
                  Communication Association, Antwerp, Belgium, August 27-31, 2007},
  pages        = {726--729},
  publisher    = {{ISCA}},
  year         = {2007},
  url          = {https://doi.org/10.21437/Interspeech.2007-300},
  doi          = {10.21437/INTERSPEECH.2007-300},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/DendaNY07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/WuZ07,
  author       = {Guanyong Wu and
                  Jie Zhu},
  title        = {An extension {2DPCA} based visual feature extraction method for audio-visual
                  speech recognition},
  booktitle    = {{INTERSPEECH} 2007, 8th Annual Conference of the International Speech
                  Communication Association, Antwerp, Belgium, August 27-31, 2007},
  pages        = {714--717},
  publisher    = {{ISCA}},
  year         = {2007},
  url          = {https://doi.org/10.21437/Interspeech.2007-297},
  doi          = {10.21437/INTERSPEECH.2007-297},
  timestamp    = {Fri, 23 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/WuZ07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iros/KoiwaNI07,
  author       = {Tomoaki Koiwa and
                  Kazuhiro Nakadai and
                  Jun{-}ichi Imura},
  title        = {Coarse speech recognition by audio-visual integration based on missing
                  feature theory},
  booktitle    = {2007 {IEEE/RSJ} International Conference on Intelligent Robots and
                  Systems, October 29 - November 2, 2007, Sheraton Hotel and Marina,
                  San Diego, California, {USA}},
  pages        = {1751--1756},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://doi.org/10.1109/IROS.2007.4399300},
  doi          = {10.1109/IROS.2007.4399300},
  timestamp    = {Fri, 27 Mar 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/iros/KoiwaNI07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mmsp/DrugmanGT07,
  author       = {Thomas Drugman and
                  Mihai Gurban and
                  Jean{-}Philippe Thiran},
  title        = {Relevant Feature Selection for Audio-Visual Speech Recognition},
  booktitle    = {{IEEE} 9th Workshop on Multimedia Signal Processing, {MMSP} 2007,
                  Chania, Crete, Greece, October 1-3, 2007},
  pages        = {179--182},
  publisher    = {{IEEE}},
  year         = {2007},
  url          = {https://doi.org/10.1109/MMSP.2007.4412847},
  doi          = {10.1109/MMSP.2007.4412847},
  timestamp    = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl       = {https://dblp.org/rec/conf/mmsp/DrugmanGT07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/premi/ChettyW07,
  author       = {Girija Chetty and
                  Michael Wagner},
  editor       = {Ashish Ghosh and
                  Rajat K. De and
                  Sankar K. Pal},
  title        = {Audio Visual Speaker Verification Based on Hybrid Fusion of Cross
                  Modal Features},
  booktitle    = {Pattern Recognition and Machine Intelligence, Second International
                  Conference, PReMI 2007, Kolkata, India, December 18-22, 2007, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {4815},
  pages        = {469--478},
  publisher    = {Springer},
  year         = {2007},
  url          = {https://doi.org/10.1007/978-3-540-77046-6\_58},
  doi          = {10.1007/978-3-540-77046-6\_58},
  timestamp    = {Tue, 14 May 2019 10:00:41 +0200},
  biburl       = {https://dblp.org/rec/conf/premi/ChettyW07.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icb/WuCM06,
  author       = {Zhiyong Wu and
                  Lianhong Cai and
                  Helen M. Meng},
  editor       = {David Zhang and
                  Anil K. Jain},
  title        = {Multi-level Fusion of Audio and Visual Features for Speaker Identification},
  booktitle    = {Advances in Biometrics, International Conference, {ICB} 2006, Hong
                  Kong, China, January 5-7, 2006, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {3832},
  pages        = {493--499},
  publisher    = {Springer},
  year         = {2006},
  url          = {https://doi.org/10.1007/11608288\_66},
  doi          = {10.1007/11608288\_66},
  timestamp    = {Sat, 08 May 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icb/WuCM06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/AlmajaiMD06,
  author       = {Ibrahim Almajai and
                  Ben Milner and
                  Jonathan Darch},
  title        = {Analysis of correlation between audio and visual speech features for
                  clean audio feature prediction in noise},
  booktitle    = {{INTERSPEECH} 2006 - ICSLP, Ninth International Conference on Spoken
                  Language Processing, Pittsburgh, PA, USA, September 17-21, 2006},
  publisher    = {{ISCA}},
  year         = {2006},
  url          = {https://doi.org/10.21437/Interspeech.2006-619},
  doi          = {10.21437/INTERSPEECH.2006-619},
  timestamp    = {Thu, 22 Jun 2023 16:42:16 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/AlmajaiMD06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mlmi/Al-HamesHSR06,
  author       = {Marc A. Al{-}Hames and
                  Benedikt H{\"{o}}rnler and
                  Christoph Scheuermann and
                  Gerhard Rigoll},
  editor       = {Steve Renals and
                  Samy Bengio and
                  Jonathan G. Fiscus},
  title        = {Using Audio, Visual, and Lexical Features in a Multi-modal Virtual
                  Meeting Director},
  booktitle    = {Machine Learning for Multimodal Interaction, Third International Workshop,
                  {MLMI} 2006, Bethesda, MD, USA, May 1-4, 2006, Revised Selected Papers},
  series       = {Lecture Notes in Computer Science},
  volume       = {4299},
  pages        = {63--74},
  publisher    = {Springer},
  year         = {2006},
  url          = {https://doi.org/10.1007/11965152\_6},
  doi          = {10.1007/11965152\_6},
  timestamp    = {Tue, 31 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/mlmi/Al-HamesHSR06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mmsp/KepesiNPGGJ06,
  author       = {Mari{\'{a}}n K{\'{e}}pesi and
                  Michael Neffe and
                  Tuan Van Pham and
                  Michael Grabner and
                  Helmut Grabner and
                  Andreas Juffinger},
  title        = {Audio-Visual Feature Extraction for Semi-Automatic Annotation of Meetings},
  booktitle    = {{IEEE} 8th Workshop on Multimedia Signal Processing, {MMSP} 2006,
                  Victoria, BC, Canada, October 3-6, 2006},
  pages        = {207--211},
  publisher    = {{IEEE}},
  year         = {2006},
  url          = {https://doi.org/10.1109/MMSP.2006.285298},
  doi          = {10.1109/MMSP.2006.285298},
  timestamp    = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl       = {https://dblp.org/rec/conf/mmsp/KepesiNPGGJ06.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tcsv/SadlierO05,
  author       = {David A. Sadlier and
                  Noel E. O'Connor},
  title        = {Event detection in field sports video using audio-visual features
                  and a support vector Machine},
  journal      = {{IEEE} Trans. Circuits Syst. Video Technol.},
  volume       = {15},
  number       = {10},
  pages        = {1225--1233},
  year         = {2005},
  url          = {https://doi.org/10.1109/TCSVT.2005.854237},
  doi          = {10.1109/TCSVT.2005.854237},
  timestamp    = {Tue, 25 Aug 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/tcsv/SadlierO05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eusipco/SarginEYT05,
  author       = {Mehmet Emre Sargin and
                  Engin Erzin and
                  Y{\"{u}}cel Yemez and
                  A. Murat Tekalp},
  title        = {Lip feature extraction based on audio-visual correlation},
  booktitle    = {13th European Signal Processing Conference, {EUSIPCO} 2005, Antalya,
                  Turkey, September 4-8, 2005},
  pages        = {1--4},
  publisher    = {{IEEE}},
  year         = {2005},
  url          = {https://ieeexplore.ieee.org/document/7077967/},
  timestamp    = {Mon, 09 Aug 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/eusipco/SarginEYT05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/ChangMC05,
  author       = {Shih{-}Fu Chang and
                  R. Manmatha and
                  Tat{-}Seng Chua},
  title        = {Combining text and audio-visual features in video indexing},
  booktitle    = {2005 {IEEE} International Conference on Acoustics, Speech, and Signal
                  Processing, {ICASSP} '05, Philadelphia, Pennsylvania, USA, March 18-23,
                  2005},
  pages        = {1005--1008},
  publisher    = {{IEEE}},
  year         = {2005},
  url          = {https://doi.org/10.1109/ICASSP.2005.1416476},
  doi          = {10.1109/ICASSP.2005.1416476},
  timestamp    = {Mon, 22 Jun 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/ChangMC05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/HuangMV05,
  author       = {Jing Huang and
                  Etienne Marcheret and
                  Karthik Visweswariah},
  title        = {Rapid Feature Space Speaker Adaptation for Multi-Stream {HMM}-Based
                  Audio-Visual Speech Recognition},
  booktitle    = {Proceedings of the 2005 {IEEE} International Conference on Multimedia
                  and Expo, {ICME} 2005, July 6-9, 2005, Amsterdam, The Netherlands},
  pages        = {338--341},
  publisher    = {{IEEE} Computer Society},
  year         = {2005},
  url          = {https://doi.org/10.1109/ICME.2005.1521429},
  doi          = {10.1109/ICME.2005.1521429},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icmcs/HuangMV05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icnc/KimRK05,
  author       = {Myung{-}Won Kim and
                  Joung Woo Ryu and
                  Eun Ju Kim},
  editor       = {Lipo Wang and
                  Ke Chen and
                  Yew{-}Soon Ong},
  title        = {Speech Recognition by Integrating Audio, Visual and Contextual Features
                  Based on Neural Networks},
  booktitle    = {Advances in Natural Computation, First International Conference, {ICNC}
                  2005, Changsha, China, August 27-29, 2005, Proceedings, Part {II}},
  series       = {Lecture Notes in Computer Science},
  volume       = {3611},
  pages        = {155--164},
  publisher    = {Springer},
  year         = {2005},
  url          = {https://doi.org/10.1007/11539117\_25},
  doi          = {10.1007/11539117\_25},
  timestamp    = {Sun, 02 Jun 2019 21:14:27 +0200},
  biburl       = {https://dblp.org/rec/conf/icnc/KimRK05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: the mixed-case term fMPE in the title is
% brace-protected so that sentence-casing bibliography styles do not lowercase it.
@inproceedings{DBLP:conf/interspeech/HuangP05,
  author       = {Jing Huang and
                  Daniel Povey},
  title        = {Discriminatively trained features using {fMPE} for multi-stream audio-visual
                  speech recognition},
  booktitle    = {{INTERSPEECH} 2005 - Eurospeech, 9th European Conference on Speech
                  Communication and Technology, Lisbon, Portugal, September 4-8, 2005},
  pages        = {777--780},
  publisher    = {{ISCA}},
  year         = {2005},
  url          = {https://doi.org/10.21437/Interspeech.2005-361},
  doi          = {10.21437/INTERSPEECH.2005-361},
  timestamp    = {Thu, 22 Jun 2023 16:42:16 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/HuangP05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/HuangV05,
  author       = {Huang, Jing and
                  Visweswariah, Karthik},
  title        = {Improving lip-reading with feature space transforms for multi-stream audio-visual speech recognition},
  booktitle    = {{INTERSPEECH} 2005 - Eurospeech, 9th European Conference on Speech Communication and Technology, Lisbon, Portugal, September 4-8, 2005},
  pages        = {1221--1224},
  publisher    = {{ISCA}},
  year         = {2005},
  url          = {https://doi.org/10.21437/Interspeech.2005-373},
  doi          = {10.21437/INTERSPEECH.2005-373},
  timestamp    = {Thu, 22 Jun 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/HuangV05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/isspa/LewisP05,
  author       = {Lewis, Trent W. and
                  Powers, David M. W.},
  title        = {Distinctive feature fusion for improved audio-visual phoneme recognition},
  booktitle    = {Proceedings of the Eighth International Symposium on Signal Processing and Its Applications, {ISSPA} 2005, 28-31 August 2005, Sydney, Australia},
  pages        = {62--65},
  publisher    = {{IEEE}},
  year         = {2005},
  url          = {https://doi.org/10.1109/ISSPA.2005.1580196},
  doi          = {10.1109/ISSPA.2005.1580196},
  timestamp    = {Wed, 16 Oct 2019 14:14:56 +0200},
  biburl       = {https://dblp.org/rec/conf/isspa/LewisP05.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tsmc/KaynakZCSJC04,
  author       = {Kaynak, Mustafa Nazmi and
                  Zhi, Qi and
                  Cheok, Adrian David and
                  Sengupta, Kuntal and
                  Zhang, Jian and
                  Ko, Chi Chung},
  title        = {Analysis of lip geometric features for audio-visual speech recognition},
  journal      = {{IEEE} Trans. Syst. Man Cybern. Part {A}},
  volume       = {34},
  number       = {4},
  pages        = {564--570},
  year         = {2004},
  url          = {https://doi.org/10.1109/TSMCA.2004.826274},
  doi          = {10.1109/TSMCA.2004.826274},
  timestamp    = {Mon, 25 May 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/tsmc/KaynakZCSJC04.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/AleksicK04,
  author       = {Aleksic, Petar S. and
                  Katsaggelos, Aggelos K.},
  title        = {Comparison of low- and high-level visual features for audio-visual continuous automatic speech recognition},
  booktitle    = {2004 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2004, Montreal, Quebec, Canada, May 17-21, 2004},
  pages        = {917--920},
  publisher    = {{IEEE}},
  year         = {2004},
  url          = {https://doi.org/10.1109/ICASSP.2004.1327261},
  doi          = {10.1109/ICASSP.2004.1327261},
  timestamp    = {Mon, 22 Jun 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/AleksicK04.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/JiangPNIN04,
  author       = {Jiang, Jintao and
                  Potamianos, Gerasimos and
                  Nock, Harriet J. and
                  Iyengar, Giridharan and
                  Neti, Chalapathy},
  title        = {Improved face and feature finding for audio-visual speech recognition in visually challenging environments},
  booktitle    = {2004 {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2004, Montreal, Quebec, Canada, May 17-21, 2004},
  pages        = {873--876},
  publisher    = {{IEEE}},
  year         = {2004},
  url          = {https://doi.org/10.1109/ICASSP.2004.1327250},
  doi          = {10.1109/ICASSP.2004.1327250},
  timestamp    = {Mon, 22 Jun 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/JiangPNIN04.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/mir/XuC04,
  author       = {Xu, Huaxin and
                  Chua, Tat{-}Seng},
  editor       = {Lew, Michael S. and
                  Sebe, Nicu and
                  Djeraba, Chabane},
  title        = {The fusion of audio-visual features and external knowledge for event detection in team sports video},
  booktitle    = {Proceedings of the 6th {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval, {MIR} 2004, October 15-16, 2004, New York, NY, {USA}},
  pages        = {127--134},
  publisher    = {{ACM}},
  year         = {2004},
  url          = {https://doi.org/10.1145/1026711.1026733},
  doi          = {10.1145/1026711.1026733},
  timestamp    = {Tue, 06 Nov 2018 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/mir/XuC04.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/webi/Liu04,
  author       = {Liu, Huayong},
  title        = {Content-Based {TV} Sports Video Retrieval Based on Audio-Visual Features and Text Information},
  booktitle    = {2004 {IEEE/WIC/ACM} International Conference on Web Intelligence {(WI} 2004), 20-24 September 2004, Beijing, China},
  pages        = {481--484},
  publisher    = {{IEEE} Computer Society},
  year         = {2004},
  url          = {https://doi.org/10.1109/WI.2004.10107},
  doi          = {10.1109/WI.2004.10107},
  timestamp    = {Thu, 23 Mar 2023 14:30:18 +0100},
  biburl       = {https://dblp.org/rec/conf/webi/Liu04.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/jvcir/WangDVCS03,
  author       = {Wang, Hualu and
                  Divakaran, Ajay and
                  Vetro, Anthony and
                  Chang, Shih{-}Fu and
                  Sun, Huifang},
  title        = {Survey of compressed-domain features used in audio-visual indexing and analysis},
  journal      = {J. Vis. Commun. Image Represent.},
  volume       = {14},
  number       = {2},
  pages        = {150--183},
  year         = {2003},
  url          = {https://doi.org/10.1016/S1047-3203(03)00019-1},
  doi          = {10.1016/S1047-3203(03)00019-1},
  timestamp    = {Sat, 22 Feb 2020 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/jvcir/WangDVCS03.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: the proceedings title is corrected from
% "Audio-and Video-Based Biometrie" to "Audio- and Video-Based Biometric".
@inproceedings{DBLP:conf/avbpa/FoxR03,
  author       = {Niall A. Fox and
                  Richard B. Reilly},
  editor       = {Josef Kittler and
                  Mark S. Nixon},
  title        = {Audio-Visual Speaker Identification Based on the Use of Dynamic Audio
                  and Visual Features},
  booktitle    = {Audio- and Video-Based Biometric Person Authentication, 4th International
                  Conference, {AVBPA} 2003, Guildford, UK, June 9-11, 2003 Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {2688},
  pages        = {743--751},
  publisher    = {Springer},
  year         = {2003},
  url          = {https://doi.org/10.1007/3-540-44887-X\_86},
  doi          = {10.1007/3-540-44887-X\_86},
  timestamp    = {Mon, 15 Jun 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/avbpa/FoxR03.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/SuganoINY03,
  author       = {Sugano, Masaru and
                  Isaksson, Roger and
                  Nakajima, Yasuyuki and
                  Yanagihara, Hiromasa},
  title        = {Shot genre classification using compressed audio-visual features},
  booktitle    = {Proceedings of the 2003 International Conference on Image Processing, {ICIP} 2003, Barcelona, Catalonia, Spain, September 14-18, 2003},
  pages        = {17--20},
  publisher    = {{IEEE}},
  year         = {2003},
  url          = {https://doi.org/10.1109/ICIP.2003.1246605},
  doi          = {10.1109/ICIP.2003.1246605},
  timestamp    = {Tue, 02 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/SuganoINY03.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/spieVIP/LeeYK03,
  author       = {Lee, Shih{-}Hung and
                  Yeh, Chia{-}Hung and
                  Kuo, C.{-}C. Jay},
  editor       = {Rahman, Zia{-}ur and
                  Schowengerdt, Robert A. and
                  Reichenbach, Stephen E.},
  title        = {Robust {TV} commercial detection based on audiovisual features},
  booktitle    = {Visual Information Processing XII, Orlando, FL, USA, April 21, 2003},
  series       = {{SPIE} Proceedings},
  volume       = {5108},
  pages        = {147--158},
  publisher    = {{SPIE}},
  year         = {2003},
  url          = {https://doi.org/10.1117/12.486775},
  doi          = {10.1117/12.486775},
  timestamp    = {Fri, 06 May 2022 13:45:23 +0200},
  biburl       = {https://dblp.org/rec/conf/spieVIP/LeeYK03.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/ejasp/AleksicWWK02,
  author       = {Aleksic, Petar S. and
                  Williams, Jay J. and
                  Wu, Zhilin and
                  Katsaggelos, Aggelos K.},
  title        = {Audio-Visual Speech Recognition Using {MPEG-4} Compliant Visual Features},
  journal      = {{EURASIP} J. Adv. Signal Process.},
  volume       = {2002},
  number       = {11},
  pages        = {1213--1227},
  year         = {2002},
  url          = {https://doi.org/10.1155/S1110865702206162},
  doi          = {10.1155/S1110865702206162},
  timestamp    = {Tue, 21 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/ejasp/AleksicWWK02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/civr/KimCKK02,
  author       = {Kim, Kyungsu and
                  Choi, Junho and
                  Kim, Namjung and
                  Kim, Pankoo},
  editor       = {Lew, Michael S. and
                  Sebe, Nicu and
                  Eakins, John P.},
  title        = {Extracting Semantic Information from Basketball Video Based on Audio-Visual Features},
  booktitle    = {Image and Video Retrieval, International Conference, {CIVR} 2002, London, UK, July 18-19, 2002, Proceedings},
  series       = {Lecture Notes in Computer Science},
  volume       = {2383},
  pages        = {278--288},
  publisher    = {Springer},
  year         = {2002},
  url          = {https://doi.org/10.1007/3-540-45479-9\_30},
  doi          = {10.1007/3-540-45479-9\_30},
  timestamp    = {Wed, 16 Jun 2021 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/civr/KimCKK02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/GoeckePN02,
  author       = {Goecke, Roland and
                  Potamianos, Gerasimos and
                  Neti, Chalapathy},
  title        = {Noisy audio feature enhancement using audio-visual speech data},
  booktitle    = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2002, May 13-17 2002, Orlando, Florida, {USA}},
  pages        = {2025--2028},
  publisher    = {{IEEE}},
  year         = {2002},
  url          = {https://doi.org/10.1109/ICASSP.2002.5745030},
  doi          = {10.1109/ICASSP.2002.5745030},
  timestamp    = {Wed, 16 Oct 2019 14:14:52 +0200},
  biburl       = {https://dblp.org/rec/conf/icassp/GoeckePN02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/AleksicWWK02,
  author       = {Aleksic, Petar S. and
                  Williams, Jay J. and
                  Wu, Zhilin and
                  Katsaggelos, Aggelos K.},
  title        = {Audio-visual continuous speech recognition using {MPEG-4} compliant visual features},
  booktitle    = {Proceedings of the 2002 International Conference on Image Processing, {ICIP} 2002, Rochester, New York, USA, September 22-25, 2002},
  pages        = {960--963},
  publisher    = {{IEEE}},
  year         = {2002},
  url          = {https://doi.org/10.1109/ICIP.2002.1038187},
  doi          = {10.1109/ICIP.2002.1038187},
  timestamp    = {Wed, 16 Oct 2019 14:14:52 +0200},
  biburl       = {https://dblp.org/rec/conf/icip/AleksicWWK02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icpr/RasheedS02,
  author       = {Rasheed, Zeeshan and
                  Shah, Mubarak},
  title        = {Movie Genre Classification By Exploiting Audio-Visual Features Of Previews},
  booktitle    = {16th International Conference on Pattern Recognition, {ICPR} 2002, Quebec, Canada, August 11-15, 2002},
  pages        = {1086--1089},
  publisher    = {{IEEE} Computer Society},
  year         = {2002},
  url          = {https://doi.org/10.1109/ICPR.2002.1048494},
  doi          = {10.1109/ICPR.2002.1048494},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icpr/RasheedS02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: the acronym DCT in the title is brace-protected
% so that sentence-casing bibliography styles keep it in upper case.
@inproceedings{DBLP:conf/interspeech/HeckmannKSB02,
  author       = {Martin Heckmann and
                  Kristian Kroschel and
                  Christophe Savariaux and
                  Fr{\'{e}}d{\'{e}}ric Berthommier},
  editor       = {John H. L. Hansen and
                  Bryan L. Pellom},
  title        = {{DCT}-based video features for audio-visual speech recognition},
  booktitle    = {7th International Conference on Spoken Language Processing, {ICSLP2002}
                  - {INTERSPEECH} 2002, Denver, Colorado, USA, September 16-20, 2002},
  pages        = {1925--1928},
  publisher    = {{ISCA}},
  year         = {2002},
  url          = {https://doi.org/10.21437/ICSLP.2002-434},
  doi          = {10.21437/ICSLP.2002-434},
  timestamp    = {Thu, 22 Jun 2023 16:42:18 +0200},
  biburl       = {https://dblp.org/rec/conf/interspeech/HeckmannKSB02.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: title spelling corrected from "Probablistic"
% to "Probabilistic". NOTE(review): confirm against the record at the url below.
@phdthesis{DBLP:phd/us/Naphade01,
  author       = {Milind R. Naphade},
  title        = {A Probabilistic Framework for Mapping Audio-Visual Features to High-Level
                  Semantics in Terms of Concepts and Context},
  school       = {University of Illinois Urbana-Champaign, {USA}},
  year         = {2001},
  url          = {https://hdl.handle.net/2142/80716},
  timestamp    = {Thu, 07 Sep 2023 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/phd/us/Naphade01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/PotamianosLN01,
  author       = {Potamianos, Gerasimos and
                  Luettin, Juergen and
                  Neti, Chalapathy},
  title        = {Hierarchical discriminant features for audio-visual {LVCSR}},
  booktitle    = {{IEEE} International Conference on Acoustics, Speech, and Signal Processing, {ICASSP} 2001, 7-11 May, 2001, Salt Palace Convention Center, Salt Lake City, Utah, USA, Proceedings},
  pages        = {165--168},
  publisher    = {{IEEE}},
  year         = {2001},
  url          = {https://doi.org/10.1109/ICASSP.2001.940793},
  doi          = {10.1109/ICASSP.2001.940793},
  timestamp    = {Thu, 23 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icassp/PotamianosLN01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/MatthewsPNL01,
  author       = {Matthews, Iain A. and
                  Potamianos, Gerasimos and
                  Neti, Chalapathy and
                  Luettin, Juergen},
  title        = {A Comparison Of Model And Transform-Based Visual Features For Audio-Visual {LVCSR}},
  booktitle    = {Proceedings of the 2001 {IEEE} International Conference on Multimedia and Expo, {ICME} 2001, August 22-25, 2001, Tokyo, Japan},
  publisher    = {{IEEE} Computer Society},
  year         = {2001},
  url          = {https://doi.org/10.1109/ICME.2001.1237849},
  doi          = {10.1109/ICME.2001.1237849},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icmcs/MatthewsPNL01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/YoshitakaM01,
  author       = {Yoshitaka, Atsuo and
                  Miyake, Masato},
  title        = {Scene Detection by Audio-Visual Features},
  booktitle    = {Proceedings of the 2001 {IEEE} International Conference on Multimedia and Expo, {ICME} 2001, August 22-25, 2001, Tokyo, Japan},
  publisher    = {{IEEE} Computer Society},
  year         = {2001},
  url          = {https://doi.org/10.1109/ICME.2001.1237652},
  doi          = {10.1109/ICME.2001.1237652},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icmcs/YoshitakaM01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: the acronym HMM in the title is brace-protected
% so that sentence-casing bibliography styles keep it in upper case.
@inproceedings{DBLP:conf/mmsp/Chan01,
  author       = {Michael T. Chan},
  editor       = {Jean{-}Luc Dugelay and
                  Kenneth Rose},
  title        = {{HMM}-based audio-visual speech recognition integrating geometric and
                  appearance-based visual features},
  booktitle    = {Fourth {IEEE} Workshop on Multimedia Signal Processing, {MMSP} 2001,
                  Cannes, France, October 3-5, 2001},
  pages        = {9--14},
  publisher    = {{IEEE}},
  year         = {2001},
  url          = {https://doi.org/10.1109/MMSP.2001.962703},
  doi          = {10.1109/MMSP.2001.962703},
  timestamp    = {Wed, 16 Oct 2019 14:14:49 +0200},
  biburl       = {https://dblp.org/rec/conf/mmsp/Chan01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/spieSR/LiWSD01,
  author       = {Li, Dongge and
                  Wei, Gang and
                  Sethi, Ishwar K. and
                  Dimitrova, Nevenka},
  editor       = {Yeung, Minerva M. and
                  Li, Chung{-}Sheng and
                  Lienhart, Rainer},
  title        = {Fusion of visual and audio features for person identification in real video},
  booktitle    = {Storage and Retrieval for Media Databases 2001, San Jose, CA, USA, January 24, 2001},
  series       = {{SPIE} Proceedings},
  volume       = {4315},
  pages        = {180--187},
  publisher    = {{SPIE}},
  year         = {2001},
  url          = {https://doi.org/10.1117/12.410926},
  doi          = {10.1117/12.410926},
  timestamp    = {Sun, 02 Oct 2022 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/conf/spieSR/LiWSD01.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/taslp/WatanabeTN00,
  author       = {Watanabe, Akira and
                  Tomishige, Shingo and
                  Nakatake, Masahiro},
  title        = {Speech visualization by integrating features for the hearing impaired},
  journal      = {{IEEE} Trans. Speech Audio Process.},
  volume       = {8},
  number       = {4},
  pages        = {454--466},
  year         = {2000},
  url          = {https://doi.org/10.1109/89.848226},
  doi          = {10.1109/89.848226},
  timestamp    = {Sun, 17 May 2020 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/taslp/WatanabeTN00.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icip/PanLH00,
  author       = {Pan, Hao and
                  Liang, Zhi{-}Pei and
                  Huang, Thomas S.},
  title        = {Fusing Audio and Visual Features of Speech},
  booktitle    = {Proceedings of the 2000 International Conference on Image Processing, {ICIP} 2000, Vancouver, BC, Canada, September 10-13, 2000},
  pages        = {214--217},
  publisher    = {{IEEE}},
  year         = {2000},
  url          = {https://doi.org/10.1109/ICIP.2000.899333},
  doi          = {10.1109/ICIP.2000.899333},
  timestamp    = {Fri, 12 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/PanLH00.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icmcs/PanLH00,
  author       = {Pan, Hao and
                  Liang, Zhi{-}Pei and
                  Huang, Thomas S.},
  title        = {A New Approach to Integrate Audio and Visual Features of Speech},
  booktitle    = {2000 {IEEE} International Conference on Multimedia and Expo, {ICME} 2000, New York, NY, USA, July 30 - August 2, 2000},
  pages        = {1093--1096},
  publisher    = {{IEEE} Computer Society},
  year         = {2000},
  url          = {https://doi.org/10.1109/ICME.2000.871551},
  doi          = {10.1109/ICME.2000.871551},
  timestamp    = {Fri, 12 Jan 2024 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icmcs/PanLH00.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/avsp/NiyogiPZ99,
  author       = {Niyogi, Partha and
                  Petajan, Eric and
                  Zhong, Jialin},
  editor       = {Massaro, Dominic W.},
  title        = {Feature based representation for audio-visual speech recognition},
  booktitle    = {Auditory-Visual Speech Processing, {AVSP} '99, Santa Cruz, CA, USA, August 7-10, 1999},
  pages        = {16},
  publisher    = {{ISCA}},
  year         = {1999},
  url          = {http://www.isca-speech.org/archive\_open/avsp99/av99\_016.html},
  timestamp    = {Wed, 10 Feb 2021 22:00:51 +0100},
  biburl       = {https://dblp.org/rec/conf/avsp/NiyogiPZ99.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
% Deviation from the dblp export: the conference acronym ICIP-98 in booktitle is
% brace-protected, matching how the other booktitle acronyms in this file are written.
@inproceedings{DBLP:conf/icip/Pavlovic98,
  author       = {Vladimir Pavlovic},
  title        = {Multimodal Tracking and Classification of Audio-Visual Features},
  booktitle    = {Proceedings of the 1998 {IEEE} International Conference on Image Processing,
                  {ICIP-98}, Chicago, Illinois, USA, October 4-7, 1998},
  pages        = {343--347},
  publisher    = {{IEEE} Computer Society},
  year         = {1998},
  url          = {https://doi.org/10.1109/ICIP.1998.723492},
  doi          = {10.1109/ICIP.1998.723492},
  timestamp    = {Fri, 24 Mar 2023 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/conf/icip/Pavlovic98.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}
a service of Schloss Dagstuhl - Leibniz Center for Informatics