iclr-blogposts · turian · Nov 23, 2024 · Nov 23, 2024 · Nov 23, 2024 · Nov 23, 2024
diff --git a/_posts/2025-04-28-better-scores-worse-generation.md b/_posts/2025-04-28-better-scores-worse-generation.md
diff --git a/assets/bibliography/2025-04-28-better-scores-worse-generation.bib b/assets/bibliography/2025-04-28-better-scores-worse-generation.bib
@@ -0,0 +1,113 @@
+@article{shi2024espnet,
+  author  = {Shi, Jiatong and Tian, Jinchuan and Wu, Yihan and Jung, Jee-weon and Yip, Jia Qi and Masuyama, Yoshiki and Chen, William and Wu, Yuning and Tang, Yuxun and Baali, Massa and Alharhi, Dareen and Zhang, Dong and Deng, Ruifan and Srivastava, Tejes and Wu, Haibin and Liu, Alexander H. and Raj, Bhiksha and Jin, Qin and Song, Ruihua and Watanabe, Shinji},
+  title   = {ESPnet-Codec: Comprehensive Training and Evaluation of Neural Codecs for Audio, Music, and Speech},
+  journal = {arXiv preprint arXiv:2409.15897},
+  year    = {2024},
+}
+
+@inproceedings{lee2023bigvgan,
+  author    = {Lee, Sang-gil and Ping, Wei and Ginsburg, Boris and Catanzaro, Bryan and Yoon, Sungroh},
+  title     = {BigVGAN: A universal neural vocoder with large-scale training},
+  booktitle = {ICLR},
+  year      = {2023},
+}
+
+@article{soundstream,
+  author  = {Zeghidour, Neil and Luebs, Alejandro and Omran, Ahmed and Skoglund, Jan and Tagliasacchi, Marco},
+  title   = {SoundStream: An End-to-End Neural Audio Codec},
+  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+  volume  = {30},
+  pages   = {495-507},
+  year    = {2022},
+  doi     = {10.1109/TASLP.2021.3129994},
+}
+
+@article{defossez2022highfi,
+  author  = {Défossez, Alexandre and Copet, Jade and Synnaeve, Gabriel and Adi, Yossi},
+  title   = {High Fidelity Neural Audio Compression},
+  journal = {arXiv preprint arXiv:2210.13438},
+  year    = {2022},
+}
+
+@inproceedings{dac,
+  author    = {Kumar, Rithesh and Seetharaman, Prem and Luebs, Alejandro and Kumar, Ishaan and Kumar, Kundan},
+  title     = {High-fidelity audio compression with improved RVQGAN},
+  booktitle = {NeurIPS},
+  year      = {2024},
+}
+
+@article{zielinski2008bias,
+  author  = {Zieliński, Sławomir and Rumsey, Francis and Bech, Søren},
+  title   = {On Some Biases Encountered in Modern Audio Quality Listening Tests—A Review},
+  journal = {Journal of the Audio Engineering Society},
+  volume  = {56},
+  pages   = {427-451},
+  month   = jun,
+  year    = {2008},
+}
+
+@article{medin1993,
+  author  = {Medin, Douglas L. and Goldstone, Robert L. and Gentner, Dedre},
+  title   = {Respects for Similarity},
+  journal = {Psychological Review},
+  volume  = {100},
+  pages   = {254-278},
+  month   = apr,
+  year    = {1993},
+}
+
+@article{fsd50k,
+  author  = {Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier},
+  title   = {FSD50K: An Open Dataset of Human-Labeled Sound Events},
+  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+  volume  = {30},
+  pages   = {829–852},
+  month   = dec,
+  year    = {2021},
+}
+
+@article{suied2014,
+  author  = {Suied, Clara and Agus, Trevor R. and Thorpe, Simon J. and Mesgarani, Nima and Pressnitzer, Daniel},
+  title   = {Auditory gist: Recognition of very short sounds from timbre cues},
+  journal = {Journal of the Acoustical Society of America},
+  volume  = {135},
+  pages   = {1380-1391},
+  year    = {2014},
+}
+
+@article{visqol,
+  author  = {Hines, Andrew and Skoglund, Jan and Kokaram, Anil and Harte, Naomi},
+  title   = {ViSQOL: an objective speech quality model},
+  journal = {EURASIP Journal on Audio, Speech, and Music Processing},
+  volume  = {2015 (13)},
+  pages   = {1-18},
+  year    = {2015},
+}
+
+@inproceedings{dpam,
+  author    = {Manocha, Pranay and Finkelstein, Adam and Zhang, Richard and Bryan, Nicholas J. and Mysore, Gautham J. and Jin, Zeyu},
+  title     = {A Differentiable Perceptual Audio Metric Learned from Just Noticeable Differences},
+  booktitle = {Interspeech},
+  year      = {2020},
+}
+
+@inproceedings{zhang2018,
+  author    = {Zhang, Richard and Isola, Phillip and Efros, Alexei A. and Shechtman, Eli and Wang, Oliver},
+  title     = {The Unreasonable Effectiveness of Deep Features as a Perceptual Metric},
+  booktitle = {2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  year      = {2018},
+}
+
+@inproceedings{sundaram2024,
+  author    = {Sundaram, Shobhita and Fu, Stephanie and Muttenthaler, Lukas and Tamir, Netanel Y. and Chai, Lucy and Kornblith, Simon and Darrell, Trevor and Isola, Phillip},
+  title     = {When Does Perceptual Alignment Benefit Vision Representations?},
+  booktitle = {NeurIPS},
+  year      = {2024},
+}
+
+@inproceedings{patrini2017,
+  author    = {Patrini, Giorgio and Rozza, Alessandro and Menon, Aditya Krishna and Nock, Richard and Qu, Lizhen},
+  title     = {Making Deep Neural Networks Robust to Label Noise: A Loss Correction Approach},
+  booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year      = {2017},
+}
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/3afc_roc_curves.png b/assets/img/2025-04-28-better-scores-worse-generation/3afc_roc_curves.png
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/ax_roc_curves.png b/assets/img/2025-04-28-better-scores-worse-generation/ax_roc_curves.png
diff --git a/...n/bigvgan-descript-audio-codec-comparison-hifi-tts-dev-speech-data-1024x821.png b/...n/bigvgan-descript-audio-codec-comparison-hifi-tts-dev-speech-data-1024x821.png
diff --git a/...se-generation/bigvgan-descript-audio-codec-musdb18-hq-music-data-1-1024x412.png b/...se-generation/bigvgan-descript-audio-codec-musdb18-hq-music-data-1-1024x412.png
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_pitch_error_1.wav b/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_pitch_error_1.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_pitch_error_1_ref.wav b/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_pitch_error_1_ref.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_temporal_error_3.wav b/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_temporal_error_3.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_temporal_error_3_ref.wav b/assets/img/2025-04-28-better-scores-worse-generation/bigvgan_temporal_error_3_ref.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/dac_background_error_1.wav b/assets/img/2025-04-28-better-scores-worse-generation/dac_background_error_1.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/dac_background_error_1_ref.wav b/assets/img/2025-04-28-better-scores-worse-generation/dac_background_error_1_ref.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/dac_harmonic_distortion_error_3.wav b/assets/img/2025-04-28-better-scores-worse-generation/dac_harmonic_distortion_error_3.wav
diff --git a/assets/img/2025-04-28-better-scores-worse-generation/dac_harmonic_distortion_error_3_ref.wav b/assets/img/2025-04-28-better-scores-worse-generation/dac_harmonic_distortion_error_3_ref.wav