From 487600d0445dbd99e0bcb4e372943fea7b59ae12 Mon Sep 17 00:00:00 2001 From: Hung Dinh Xuan Date: Sat, 21 Dec 2024 15:59:39 +0900 Subject: [PATCH] add configs for train private data --- .../normal_largecorpus_multiview_cnsl.yaml | 1 + ...rmertcm_multiview_large_corpus_conf-2.yaml | 48 ++ notebooks/analyze.ipynb | 700 +++++++++--------- notebooks/eval.ipynb | 130 +++- notebooks/prepare_dataset.ipynb | 185 ++++- src/data/components/augwrapper.py | 20 +- src/data/normal_multiview_datamodule.py | 19 +- 7 files changed, 690 insertions(+), 413 deletions(-) create mode 100644 configs/experiment/xlsr_conformertcm_multiview_large_corpus_conf-2.yaml diff --git a/configs/data/normal_largecorpus_multiview_cnsl.yaml b/configs/data/normal_largecorpus_multiview_cnsl.yaml index 8f36302..621234c 100644 --- a/configs/data/normal_largecorpus_multiview_cnsl.yaml +++ b/configs/data/normal_largecorpus_multiview_cnsl.yaml @@ -29,6 +29,7 @@ args: augmentation_methods: ["RawBoost12", "pitch_1", "volume_10", "speed_01", "none"] # "none" is the original data wav_samp_rate: 16000 + trim_length: 64000 online_aug: true aug_dir: ${oc.env:LARGE_CORPUS_FOR_CNSL}/aug noise_path: ${oc.env:NOISE_PATH} diff --git a/configs/experiment/xlsr_conformertcm_multiview_large_corpus_conf-2.yaml b/configs/experiment/xlsr_conformertcm_multiview_large_corpus_conf-2.yaml new file mode 100644 index 0000000..68da92a --- /dev/null +++ b/configs/experiment/xlsr_conformertcm_multiview_large_corpus_conf-2.yaml @@ -0,0 +1,48 @@ +# @package _global_ + +# to execute this experiment run: +# python train.py experiment=example + +defaults: + - override /data: normal_largecorpus_multiview_cnsl + - override /model: xlsr_conformertcm_baseline_multiview + - override /callbacks: default_loss + - override /trainer: default + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +tags: ["normal_largecorpus_multiview_cnsl", "xlsr_conformertcm_baseline_multiview"] + +seed: 1234 + +trainer: + max_epochs: 100 + gradient_clip_val: 0.0 + accelerator: cuda + +model: + optimizer: + lr: 0.000001 + weight_decay: 0.0001 + net: null + scheduler: null + compile: true + cross_entropy_weight: [0.7, 0.3] # Balanced weights for cross entropy loss + +data: + batch_size: 14 + num_workers: 8 + pin_memory: true + args: + augmentation_methods: ["RawBoost12", 'RawBoost12', 'RawBoostFull', "pitch_1", "volume_10", "speed_01", "none"] # "none" is the original data + padding_type: repeat + random_start: True + + +logger: + wandb: + tags: ${tags} + group: "normal_largecorpus_multiview_cnsl" + aim: + experiment: "normal_largecorpus_multiview_cnsl" diff --git a/notebooks/analyze.ipynb b/notebooks/analyze.ipynb index add77d0..8795665 100644 --- a/notebooks/analyze.ipynb +++ b/notebooks/analyze.ipynb @@ -1,353 +1,353 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing audio files: 0%| | 0/31779 [00:00 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43meval_to_score_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mscore_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/datad/Datasets/in_the_wild.txt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 41\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "Cell \u001b[0;32mIn[4], line 25\u001b[0m, in \u001b[0;36meval_to_score_file\u001b[0;34m(score_file, cm_key_file)\u001b[0m\n\u001b[1;32m 22\u001b[0m bona_cm \u001b[38;5;241m=\u001b[39m cm_scores[cm_scores[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1_y\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbonafide\u001b[39m\u001b[38;5;124m'\u001b[39m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1_x\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 23\u001b[0m spoof_cm \u001b[38;5;241m=\u001b[39m cm_scores[cm_scores[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1_y\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mspoof\u001b[39m\u001b[38;5;124m'\u001b[39m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m1_x\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m---> 25\u001b[0m eer_cm, th \u001b[38;5;241m=\u001b[39m \u001b[43mem\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_eer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbona_cm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mspoof_cm\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m out_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124meer: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124mthreshold: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;241m100\u001b[39m\u001b[38;5;241m*\u001b[39meer_cm, th)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28mprint\u001b[39m(out_data)\n", - "File \u001b[0;32m/data/hungdx/Lightning-hydra/notebooks/eval_metrics_DF.py:44\u001b[0m, in \u001b[0;36mcompute_eer\u001b[0;34m(target_scores, nontarget_scores)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_eer\u001b[39m(target_scores, nontarget_scores):\n\u001b[1;32m 43\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\" Returns equal error rate (EER) and the corresponding threshold. 
\"\"\"\u001b[39;00m\n\u001b[0;32m---> 44\u001b[0m frr, far, thresholds \u001b[38;5;241m=\u001b[39m \u001b[43mcompute_det_curve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtarget_scores\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnontarget_scores\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 45\u001b[0m abs_diffs \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mabs(frr \u001b[38;5;241m-\u001b[39m far)\n\u001b[1;32m 46\u001b[0m min_index \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39margmin(abs_diffs)\n", - "File \u001b[0;32m/data/hungdx/Lightning-hydra/notebooks/eval_metrics_DF.py:37\u001b[0m, in \u001b[0;36mcompute_det_curve\u001b[0;34m(target_scores, nontarget_scores)\u001b[0m\n\u001b[1;32m 35\u001b[0m frr \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate((np\u001b[38;5;241m.\u001b[39matleast_1d(\u001b[38;5;241m0\u001b[39m), tar_trial_sums \u001b[38;5;241m/\u001b[39m target_scores\u001b[38;5;241m.\u001b[39msize)) \u001b[38;5;66;03m# false rejection rates\u001b[39;00m\n\u001b[1;32m 36\u001b[0m far \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate((np\u001b[38;5;241m.\u001b[39matleast_1d(\u001b[38;5;241m1\u001b[39m), nontarget_trial_sums \u001b[38;5;241m/\u001b[39m nontarget_scores\u001b[38;5;241m.\u001b[39msize)) \u001b[38;5;66;03m# false acceptance rates\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m thresholds \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate((np\u001b[38;5;241m.\u001b[39matleast_1d(all_scores[\u001b[43mindices\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m] \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m0.001\u001b[39m), all_scores[indices])) \u001b[38;5;66;03m# Thresholds are the sorted scores\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m frr, far, thresholds\n", - "\u001b[0;31mIndexError\u001b[0m: index 0 is out of bounds for axis 0 with size 0" - ] - } - ], + "outputs": [], "source": [ "import sys\n", "import os.path\n", @@ -174,16 +152,24 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "eer: 8.234916170745263\tthreshold: -3.04897689819336\n", - "\n", - "0.08234916170745263\n" + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/dataa/Datasets/in_the_wild.txt'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 29\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28mprint\u001b[39m(out_data)\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m eer_cm\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43meval_to_score_file\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/data/hungdx/tcm_add/Scores/avg_5_best_baseline_itw_var.txt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/dataa/Datasets/in_the_wild.txt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m)\n", + "Cell \u001b[0;32mIn[3], line 13\u001b[0m, in \u001b[0;36meval_to_score_file\u001b[0;34m(score_file, cm_key_file)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21meval_to_score_file\u001b[39m(score_file, 
cm_key_file):\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# CM key file is the metadata file that contains the ground truth labels for the eval set\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# score file is the output of the system that contains the scores for the eval set\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# phase is the phase of the eval set (dev or eval)\u001b[39;00m\n\u001b[0;32m---> 13\u001b[0m cm_data \u001b[38;5;241m=\u001b[39m \u001b[43mpandas\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcm_key_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m \u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheader\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m submission_scores \u001b[38;5;241m=\u001b[39m pandas\u001b[38;5;241m.\u001b[39mread_csv(\n\u001b[1;32m 15\u001b[0m score_file, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m'\u001b[39m, header\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, skipinitialspace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# check here for progress vs eval set\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 935\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 936\u001b[0m dialect,\n\u001b[1;32m 937\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 944\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 945\u001b[0m )\n\u001b[1;32m 946\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 948\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 608\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 610\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 611\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 613\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 614\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1445\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1447\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1448\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1705\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1703\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1704\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1705\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1706\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1707\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1708\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1709\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1710\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1711\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1712\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1713\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1714\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1715\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1716\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/miniconda3/lib/python3.11/site-packages/pandas/io/common.py:863\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 859\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 860\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 861\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 862\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 863\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 864\u001b[0m handle,\n\u001b[1;32m 865\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m 866\u001b[0m encoding\u001b[38;5;241m=\u001b[39mioargs\u001b[38;5;241m.\u001b[39mencoding,\n\u001b[1;32m 
867\u001b[0m errors\u001b[38;5;241m=\u001b[39merrors,\n\u001b[1;32m 868\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 869\u001b[0m )\n\u001b[1;32m 870\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 871\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 872\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/dataa/Datasets/in_the_wild.txt'" ] } ], @@ -1960,15 +1946,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/home/hungdx/mlaad_xlsr_conformertcm_train_large_corpus_multiview_conf-1.txt\n", - "Accuracy: 93.54\n", + "/home/hungdx/mlaad_xlsr_conformertcm_train_large_corpus_multiview_conf-1_lts_3s.txt\n", + "Accuracy: 93.48\n", "\n", "\n" ] @@ -1990,7 +1976,7 @@ "# list_dir = sorted(list_dir)\n", "\n", "#for score_file in list_dir:\n", - "score_file = \"/home/hungdx/mlaad_xlsr_conformertcm_train_large_corpus_multiview_conf-1.txt\"\n", + "score_file = \"/home/hungdx/mlaad_xlsr_conformertcm_train_large_corpus_multiview_conf-1_lts_3s.txt\"\n", "pred_df = pd.read_csv(score_file, sep=\" \", header=None)\n", "print(score_file)\n", "pred_df.columns = [\"utt\", \"spoof\", \"score\"]\n", @@ -2011,16 +1997,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "eer: 0.22422491216022286\tthreshold: -0.7054389119148254\n", + "eer: 0.254385417374277\tthreshold: -0.7286882996559143\n", "\n", - "0.0022422491216022287\n", + "0.00254385417374277\n", "\n", "\n" ] @@ -2062,10 +2048,76 @@ " return eer_cm\n", "\n", "\n", - "score_file = \"/home/hungdx/large_corpus_eval_xlsr_conformertcm_train_large_corpus_multiview_conf-1.txt\"\n", + "score_file = \"/home/hungdx/large_corpus_xlsr_conformertcm_train_large_corpus_multiview_conf-1_lts_2s.txt\"\n", "print(eval_to_score_file(score_file, \"/data/Datasets/0_large-corpus/protocol.txt\"))\n", "print(\"\\n\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLAAD_v5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## config1 + Nov " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/hungdx/mlaad_v5_xlsr_conformertcm_train_large_corpus_multiview_conf-1_lts_4s.txt\n", + "Accuracy: 95.18\n", + "\n", + "\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score, precision_score, det_curve\n", + "full_df = pd.read_csv(\n", + " \"/nvme1/hungdx/Lightning-hydra/data/mlaad_v5/mlaad_v5_protocol.txt\", sep=\" \", header=None)\n", + "\n", + "full_df.columns = [\"utt\", \"subset\", \"label\"]\n", + "\n", + "# BASE_DIR = \"/nvme2/hungdx/tcm_add/Scores/CNSL_Multiview/ConformerTCM_LargeCorpus_eval_MLAAD\"\n", + "# list_dir = os.listdir(\n", + "# BASE_DIR)\n", + "\n", + "# list_dir = [os.path.join(BASE_DIR, x) for x in list_dir]\n", + "# list_dir = sorted(list_dir)\n", + "\n", + "# for score_file in list_dir:\n", + "score_file = \"/home/hungdx/mlaad_v5_xlsr_conformertcm_train_large_corpus_multiview_conf-1_lts_4s.txt\"\n", + "pred_df = 
pd.read_csv(score_file, sep=\" \", header=None)\n", + "print(score_file)\n", + "pred_df.columns = [\"utt\", \"spoof\", \"score\"]\n", + "pred_df = pred_df.drop_duplicates(subset=['utt'])\n", + "\n", + "# pred_df['utt'] = pred_df['utt'].apply(lambda x: x.split('/')[-1].split('.')[0])\n", + "\n", + "# if spoof < score, then bonafide, else spoof\n", + "pred_df['pred'] = pred_df.apply(\n", + " lambda x: 'bonafide' if x['spoof'] < x['score'] else 'spoof', axis=1)\n", + "\n", + "# merge eval_df and pred_df on utt\n", + "res_df = pd.merge(full_df, pred_df, on='utt')\n", + "\n", + "print(\"Accuracy: {:.2f}\".format(\n", + " accuracy_score(res_df[\"label\"], res_df[\"pred\"])*100))\n", + "print(\"\\n\")" + ] } ], "metadata": { diff --git a/notebooks/prepare_dataset.ipynb b/notebooks/prepare_dataset.ipynb index cd305f8..8617438 100644 --- a/notebooks/prepare_dataset.ipynb +++ b/notebooks/prepare_dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -10,7 +10,7 @@ "output_type": "stream", "text": [ "Reading protocol file...\n", - "Processing 408572 files using 8 workers...\n" + "Processing 248159 files using 8 workers...\n" ] }, { @@ -19,7 +19,7 @@ "text": [ "/home/hungdx/miniconda3/lib/python3.11/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n", " return bound(*args, **kwds)\n", - "Calculating durations: 100%|██████████| 8/8 [00:49<00:00, 6.20s/it]\n" + "Calculating durations: 100%|██████████| 8/8 [00:02<00:00, 2.93it/s]\n" ] }, { @@ -30,10 +30,10 @@ "Saving results...\n", "\n", "Processing Statistics:\n", - "Total files processed: 408572\n", + "Total files processed: 248159\n", "Failed files: 0\n", - "Total duration: 671.90 hours\n", - "Mean duration: 5.92 seconds\n", + "Total duration: 433.71 hours\n", + "Mean duration: 6.29 seconds\n", "Min duration: 0.08 seconds\n", "Max duration: 26.20 seconds\n", "\n", @@ -148,8 +148,8 @@ "\n", "# Usage\n", "BASE_DIR = \"/data/hungdx/Lightning-hydra/data/0_large-corpus\"\n", - "protocol_file = \"new_protocol_trim_vocoded.txt\"\n", - "output_file = \"audio_durations.csv\"\n", + "protocol_file = \"new_protocol_trim_vocoded_v2.txt\"\n", + "output_file = \"audio_durations_v2.csv\"\n", "\n", "# Run the processing\n", "df_results, stats = calculate_durations(\n", @@ -166,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -175,13 +175,13 @@ "text": [ "\n", "Short audio files (duration < 1 second):\n", - "2993\n" + "436\n" ] } ], "source": [ - "# Filter audio files with duration less than 1 second\n", - "short_files = df_results[df_results['duration'] < 1]\n", + "# Filter audio files with duration less than 1 second with subset == train\n", + "short_files = df_results[df_results['duration'] < 1 & (df_results['subset'] == 'train')]\n", "print(\"\\nShort audio files (duration < 1 second):\")\n", "print(len(short_files))" ] @@ -195,14 +195,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "df_results = df_results[df_results['duration'] >= 1]\n", + "# remove all files with duration < 1 second and subset == train\n", + "df_results = df_results[~((df_results['duration'] < 1) & (df_results['subset'] == 'train'))]\n", + "\n", "# drop duration column and save\n", "df_results.drop(columns=['duration'], 
inplace=True)\n", - "df_results.to_csv(\"new_protocol_trim_vocoded_cleaned.txt\", index=False, header=False, sep=\" \")" + "df_results.to_csv(\"new_protocol_trim_vocoded_cleaned_v2.txt\", index=False, header=False, sep=\" \")" ] }, { @@ -231,6 +233,159 @@ "\n", "print(\"After removing vocoded files:\", len(df))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLAAD" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"~/MLAAD/protocol.txt\", sep=\" \", header=None)\n", + "df.columns = [\"utt_id\", \"subset\", \"unk\", \"label\"]\n", + "\n", + "# drop unk column\n", + "df.drop(columns=['unk'], inplace=True)\n", + "\n", + "df.to_csv(\"~/MLAAD/protocol_lts.txt\", index=False, header=False, sep=\" \")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before removing vocoded files: 405579\n", + "After removing vocoded files: 390512\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"new_protocol_trim_vocoded_cleaned.txt\", sep=\" \", header=None)\n", + "df.columns = [\"utt_id\", \"subset\", \"label\"]\n", + "\n", + "print(\"Before removing vocoded files:\", len(df))\n", + "\n", + "df = df[~((df[\"utt_id\"].str.startswith(\"vocoded\")) & (df[\"subset\"] == \"dev\"))]\n", + "\n", + "print(\"After removing vocoded files:\", len(df))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLAAD V5" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created protocol.txt with 152388 entries\n", + "Created combined_meta.csv with 152388 entries\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import csv\n", + "\n", + "\n", + "def read_csv_safely(file_path):\n", + " try:\n", + " # First attempt with standard reading\n", + " return pd.read_csv(file_path, sep='|')\n", + " except:\n", + " try:\n", + " # Second attempt with quote character handling\n", + " return pd.read_csv(file_path, sep='|', quoting=csv.QUOTE_NONE, escapechar='\\\\')\n", + " except:\n", + " try:\n", + " # Third attempt with error handling\n", + " return pd.read_csv(file_path, sep='|', on_bad_lines='skip', quoting=csv.QUOTE_NONE)\n", + " except Exception as e:\n", + " print(f\"Error reading file {file_path}: {str(e)}\")\n", + " return None\n", + "\n", + "\n", + "def create_protocol_and_merge_meta(root_dir):\n", + " # Initialize empty list to store all metadata\n", + " all_meta_data = []\n", + "\n", + " # Initialize protocol file\n", + " protocol_lines = []\n", + "\n", + " # Walk through the directory structure\n", + " for language_dir in os.listdir(root_dir):\n", + " language_path = os.path.join(root_dir, language_dir)\n", + " if not os.path.isdir(language_path):\n", + " continue\n", + "\n", + " # For each model directory in the language directory\n", + " for model_dir in os.listdir(language_path):\n", + " model_path = os.path.join(language_path, model_dir)\n", + " if not os.path.isdir(model_path):\n", + " continue\n", + "\n", + " # Look for meta.csv file\n", + " meta_file = os.path.join(model_path, 'meta.csv')\n", + " if os.path.exists(meta_file):\n", + " # Read meta.csv with safe reading function\n", + " df = read_csv_safely(meta_file)\n", + "\n", + " if df is not 
None:\n", + " # Add to combined metadata\n", + " all_meta_data.append(df)\n", + "\n", + " # Create protocol lines for all audio files in this directory\n", + " for audio_path in df['path']:\n", + " protocol_lines.append(f\"{audio_path} eval spoof\")\n", + "\n", + " # Combine all metadata\n", + " if all_meta_data:\n", + " combined_meta = pd.concat(all_meta_data, ignore_index=True)\n", + "\n", + " # Save combined metadata\n", + " combined_meta.to_csv('mlaad_v5_combined_meta.csv', sep='|',\n", + " index=False, quoting=csv.QUOTE_NONE, escapechar='\\\\')\n", + "\n", + " # Save protocol file\n", + " with open('mlaad_v5_protocol.txt', 'w') as f:\n", + " f.write('\\n'.join(protocol_lines))\n", + "\n", + " print(f\"Created protocol.txt with {len(protocol_lines)} entries\")\n", + " print(f\"Created combined_meta.csv with {len(combined_meta)} entries\")\n", + " else:\n", + " print(\"No meta.csv files found or all files had errors\")\n", + "\n", + "\n", + "# Run the function\n", + "if __name__ == \"__main__\":\n", + " create_protocol_and_merge_meta(\n", + " '/nvme1/hungdx/Lightning-hydra/data/mlaad_v5/fake')" + ] } ], "metadata": { diff --git a/src/data/components/augwrapper.py b/src/data/components/augwrapper.py index e959302..667a64d 100644 --- a/src/data/components/augwrapper.py +++ b/src/data/components/augwrapper.py @@ -14,7 +14,7 @@ SUPPORTED_AUGMENTATION = [ 'background_noise_5_15', 'pitch_1', 'volume_10', 'reverb_1', 'speed_01', 'telephone_g722', 'gaussian_1', 'gaussian_2', 'gaussian_2_5', 'gaussian_3', - 'RawBoostdf', 'RawBoost12', 'copy_paste_80', 'copy_paste_r', 'time_masking', 'masking', 'time_swap', + 'RawBoostdf', 'RawBoost12', 'RawBoostFull', 'copy_paste_80', 'copy_paste_r', 'time_masking', 'masking', 'time_swap', 'freq_swap', 'swapping', 'frequency_masking', 'linear_filter', 'mp32flac', 'ogg2flac', 'nonspeechtrim', 'bandpass_0_4000', 'griffinlim_downsample', 'lowpass_hifigan_asvspoof5', 'lowpass_hifigan', 'librosa_downsample', 'none'] @@ -1054,6 +1054,24 @@ def RawBoost12(x, args, sr=16000, audio_path=None): return waveform +def RawBoostFull(x, args, sr=16000, audio_path=None): + aug_dir = args.aug_dir + utt_id = os.path.basename(audio_path).split('.')[0] + aug_audio_path = os.path.join(aug_dir, 'RawBoostFull', utt_id + '.wav') + if args.online_aug: + return process_Rawboost_feature(x, sr, args, algo=4) + else: + # check if the augmented file exists + if (os.path.exists(aug_audio_path)): + waveform, _ = librosa.load(aug_audio_path, sr=sr, mono=True) + return waveform + else: + waveform = process_Rawboost_feature(x, sr, args, algo=5) + # save the augmented file,waveform in np array + sf.write(aug_audio_path, waveform, sr, subtype='PCM_16') + return waveform + + def RawBoostdf(x, args, sr=16000, audio_path=None): aug_dir = args.aug_dir utt_id = os.path.basename(audio_path).split('.')[0] diff --git a/src/data/normal_multiview_datamodule.py b/src/data/normal_multiview_datamodule.py index 093e41d..64d5c37 100644 --- a/src/data/normal_multiview_datamodule.py +++ b/src/data/normal_multiview_datamodule.py @@ -89,10 +89,10 @@ def __init__(self, args, list_IDs, labels, base_dir, algo=5, vocoders=[], trim_length, wav_samp_rate, noise_path, rir_path, aug_dir, online_aug, repeat_pad, is_train, random_start) self.enable_chunking = enable_chunking - if repeat_pad: - self.padding_type = "repeat" - else: - self.padding_type = "zero" + self.padding_type = "repeat" if repeat_pad else "zero" + + print("trim_length:", trim_length) + print("padding_type:", self.padding_type) def __getitem__(self, idx): 
         utt_id = self.list_IDs[idx]
@@ -263,7 +263,7 @@ def setup(self, stage: Optional[str] = None) -> None:
                                            base_dir=self.data_dir+'/', is_train=False, **self.args['data'])
 
         self.data_test = Dataset_for_eval(self.args, list_IDs=file_eval, labels=None,
-                                          base_dir=self.data_dir+'/', **self.args['data'])
+                                          base_dir=self.data_dir+'/', random_start=self.args.random_start, trim_length=self.args.trim_length, repeat_pad=(self.args.padding_type == 'repeat'))
 
     def train_dataloader(self) -> DataLoader[Any]:
         """Create and return the train dataloader.
@@ -367,9 +367,9 @@ def genList(self, is_train=False, is_eval=False, is_dev=False):
             l_meta = f.readlines()
             for line in l_meta:
                 utt, subset, label = line.strip().split()
-                if subset == 'eval':
-                    file_list.append(utt)
-                    d_meta[utt] = 1 if label == 'bonafide' else 0
+                # Keep every subset (train/dev/eval); previously only 'eval' entries were scored.
+                file_list.append(utt)
+                d_meta[utt] = 1 if label == 'bonafide' else 0
                 # return d_meta, file_list
         return d_meta, file_list
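
Note on the eval-time chunking settings introduced in this patch: with
wav_samp_rate 16000, the new trim_length of 64000 samples corresponds to
4 seconds of audio, and padding_type selects how shorter clips are brought
up to that length ("repeat" tiles the waveform, "zero" appends silence).
The sketch below illustrates those semantics only; the helper name
pad_or_trim and the exact crop/tile details are assumptions for this
example, not the repository's actual Dataset_for_eval implementation.

    import numpy as np

    def pad_or_trim(x, trim_length=64000, padding_type="repeat", random_start=False):
        """Crop long clips to trim_length samples; pad short ones by repeating or with zeros."""
        n = len(x)
        if n >= trim_length:
            # Crop from a random offset when random_start is set, else from the beginning.
            start = np.random.randint(0, n - trim_length + 1) if random_start else 0
            return x[start:start + trim_length]
        if padding_type == "repeat":
            reps = -(-trim_length // n)  # ceiling division
            return np.tile(x, reps)[:trim_length]
        # padding_type == "zero": append silence.
        return np.pad(x, (0, trim_length - n))

    # With the config values above, every clip becomes exactly 4 s long.
    clip = np.random.randn(16000).astype(np.float32)  # 1 s of dummy audio
    assert pad_or_trim(clip, padding_type="repeat").shape == (64000,)
    assert pad_or_trim(clip, padding_type="zero").shape == (64000,)

Repeat padding keeps the padded region statistically similar to real speech,
which is presumably why the experiment config pairs padding_type: repeat with
random_start: True rather than zero-padding short evaluation clips.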