Russian Datasets Added #7

Open · wants to merge 5 commits into base: hw_asr_2022
5 changes: 5 additions & 0 deletions hw_asr/datasets/__init__.py
@@ -1,13 +1,18 @@
from hw_asr.datasets.common_voice import CommonVoiceDataset
from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset
from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset
from hw_asr.datasets.librispeech_dataset import LibrispeechDataset
from hw_asr.datasets.ljspeech_dataset import LJspeechDataset
from hw_asr.datasets.ru_commonvoice_dataset import RuCommonVoiceDataset
from hw_asr.datasets.ru_golos_dataset import GolosDataset

__all__ = [
    "LibrispeechDataset",
    "CustomDirAudioDataset",
    "CustomAudioDataset",
    "LJspeechDataset",
    "RuCommonVoiceDataset",
    "GolosDataset",
    "CommonVoiceDataset",
]
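For context: classes exported from hw_asr.datasets are picked up by the template's config-driven dataset construction. A hedged sketch of what dataset specs for the new classes might look like (the {"type": ..., "args": ...} schema is an assumption about the template, not part of this diff; the argument names mirror the constructors below):

dataset_specs = [
    {"type": "RuCommonVoiceDataset", "args": {"part": "train", "use_vad": True}},
    {"type": "GolosDataset", "args": {"part": "train", "names": ["crowd7", "crowd8", "crowd9"]}},
]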
2 changes: 1 addition & 1 deletion hw_asr/datasets/ljspeech_dataset.py
@@ -81,7 +81,7 @@ def _create_index(self, part):
                w_id = line.split('|')[0]
                w_text = " ".join(line.split('|')[1:]).strip()
                wav_path = wav_dir / f"{w_id}.wav"
-               if not wav_path.exists():  # elem in another part
+               if not wav_path.exists():  # elem is in another part
                    continue
                t_info = torchaudio.info(str(wav_path))
                length = t_info.num_frames / t_info.sample_rate
126 changes: 126 additions & 0 deletions hw_asr/datasets/ru_commonvoice_dataset.py
@@ -0,0 +1,126 @@
import concurrent.futures as cf
import json
import logging
import os
import shutil
from pathlib import Path

import pandas as pd
import torch
import torchaudio
from hw_asr.base.base_dataset import BaseDataset
from hw_asr.utils import ROOT_PATH
from tqdm import tqdm

logger = logging.getLogger(__name__)


class RuCommonVoiceDataset(BaseDataset):
    def __init__(self, part, data_dir=None, use_vad=False, *args, **kwargs):
        """
        :param part: which part of the dataset to use
        :param data_dir: Path object with the path to the data folder
        :param use_vad: whether to preprocess all audios with a Voice Activity Detector
            in order to cut silence at the beginning and end of the audio
        """
        if data_dir is None:
            data_dir = ROOT_PATH / "data" / "datasets" / "ru_commonvoice"
            data_dir.mkdir(exist_ok=True, parents=True)
        self._data_dir = data_dir
        index = self._get_or_load_index(part, use_vad)

        super().__init__(index, *args, **kwargs)

    def _load_part(self, part):
        df = pd.read_csv(str(self._data_dir / f'{part}.tsv'), sep='\t')
        for _, row in df.iterrows():
            f_name = row['path']
            file_path = self._data_dir / 'clips' / f_name
            shutil.move(str(file_path), str(self._data_dir / part / f_name))

    def _load_dataset(self):
        arch_path = self._data_dir / "cv-corpus-11.0-2022-09-21-ru.tar.gz"

        # downloading by URL is not supported because the download page requires email confirmation
        assert arch_path.exists(), "please download RU Common Voice 11.0 from the official website"
        print("Loading RU Common Voice 11.0")

        shutil.unpack_archive(arch_path, self._data_dir)
        for fpath in (self._data_dir / "cv-corpus-11.0-2022-09-21/ru").iterdir():
            shutil.move(str(fpath), str(self._data_dir / fpath.name))
        os.remove(str(arch_path))
        shutil.rmtree(str(self._data_dir / "cv-corpus-11.0-2022-09-21"))

        (self._data_dir / "train").mkdir(exist_ok=True, parents=True)
        (self._data_dir / "dev").mkdir(exist_ok=True, parents=True)
        (self._data_dir / "test").mkdir(exist_ok=True, parents=True)

        self._load_part("train")
        self._load_part("dev")
        self._load_part("test")

        shutil.rmtree(str(self._data_dir / "clips"))

    def _get_or_load_index(self, part, use_vad):
        if use_vad:
            index_path = self._data_dir / f"{part}_vad_index.json"
        else:
            index_path = self._data_dir / f"{part}_index.json"
        if index_path.exists():
            with index_path.open() as f:
                index = json.load(f)
        else:
            index = self._create_index(part, use_vad)
            with index_path.open("w") as f:
                json.dump(index, f, indent=2)
        return index

    def _create_index(self, part, use_vad):
        index = []
        split_dir = self._data_dir / part
        if not split_dir.exists():
            self._load_dataset()

        mp3_dirs = set()
        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
            if any(f.endswith(".mp3") for f in filenames):
                mp3_dirs.add(dirpath)
        for mp3_dir in tqdm(
                list(mp3_dirs), desc=f"Preparing ru common voice folders: {part}"
        ):
            torchaudio.set_audio_backend('sox_io')
            mp3_dir = Path(mp3_dir)
            trans_path = self._data_dir / f"{part}.tsv"
            df = pd.read_csv(trans_path, sep='\t')
            with cf.ThreadPoolExecutor(max_workers=100) as executor:
                future_to_dict = {
                    executor.submit(add_to_index, mp3_dir, row, use_vad): row
                    for _, row in df.iterrows()
                }
                for future in cf.as_completed(future_to_dict):
                    index.append(future.result())
        return index


def add_to_index(mp3_dir, row, use_vad):
    m_id = row['path']
    m_text = row['sentence'].strip()
    mp3_path = mp3_dir / m_id
    if use_vad:
        audio_tensor, sr = torchaudio.load(str(mp3_path))
        # Common Voice has too much noise and silence at the start and end
        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15)  # cut leading silence
        audio_tensor = torch.flip(audio_tensor, [0, 1])
        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15)  # cut ending silence
        audio_tensor = torch.flip(audio_tensor, [0, 1])
        mp3_path = Path(str(mp3_path)[:-4] + "_vad.mp3")
        torchaudio.save(str(mp3_path), audio_tensor, sr)

    t_info = torchaudio.info(str(mp3_path))
    length = t_info.num_frames / t_info.sample_rate
    res_dict = {
        "path": str(mp3_path.absolute().resolve()),
        "text": m_text.lower(),
        "audio_len": length,
    }
    return res_dict
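A note for reviewers on the use_vad branch above: torchaudio.functional.vad only trims silence from the beginning of a waveform, which is why the tensor is reversed, passed through VAD a second time, and reversed back to trim the tail as well. A minimal standalone sketch of the same trick (the file path and pre_trigger_time value are illustrative):

import torch
import torchaudio

def trim_silence(path, pre_trigger_time=0.15):
    waveform, sr = torchaudio.load(path)
    # VAD removes leading silence only ...
    waveform = torchaudio.functional.vad(waveform, sr, pre_trigger_time=pre_trigger_time)
    # ... so reverse the time axis, trim the (now leading) tail, and reverse back
    waveform = torch.flip(waveform, [1])
    waveform = torchaudio.functional.vad(waveform, sr, pre_trigger_time=pre_trigger_time)
    return torch.flip(waveform, [1])

Flipping only dim 1 (time) is sufficient; the diff flips dims [0, 1], which additionally reverses the channel order and is a no-op for mono Common Voice clips.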
123 changes: 123 additions & 0 deletions hw_asr/datasets/ru_golos_dataset.py
@@ -0,0 +1,123 @@
import json
import logging
import os
import shutil
from pathlib import Path

import jsonlines
import torchaudio
from hw_asr.base.base_dataset import BaseDataset
from hw_asr.utils import ROOT_PATH
from speechbrain.utils.data_utils import download_file
from tqdm import tqdm

logger = logging.getLogger(__name__)

URL_LINKS = {
    "farfield": "https://sc.link/1Z3",
    "train_1": "https://sc.link/MvQ",
    "train_2": "https://sc.link/NwL",
    "train_3": "https://sc.link/Oxg",
    "train_4": "https://sc.link/Pyz",
    "train_5": "https://sc.link/Qz7",
    "train_6": "https://sc.link/RAL",
    "train_7": "https://sc.link/VG5",
    "train_8": "https://sc.link/WJW",
    "train_9": "https://sc.link/XKk",
}


class GolosDataset(BaseDataset):
    def __init__(self, part, names=("crowd7", "crowd8", "crowd9"), data_dir=None, *args, **kwargs):
        """
        :param part: which part of the dataset to use (only train is supported)
        :param names: which parts of the train split to use (crowd{i} or farfield);
            crowd0 is not supported
        :param data_dir: Path object with the path to the data folder
        """
        if data_dir is None:
            data_dir = ROOT_PATH / "data" / "datasets" / "ru_golos"
            data_dir.mkdir(exist_ok=True, parents=True)
        self._data_dir = data_dir
        index = self._get_or_load_index(part, names)

        super().__init__(index, *args, **kwargs)

    def _load_dataset(self, name):
        print(f"Loading GOLOS_{name}")

        if (self._data_dir / "train" / name).exists():
            return
        if (self._data_dir / "train" / "crowd" / f"{name[-1]}").exists():
            return

        if name == "farfield":
            url_name = name
        else:
            url_name = f"train_{name[-1]}"

        arch_path = self._data_dir / f"{url_name}.tar"
        if not arch_path.exists():
            download_file(URL_LINKS[url_name], arch_path)
        shutil.unpack_archive(arch_path, self._data_dir)
        if name[-1] == "9":
            shutil.move(str(self._data_dir / "train" / "manifest.jsonl"),
                        str(self._data_dir / "manifest.jsonl"))
        os.remove(str(arch_path))

    def _get_or_load_index(self, part, names):
        index_path = self._data_dir / f"{part}_{'_'.join(names)}_index.json"
        if index_path.exists():
            with index_path.open() as f:
                index = json.load(f)
        else:
            index = self._create_index(part, names)
            with index_path.open("w") as f:
                json.dump(index, f, indent=2)
        return index

    def _create_index(self, part, names):
        index = []
        split_dir = self._data_dir / part
        for name in names:
            if name == "farfield":
                if not (split_dir / name).exists():
                    self._load_dataset(name)
            elif not (split_dir / "crowd" / f"{name[-1]}").exists():
                self._load_dataset(name)

        wav_dirs = set()
        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
            if any(f.endswith(".wav") for f in filenames):
                wav_dirs.add(dirpath)
        for wav_dir in tqdm(
                list(wav_dirs), desc=f"Preparing golos folders: {part}"
        ):
            wav_dir = Path(wav_dir)
            trans_path = self._data_dir / "manifest.jsonl"
            assert trans_path.exists(), "download crowd9 first"
            with jsonlines.open(str(trans_path)) as reader:
                for obj in reader.iter(type=dict):
                    if "farfield" not in str(wav_dir):
                        path_check = f"crowd/{str(wav_dir)[-1]}"
                        if f"crowd{str(wav_dir)[-1]}" not in names:
                            continue
                    else:
                        path_check = "farfield"
                        if "farfield" not in names:
                            continue
                    if path_check not in obj["audio_filepath"]:
                        continue
                    w_id = obj['id'] + ".wav"
                    w_text = obj['text'].strip()
                    wav_path = wav_dir / w_id
                    t_info = torchaudio.info(str(wav_path))
                    length = t_info.num_frames / t_info.sample_rate
                    index.append(
                        {
                            "path": str(wav_path.absolute().resolve()),
                            "text": w_text.lower(),
                            "audio_len": length,
                        }
                    )
        return index
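The index build above is driven by manifest.jsonl, which ships inside the crowd9 archive (hence the assert). A short sketch of reading it with the jsonlines package added to requirements below, assuming each record carries the id, text, and audio_filepath keys used in _create_index (the path is illustrative):

import jsonlines

with jsonlines.open("data/datasets/ru_golos/manifest.jsonl") as reader:
    for obj in reader.iter(type=dict):
        # keep only utterances from the crowd/7 part, mirroring the filter above
        if "crowd/7" not in obj["audio_filepath"]:
            continue
        print(obj["id"], obj["text"].strip().lower())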
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,3 +14,4 @@
wandb
pyctcdecode
torchaudio~=0.11.0
pillow
jsonlines