diff --git a/rvc/embedders/.gitignore b/rvc/embedders/.gitignore deleted file mode 100644 index 8b13789..0000000 --- a/rvc/embedders/.gitignore +++ /dev/null @@ -1 +0,0 @@ - diff --git a/rvc/lib/FCPEF0Predictor.py b/rvc/lib/FCPEF0Predictor.py deleted file mode 100644 index 02c7519..0000000 --- a/rvc/lib/FCPEF0Predictor.py +++ /dev/null @@ -1,1036 +0,0 @@ -from typing import Union - -import torch.nn.functional as F -import numpy as np -import torch -import torch.nn as nn -from torch.nn.utils.parametrizations import weight_norm -from torchaudio.transforms import Resample -import os -import librosa -import soundfile as sf -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn -import math -from functools import partial - -from einops import rearrange, repeat -from local_attention import LocalAttention -from torch import nn - -os.environ["LRU_CACHE_CAPACITY"] = "3" - - -def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): - sample_rate = None - try: - data, sample_rate = sf.read(full_path, always_2d=True) # read audio with soundfile - except Exception as error: - print(f"'{full_path}' failed to load with {error}") - if return_empty_on_exception: - return [], sample_rate or target_sr or 48000 - else: - raise Exception(error) - - if len(data.shape) > 1: - data = data[:, 0] - assert ( - len(data) > 2 - ) # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension) - - if np.issubdtype(data.dtype, np.integer): # if audio data is type int - max_mag = -np.iinfo( - data.dtype - ).min # maximum magnitude = min possible value of intXX - else: # if audio data is type fp32 - max_mag = max(np.amax(data), -np.amin(data)) - max_mag = ( - (2**31) + 1 - if max_mag > (2**15) - else ((2**15) + 1 if max_mag > 1.01 else 1.0) - ) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32 - - data = torch.FloatTensor(data.astype(np.float32)) / max_mag - - if ( - torch.isinf(data) | torch.isnan(data) - ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs.
return_empty_on_exception will return empty arr instead of except - return [], sample_rate or target_sr or 48000 - if target_sr is not None and sample_rate != target_sr: - data = torch.from_numpy( - librosa.core.resample( - data.numpy(), orig_sr=sample_rate, target_sr=target_sr - ) - ) - sample_rate = target_sr - - return data, sample_rate - - -def dynamic_range_compression(x, C=1, clip_val=1e-5): - return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) - - -def dynamic_range_decompression(x, C=1): - return np.exp(x) / C - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - return torch.exp(x) / C - - -class STFT: - def __init__( - self, - sr=22050, - n_mels=80, - n_fft=1024, - win_size=1024, - hop_length=256, - fmin=20, - fmax=11025, - clip_val=1e-5, - ): - self.target_sr = sr - - self.n_mels = n_mels - self.n_fft = n_fft - self.win_size = win_size - self.hop_length = hop_length - self.fmin = fmin - self.fmax = fmax - self.clip_val = clip_val - self.mel_basis = {} - self.hann_window = {} - - def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): - sample_rate = self.target_sr - n_mels = self.n_mels - n_fft = self.n_fft - win_size = self.win_size - hop_length = self.hop_length - fmin = self.fmin - fmax = self.fmax - clip_val = self.clip_val - - factor = 2 ** (keyshift / 12) - n_fft_new = int(np.round(n_fft * factor)) - win_size_new = int(np.round(win_size * factor)) - hop_length_new = int(np.round(hop_length * speed)) - if not train: - mel_basis = self.mel_basis - hann_window = self.hann_window - else: - mel_basis = {} - hann_window = {} - - mel_basis_key = str(fmax) + "_" + str(y.device) - if mel_basis_key not in mel_basis: - mel = librosa_mel_fn( - sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax - ) - mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) - - keyshift_key = str(keyshift) + "_" + str(y.device) - if keyshift_key not in hann_window: - hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) - - pad_left = (win_size_new - hop_length_new) // 2 - pad_right = max( - (win_size_new - hop_length_new + 1) // 2, - win_size_new - y.size(-1) - pad_left, - ) - if pad_right < y.size(-1): - mode = "reflect" - else: - mode = "constant" - y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) - y = y.squeeze(1) - - spec = torch.stft( - y, - n_fft_new, - hop_length=hop_length_new, - win_length=win_size_new, - window=hann_window[keyshift_key], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) - if keyshift != 0: - size = n_fft // 2 + 1 - resize = spec.size(1) - if resize < size: - spec = F.pad(spec, (0, 0, 0, size - resize)) - spec = spec[:, :size, :] * win_size / win_size_new - spec = torch.matmul(mel_basis[mel_basis_key], spec) - spec = dynamic_range_compression_torch(spec, clip_val=clip_val) - return spec - - def __call__(self, audiopath): - audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) - spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) - return spect - - -stft = STFT() - -# import fast_transformers.causal_product.causal_product_cuda - - -def softmax_kernel( - data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None -): - b, h, *_ = data.shape - # (batch size, head, length, model_dim) - - # normalize model dim - 
data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 - - # ratio = 1/sqrt(num_random_features), projection_matrix.shape[0] --> 266 - - ratio = projection_matrix.shape[0] ** -0.5 - - projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) - projection = projection.type_as(data) - - # data_dash = w^T x - data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) - - # diag_data = D**2 - diag_data = data**2 - diag_data = torch.sum(diag_data, dim=-1) - diag_data = (diag_data / 2.0) * (data_normalizer**2) - diag_data = diag_data.unsqueeze(dim=-1) - - if is_query: - data_dash = ratio * ( - torch.exp( - data_dash - - diag_data - - torch.max(data_dash, dim=-1, keepdim=True).values - ) - + eps - ) - else: - data_dash = ratio * ( - torch.exp(data_dash - diag_data + eps) - ) # - torch.max(data_dash)) + eps) - - return data_dash.type_as(data) - - -def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): - unstructured_block = torch.randn((cols, cols), device=device) - q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") - q, r = map(lambda t: t.to(device), (q, r)) - - # proposed by @Parskatt - # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf - if qr_uniform_q: - d = torch.diag(r, 0) - q *= d.sign() - return q.t() - - -def exists(val): - return val is not None - - -def empty(tensor): - return tensor.numel() == 0 - - -def default(val, d): - return val if exists(val) else d - - -def cast_tuple(val): - return (val,) if not isinstance(val, tuple) else val - - -class PCmer(nn.Module): - """The encoder that is used in the Transformer model.""" - - def __init__( - self, - num_layers, - num_heads, - dim_model, - dim_keys, - dim_values, - residual_dropout, - attention_dropout, - ): - super().__init__() - self.num_layers = num_layers - self.num_heads = num_heads - self.dim_model = dim_model - self.dim_values = dim_values - self.dim_keys = dim_keys - self.residual_dropout = residual_dropout - self.attention_dropout = attention_dropout - - self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)]) - - # METHODS ######################################################################################################## - - def forward(self, phone, mask=None): - - # apply all layers to the input - for i, layer in enumerate(self._layers): - phone = layer(phone, mask) - # provide the final sequence - return phone - - -# ==================================================================================================================== # -# CLASS _ E N C O D E R L A Y E R # -# ==================================================================================================================== # - - -class _EncoderLayer(nn.Module): - """One layer of the encoder. - - Attributes: - attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence. - feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism. - """ - - def __init__(self, parent: PCmer): - """Creates a new instance of ``_EncoderLayer``. - - Args: - parent (Encoder): The encoder that the layer is created for. - """ - super().__init__() - - self.conformer = ConformerConvModule(parent.dim_model) - self.norm = nn.LayerNorm(parent.dim_model) - self.dropout = nn.Dropout(parent.residual_dropout) - - # selfatt -> fastatt: performer!
- self.attn = SelfAttention( - dim=parent.dim_model, heads=parent.num_heads, causal=False - ) - - # METHODS ######################################################################################################## - - def forward(self, phone, mask=None): - - # compute attention sub-layer - phone = phone + (self.attn(self.norm(phone), mask=mask)) - - phone = phone + (self.conformer(phone)) - - return phone - - -def calc_same_padding(kernel_size): - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - -# helper classes - - -class Swish(nn.Module): - def forward(self, x): - return x * x.sigmoid() - - -class Transpose(nn.Module): - def __init__(self, dims): - super().__init__() - assert len(dims) == 2, "dims must be a tuple of two dimensions" - self.dims = dims - - def forward(self, x): - return x.transpose(*self.dims) - - -class GLU(nn.Module): - def __init__(self, dim): - super().__init__() - self.dim = dim - - def forward(self, x): - out, gate = x.chunk(2, dim=self.dim) - return out * gate.sigmoid() - - -class DepthWiseConv1d(nn.Module): - def __init__(self, chan_in, chan_out, kernel_size, padding): - super().__init__() - self.padding = padding - self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in) - - def forward(self, x): - x = F.pad(x, self.padding) - return self.conv(x) - - -class ConformerConvModule(nn.Module): - def __init__( - self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0 - ): - super().__init__() - - inner_dim = dim * expansion_factor - padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) - - self.net = nn.Sequential( - nn.LayerNorm(dim), - Transpose((1, 2)), - nn.Conv1d(dim, inner_dim * 2, 1), - GLU(dim=1), - DepthWiseConv1d( - inner_dim, inner_dim, kernel_size=kernel_size, padding=padding - ), - # nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(), - Swish(), - nn.Conv1d(inner_dim, dim, 1), - Transpose((1, 2)), - nn.Dropout(dropout), - ) - - def forward(self, x): - return self.net(x) - - -def linear_attention(q, k, v): - if v is None: - out = torch.einsum("...ed,...nd->...ne", k, q) - return out - - else: - k_cumsum = k.sum(dim=-2) - # k_cumsum = k.sum(dim = -2) - D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8) - - context = torch.einsum("...nd,...ne->...de", k, v) - out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv) - return out - - -def gaussian_orthogonal_random_matrix( - nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None -): - nb_full_blocks = int(nb_rows / nb_columns) - block_list = [] - - for _ in range(nb_full_blocks): - q = orthogonal_matrix_chunk( - nb_columns, qr_uniform_q=qr_uniform_q, device=device - ) - block_list.append(q) - - remaining_rows = nb_rows - nb_full_blocks * nb_columns - if remaining_rows > 0: - q = orthogonal_matrix_chunk( - nb_columns, qr_uniform_q=qr_uniform_q, device=device - ) - - block_list.append(q[:remaining_rows]) - - final_matrix = torch.cat(block_list) - - if scaling == 0: - multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) - elif scaling == 1: - multiplier = math.sqrt((float(nb_columns))) * torch.ones( - (nb_rows,), device=device - ) - else: - raise ValueError(f"Invalid scaling {scaling}") - - return torch.diag(multiplier) @ final_matrix - - -class FastAttention(nn.Module): - def __init__( - self, - dim_heads, - nb_features=None, - ortho_scaling=0, - causal=False, - generalized_attention=False, - kernel_fn=nn.ReLU(), - qr_uniform_q=False, - no_projection=False, - ): 
- super().__init__() - nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) - - self.dim_heads = dim_heads - self.nb_features = nb_features - self.ortho_scaling = ortho_scaling - - self.create_projection = partial( - gaussian_orthogonal_random_matrix, - nb_rows=self.nb_features, - nb_columns=dim_heads, - scaling=ortho_scaling, - qr_uniform_q=qr_uniform_q, - ) - projection_matrix = self.create_projection() - self.register_buffer("projection_matrix", projection_matrix) - - self.generalized_attention = generalized_attention - self.kernel_fn = kernel_fn - - # if this is turned on, no projection will be used - # queries and keys will be softmax-ed as in the original efficient attention paper - self.no_projection = no_projection - - self.causal = causal - - @torch.no_grad() - def redraw_projection_matrix(self): - projections = self.create_projection() - self.projection_matrix.copy_(projections) - del projections - - def forward(self, q, k, v): - device = q.device - - if self.no_projection: - q = q.softmax(dim=-1) - k = torch.exp(k) if self.causal else k.softmax(dim=-2) - else: - create_kernel = partial( - softmax_kernel, projection_matrix=self.projection_matrix, device=device - ) - - q = create_kernel(q, is_query=True) - k = create_kernel(k, is_query=False) - - attn_fn = linear_attention if not self.causal else self.causal_linear_fn - if v is None: - out = attn_fn(q, k, None) - return out - else: - out = attn_fn(q, k, v) - return out - - -class SelfAttention(nn.Module): - def __init__( - self, - dim, - causal=False, - heads=8, - dim_head=64, - local_heads=0, - local_window_size=256, - nb_features=None, - feature_redraw_interval=1000, - generalized_attention=False, - kernel_fn=nn.ReLU(), - qr_uniform_q=False, - dropout=0.0, - no_projection=False, - ): - super().__init__() - assert dim % heads == 0, "dimension must be divisible by number of heads" - dim_head = default(dim_head, dim // heads) - inner_dim = dim_head * heads - self.fast_attention = FastAttention( - dim_head, - nb_features, - causal=causal, - generalized_attention=generalized_attention, - kernel_fn=kernel_fn, - qr_uniform_q=qr_uniform_q, - no_projection=no_projection, - ) - - self.heads = heads - self.global_heads = heads - local_heads - self.local_attn = ( - LocalAttention( - window_size=local_window_size, - causal=causal, - autopad=True, - dropout=dropout, - look_forward=int(not causal), - rel_pos_emb_config=(dim_head, local_heads), - ) - if local_heads > 0 - else None - ) - - self.to_q = nn.Linear(dim, inner_dim) - self.to_k = nn.Linear(dim, inner_dim) - self.to_v = nn.Linear(dim, inner_dim) - self.to_out = nn.Linear(inner_dim, dim) - self.dropout = nn.Dropout(dropout) - - @torch.no_grad() - def redraw_projection_matrix(self): - self.fast_attention.redraw_projection_matrix() - - def forward( - self, - x, - context=None, - mask=None, - context_mask=None, - name=None, - inference=False, - **kwargs, - ): - _, _, _, h, gh = *x.shape, self.heads, self.global_heads - - cross_attend = exists(context) - - context = default(context, x) - context_mask = default(context_mask, mask) if not cross_attend else context_mask - q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) - - q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) - (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) - - attn_outs = [] - if not empty(q): - if exists(context_mask): - global_mask = context_mask[:, None, :, None] - v.masked_fill_(~global_mask, 0.0) - if cross_attend: - pass - else: - 
out = self.fast_attention(q, k, v) - attn_outs.append(out) - - if not empty(lq): - assert ( - not cross_attend - ), "local attention is not compatible with cross attention" - out = self.local_attn(lq, lk, lv, input_mask=mask) - attn_outs.append(out) - - out = torch.cat(attn_outs, dim=1) - out = rearrange(out, "b h n d -> b n (h d)") - out = self.to_out(out) - return self.dropout(out) - - -def l2_regularization(model, l2_alpha): - l2_loss = [] - for module in model.modules(): - if type(module) is nn.Conv2d: - l2_loss.append((module.weight**2).sum() / 2.0) - return l2_alpha * sum(l2_loss) - - -class FCPE(nn.Module): - def __init__( - self, - input_channel=128, - out_dims=360, - n_layers=12, - n_chans=512, - use_siren=False, - use_full=False, - loss_mse_scale=10, - loss_l2_regularization=False, - loss_l2_regularization_scale=1, - loss_grad1_mse=False, - loss_grad1_mse_scale=1, - f0_max=1975.5, - f0_min=32.70, - confidence=False, - threshold=0.05, - use_input_conv=True, - ): - super().__init__() - if use_siren is True: - raise ValueError("Siren is not supported yet.") - if use_full is True: - raise ValueError("Full model is not supported yet.") - - self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 - self.loss_l2_regularization = ( - loss_l2_regularization if (loss_l2_regularization is not None) else False - ) - self.loss_l2_regularization_scale = ( - loss_l2_regularization_scale - if (loss_l2_regularization_scale is not None) - else 1 - ) - self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False - self.loss_grad1_mse_scale = ( - loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 - ) - self.f0_max = f0_max if (f0_max is not None) else 1975.5 - self.f0_min = f0_min if (f0_min is not None) else 32.70 - self.confidence = confidence if (confidence is not None) else False - self.threshold = threshold if (threshold is not None) else 0.05 - self.use_input_conv = use_input_conv if (use_input_conv is not None) else True - - self.cent_table_b = torch.Tensor( - np.linspace( - self.f0_to_cent(torch.Tensor([f0_min]))[0], - self.f0_to_cent(torch.Tensor([f0_max]))[0], - out_dims, - ) - ) - self.register_buffer("cent_table", self.cent_table_b) - - # conv in stack - _leaky = nn.LeakyReLU() - self.stack = nn.Sequential( - nn.Conv1d(input_channel, n_chans, 3, 1, 1), - nn.GroupNorm(4, n_chans), - _leaky, - nn.Conv1d(n_chans, n_chans, 3, 1, 1), - ) - - # transformer - self.decoder = PCmer( - num_layers=n_layers, - num_heads=8, - dim_model=n_chans, - dim_keys=n_chans, - dim_values=n_chans, - residual_dropout=0.1, - attention_dropout=0.1, - ) - self.norm = nn.LayerNorm(n_chans) - - # out - self.n_out = out_dims - self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) - - def forward( - self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax" - ): - """ - input: - B x n_frames x n_unit - return: - dict of B x n_frames x feat - """ - if cdecoder == "argmax": - self.cdecoder = self.cents_decoder - elif cdecoder == "local_argmax": - self.cdecoder = self.cents_local_decoder - if self.use_input_conv: - x = self.stack(mel.transpose(1, 2)).transpose(1, 2) - else: - x = mel - x = self.decoder(x) - x = self.norm(x) - x = self.dense_out(x) # [B,N,D] - x = torch.sigmoid(x) - if not infer: - gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1] - gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim] - loss_all = self.loss_mse_scale * F.binary_cross_entropy( - x, gt_cent_f0 - ) # bce loss - # l2 regularization - if 
self.loss_l2_regularization: - loss_all = loss_all + l2_regularization( - model=self, l2_alpha=self.loss_l2_regularization_scale - ) - x = loss_all - if infer: - x = self.cdecoder(x) - x = self.cent_to_f0(x) - if not return_hz_f0: - x = (1 + x / 700).log() - return x - - def cents_decoder(self, y, mask=True): - B, N, _ = y.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum( - y, dim=-1, keepdim=True - ) # cents: [B,N,1] - if mask: - confident = torch.max(y, dim=-1, keepdim=True)[0] - confident_mask = torch.ones_like(confident) - confident_mask[confident <= self.threshold] = float("-INF") - rtn = rtn * confident_mask - if self.confidence: - return rtn, confident - else: - return rtn - - def cents_local_decoder(self, y, mask=True): - B, N, _ = y.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - confident, max_index = torch.max(y, dim=-1, keepdim=True) - local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4) - local_argmax_index[local_argmax_index < 0] = 0 - local_argmax_index[local_argmax_index >= self.n_out] = self.n_out - 1 - ci_l = torch.gather(ci, -1, local_argmax_index) - y_l = torch.gather(y, -1, local_argmax_index) - rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum( - y_l, dim=-1, keepdim=True - ) # cents: [B,N,1] - if mask: - confident_mask = torch.ones_like(confident) - confident_mask[confident <= self.threshold] = float("-INF") - rtn = rtn * confident_mask - if self.confidence: - return rtn, confident - else: - return rtn - - def cent_to_f0(self, cent): - return 10.0 * 2 ** (cent / 1200.0) - - def f0_to_cent(self, f0): - return 1200.0 * torch.log2(f0 / 10.0) - - def gaussian_blurred_cent(self, cents): # cents: [B,N,1] - mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0))) - B, N, _ = cents.size() - ci = self.cent_table[None, None, :].expand(B, N, -1) - return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() - - -class FCPEInfer: - def __init__(self, model_path, device=None, dtype=torch.float32): - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - ckpt = torch.load(model_path, map_location=torch.device(self.device)) - self.args = DotDict(ckpt["config"]) - self.dtype = dtype - model = FCPE( - input_channel=self.args.model.input_channel, - out_dims=self.args.model.out_dims, - n_layers=self.args.model.n_layers, - n_chans=self.args.model.n_chans, - use_siren=self.args.model.use_siren, - use_full=self.args.model.use_full, - loss_mse_scale=self.args.loss.loss_mse_scale, - loss_l2_regularization=self.args.loss.loss_l2_regularization, - loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, - loss_grad1_mse=self.args.loss.loss_grad1_mse, - loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, - f0_max=self.args.model.f0_max, - f0_min=self.args.model.f0_min, - confidence=self.args.model.confidence, - ) - model.to(self.device).to(self.dtype) - model.load_state_dict(ckpt["model"]) - model.eval() - self.model = model - self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) - - @torch.no_grad() - def __call__(self, audio, sr, threshold=0.05): - self.model.threshold = threshold - audio = audio[None, :] - mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) - f0 = self.model(mel=mel, infer=True, return_hz_f0=True) - return f0 - - -class Wav2Mel: - - def __init__(self, args, device=None, dtype=torch.float32): - # self.args = args - self.sample_rate = 
args.mel.sample_rate - self.hop_size = args.mel.hop_size - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - self.device = device - self.dtype = dtype - self.stft = STFT( - args.mel.sample_rate, - args.mel.num_mels, - args.mel.n_fft, - args.mel.win_size, - args.mel.hop_size, - args.mel.fmin, - args.mel.fmax, - ) - self.resample_kernel = {} - - def extract_nvstft(self, audio, keyshift=0, train=False): - mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose( - 1, 2 - ) # B, n_frames, bins - return mel - - def extract_mel(self, audio, sample_rate, keyshift=0, train=False): - audio = audio.to(self.dtype).to(self.device) - # resample - if sample_rate == self.sample_rate: - audio_res = audio - else: - key_str = str(sample_rate) - if key_str not in self.resample_kernel: - self.resample_kernel[key_str] = Resample( - sample_rate, self.sample_rate, lowpass_filter_width=128 - ) - self.resample_kernel[key_str] = ( - self.resample_kernel[key_str].to(self.dtype).to(self.device) - ) - audio_res = self.resample_kernel[key_str](audio) - - # extract - mel = self.extract_nvstft( - audio_res, keyshift=keyshift, train=train - ) # B, n_frames, bins - n_frames = int(audio.shape[1] // self.hop_size) + 1 - if n_frames > int(mel.shape[1]): - mel = torch.cat((mel, mel[:, -1:, :]), 1) - if n_frames < int(mel.shape[1]): - mel = mel[:, :n_frames, :] - return mel - - def __call__(self, audio, sample_rate, keyshift=0, train=False): - return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) - - -class DotDict(dict): - def __getattr__(*args): - val = dict.get(*args) - return DotDict(val) if type(val) is dict else val - - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - - -class F0Predictor(object): - def compute_f0(self, wav, p_len): - """ - input: wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length] - """ - pass - - def compute_f0_uv(self, wav, p_len): - """ - input: wav:[signal_length] - p_len:int - output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] - """ - pass - - -class FCPEF0Predictor(F0Predictor): - def __init__( - self, - model_path, - hop_length=512, - f0_min=50, - f0_max=1100, - dtype=torch.float32, - device=None, - sample_rate=44100, - threshold=0.05, - ): - self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) - self.hop_length = hop_length - self.f0_min = f0_min - self.f0_max = f0_max - if device is None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" - else: - self.device = device - self.threshold = threshold - self.sample_rate = sample_rate - self.dtype = dtype - self.name = "fcpe" - - def repeat_expand( - self, - content: Union[torch.Tensor, np.ndarray], - target_len: int, - mode: str = "nearest", - ): - ndim = content.ndim - - if content.ndim == 1: - content = content[None, None] - elif content.ndim == 2: - content = content[None] - - assert content.ndim == 3 - - is_np = isinstance(content, np.ndarray) - if is_np: - content = torch.from_numpy(content) - - results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) - - if is_np: - results = results.numpy() - - if ndim == 1: - return results[0, 0] - elif ndim == 2: - return results[0] - - def post_process(self, x, sample_rate, f0, pad_to): - if isinstance(f0, np.ndarray): - f0 = torch.from_numpy(f0).float().to(x.device) - - if pad_to is None: - return f0 - - f0 = self.repeat_expand(f0, pad_to) - - vuv_vector = torch.zeros_like(f0) - vuv_vector[f0 > 0.0] = 1.0 - vuv_vector[f0 <= 0.0] = 0.0 - - # 
drop zero-frequency frames, then interpolate linearly - nzindex = torch.nonzero(f0).squeeze() - f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() - time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() - time_frame = np.arange(pad_to) * self.hop_length / sample_rate - - vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] - - if f0.shape[0] <= 0: - return ( - torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(), - vuv_vector.cpu().numpy(), - ) - if f0.shape[0] == 1: - return ( - torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0] - ).cpu().numpy(), vuv_vector.cpu().numpy() - - # could probably be rewritten with torch? - f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) - # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0)) - - return f0, vuv_vector.cpu().numpy() - - def compute_f0(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - if p_len is None: - print("fcpe p_len is None") - p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] - if torch.all(f0 == 0): - rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) - return rtn, rtn - return self.post_process(x, self.sample_rate, f0, p_len)[0] - - def compute_f0_uv(self, wav, p_len=None): - x = torch.FloatTensor(wav).to(self.dtype).to(self.device) - if p_len is None: - p_len = x.shape[0] // self.hop_length - f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] - if torch.all(f0 == 0): - rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len) - return rtn, rtn - return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/rvc/lib/infer_pack/__init__.py b/rvc/lib/infer_pack/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/rvc/lib/infer_pack/attentions.py b/rvc/lib/infer_pack/attentions.py deleted file mode 100644 index ea728e2..0000000 --- a/rvc/lib/infer_pack/attentions.py +++ /dev/null @@ -1,398 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import functional as F - -from .
import commons -from .modules import LayerNorm - - -class Encoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - window_size=10, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - window_size=window_size, - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class Decoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - proximal_bias=False, - proximal_init=True, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.encdec_attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - n_heads, - p_dropout=p_dropout, - proximal_bias=proximal_bias, - proximal_init=proximal_init, - ) - ) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.encdec_attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - kernel_size, - p_dropout=p_dropout, - causal=True, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, h, h_mask): - """ - x: decoder input - h: encoder output - """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( - device=x.device, dtype=x.dtype - ) - encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) - - y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * 
x_mask - return x - - -class MultiHeadAttention(nn.Module): - def __init__( - self, - channels, - out_channels, - n_heads, - p_dropout=0.0, - window_size=None, - heads_share=True, - block_length=None, - proximal_bias=False, - proximal_init=False, - ): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." 
- block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = output.transpose(2, 3).contiguous().view(b, d, t_t) - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: [h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), - ) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) - - x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] - return x_final - - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) - x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class FFN(nn.Module): - def __init__( - self, - in_channels, - out_channels, - filter_channels, - kernel_size, - p_dropout=0.0, - activation=None, - causal=False, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding - - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask): 
- x = self.conv_1(self.padding(x * x_mask)) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(self.padding(x * x_mask)) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x diff --git a/rvc/lib/infer_pack/commons.py b/rvc/lib/infer_pack/commons.py deleted file mode 100644 index 5447098..0000000 --- a/rvc/lib/infer_pack/commons.py +++ /dev/null @@ -1,166 +0,0 @@ -import math -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size * dilation - dilation) / 2) - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def kl_divergence(m_p, logs_p, m_q, logs_q): - """KL(P||Q)""" - kl = (logs_q - logs_p) - 0.5 - kl += ( - 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) - ) - return kl - - -def rand_gumbel(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 - return -torch.log(-torch.log(uniform_samples)) - - -def rand_gumbel_like(x): - g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) - return g - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret - - -def slice_segments2(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, idx_str:idx_end] - return ret - - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): - position = torch.arange(length, dtype=torch.float) - num_timescales = channels // 2 - log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( - num_timescales - 1 - ) - inv_timescales = min_timescale * torch.exp( - torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment - ) - scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) - signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) - signal = F.pad(signal, [0, 0, 0, channels % 2]) - signal = signal.view(1, channels, length) - return signal - - -def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return 
x + signal.to(dtype=x.dtype, device=x.device) - - -def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) - - -def subsequent_mask(length): - mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) - return mask - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def shift_1d(x): - x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] - return x - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - -def clip_grad_value_(parameters, clip_value, norm_type=2): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - norm_type = float(norm_type) - if clip_value is not None: - clip_value = float(clip_value) - - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - if clip_value is not None: - p.grad.data.clamp_(min=-clip_value, max=clip_value) - total_norm = total_norm ** (1.0 / norm_type) - return total_norm diff --git a/rvc/lib/infer_pack/models.py b/rvc/lib/infer_pack/models.py deleted file mode 100644 index 77a2b36..0000000 --- a/rvc/lib/infer_pack/models.py +++ /dev/null @@ -1,1382 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import functional as F -from . import modules -from . import attentions -from . 
import commons -from .commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, Conv2d -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import spectral_norm, weight_norm -from typing import Optional - -has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) - - -class TextEncoder256(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super(TextEncoder256, self).__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.emb_phone = nn.Linear(256, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward( - self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor - ): - if pitch is None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class TextEncoder768(nn.Module): - def __init__( - self, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=True, - ): - super(TextEncoder768, self).__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.emb_phone = nn.Linear(768, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone: torch.Tensor, pitch: torch.Tensor, lengths: torch.Tensor): - if pitch is None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = x * math.sqrt(self.hidden_channels) # [b, t, h] - x = self.lrelu(x) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return m, logs, x_mask - - -class ResidualCouplingBlock(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0, - ): - super(ResidualCouplingBlock, self).__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - 
self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append( - modules.ResidualCouplingLayer( - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - mean_only=True, - ) - ) - self.flows.append(modules.Flip()) - - def forward( - self, - x: torch.Tensor, - x_mask: torch.Tensor, - g: Optional[torch.Tensor] = None, - reverse: bool = False, - ): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in self.flows[::-1]: - x = flow.forward(x, x_mask, g=g, reverse=reverse) - return x - - def remove_weight_norm(self): - for i in range(self.n_flows): - self.flows[i * 2].remove_weight_norm() - - def __prepare_scriptable__(self): - for i in range(self.n_flows): - for hook in self.flows[i * 2]._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(self.flows[i * 2]) - - return self - - -class PosteriorEncoder(nn.Module): - def __init__( - self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - ): - super(PosteriorEncoder, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=gin_channels, - ) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward( - self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None - ): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - def __prepare_scriptable__(self): - for hook in self.enc._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(self.enc) - return self - - -class Generator(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=0, - ): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - 
zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def __prepare_scriptable__(self): - for l in self.ups: - for hook in l._forward_pre_hooks.values(): - # The hook we want to remove is an instance of WeightNorm class, so - # normally we would do `if isinstance(...)` but this class is not accessible - # because of shadowing, so we check the module name directly. - # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 - if ( - hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - - for l in self.resblocks: - for hook in l._forward_pre_hooks.values(): - if ( - hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" - and hook.__class__.__name__ == "WeightNorm" - ): - torch.nn.utils.remove_weight_norm(l) - return self - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-waveform (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_threshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SineGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(torch.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sample_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - if uv.device.type == "privateuseone": # for DirectML - uv = uv.float() - return uv - - def forward(self, f0: torch.Tensor, upp: int): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - f0_buf[:, :, 0] = f0[:, :, 0] - for idx in range(self.harmonic_num): - f0_buf[:, :, idx + 1] =
f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / float(self.sample_rate)) % 1 - rand_ini = torch.rand( - f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=float(upp), - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" - ).transpose( - 2, 1 - ) # nearest: repeat each frame's phase increment across the upsampled span - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sample_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sample_rate: sample_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threshold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length, 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sample_rate, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to merge source harmonics into a single excitation - self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - # self.ddtype:int = -1 - - def forward(self, x: torch.Tensor, upp: int = 1): - # if self.ddtype ==-1: - # self.ddtype = self.l_linear.weight.dtype - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) - # if self.is_half: - # sine_wavs = sine_wavs.half() - # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) - # print(sine_wavs.dtype,self.ddtype) - # if sine_wavs.dtype != self.l_linear.weight.dtype: - sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) - sine_merge = self.l_tanh(self.l_linear(sine_wavs)) - return sine_merge, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) -
self.num_upsamples = len(upsample_rates) - - self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sample_rate=sr, harmonic_num=0, is_half=is_half - ) - self.noise_convs = nn.ModuleList() - self.conv_pre = Conv1d( - initial_channel, upsample_initial_channel, 7, 1, padding=3 - ) - resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - if i + 1 < len(upsample_rates): - stride_f0 = math.prod(upsample_rates[i + 1 :]) - self.noise_convs.append( - Conv1d( - 1, - c_cur, - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate( - zip(resblock_kernel_sizes, resblock_dilation_sizes) - ): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - self.upp = math.prod(upsample_rates) - - self.lrelu_slope = modules.LRELU_SLOPE - - def forward(self, x, f0, g: Optional[torch.Tensor] = None): - har_source, noi_source, uv = self.m_source(f0, self.upp) - har_source = har_source.transpose(1, 2) - x = self.conv_pre(x) - if g is not None: - x = x + self.cond(g) - # torch.jit.script() does not support direct indexing of torch modules - # That's why I wrote this - for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): - if i < self.num_upsamples: - x = F.leaky_relu(x, self.lrelu_slope) - x = ups(x) - x_source = noise_convs(har_source) - x = x + x_source - xs: Optional[torch.Tensor] = None - l = [i * self.num_kernels + j for j in range(self.num_kernels)] - for j, resblock in enumerate(self.resblocks): - if j in l: - if xs is None: - xs = resblock(x) - else: - xs += resblock(x) - # This assertion cannot be ignored! \ - # If ignored, it will cause torch.jit.script() compilation errors - assert isinstance(xs, torch.Tensor) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - def __prepare_scriptable__(self): - for l in self.ups: - for hook in l._forward_pre_hooks.values(): - # The hook we want to remove is an instance of WeightNorm class, so - # normally we would do `if isinstance(...)` but this class is not accessible - # because of shadowing, so we check the module name directly. 
-                # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(l)
-        for l in self.resblocks:
-            for hook in l._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(l)
-        return self
-
-
-class SynthesizerTrnMs256NSFsid(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        sr,
-        **kwargs
-    ):
-        super(SynthesizerTrnMs256NSFsid, self).__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = float(p_dropout)
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder256(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            float(p_dropout),
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            sr=sr,
-            is_half=kwargs["is_half"],
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def __prepare_scriptable__(self):
-        for hook in self.dec._forward_pre_hooks.values():
-            # The hook we want to remove is an instance of the WeightNorm class, so
-            # normally we would do `if isinstance(...)`, but that class is not accessible
-            # because of shadowing, so we check the module name directly.
-            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.dec)
-        for hook in self.flow._forward_pre_hooks.values():
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.flow)
-        if hasattr(self, "enc_q"):
-            for hook in self.enc_q._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(self.enc_q)
-        return self
-
-    @torch.jit.ignore
-    def forward(
-        self,
-        phone: torch.Tensor,
-        phone_lengths: torch.Tensor,
-        pitch: torch.Tensor,
-        pitchf: torch.Tensor,
-        y: torch.Tensor,
-        y_lengths: torch.Tensor,
-        ds: Optional[torch.Tensor] = None,
-    ):  # ds is the speaker id, shape [bs, 1]
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-        z_slice, ids_slice = commons.rand_slice_segments(
-            z, y_lengths, self.segment_size
-        )
-        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
-        o = self.dec(z_slice, pitchf, g=g)
-        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    @torch.jit.export
-    def infer(
-        self,
-        phone: torch.Tensor,
-        phone_lengths: torch.Tensor,
-        pitch: torch.Tensor,
-        nsff0: torch.Tensor,
-        sid: torch.Tensor,
-        rate: Optional[torch.Tensor] = None,
-    ):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if rate is not None:
-            assert isinstance(rate, torch.Tensor)
-            head = int(z_p.shape[2] * (1 - rate.item()))
-            z_p = z_p[:, :, head:]
-            x_mask = x_mask[:, :, head:]
-            nsff0 = nsff0[:, head:]
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec(z * x_mask, nsff0, g=g)
-        return o, x_mask, (z, z_p, m_p, logs_p)
-
-
-class SynthesizerTrnMs768NSFsid(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        sr,
-        **kwargs
-    ):
-        super(SynthesizerTrnMs768NSFsid, self).__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = float(p_dropout)
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder768(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            float(p_dropout),
-        )
-        self.dec = GeneratorNSF(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-            sr=sr,
-            is_half=kwargs["is_half"],
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def __prepare_scriptable__(self):
-        for hook in self.dec._forward_pre_hooks.values():
-            # The hook we want to remove is an instance of the WeightNorm class, so
-            # normally we would do `if isinstance(...)`, but that class is not accessible
-            # because of shadowing, so we check the module name directly.
-            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.dec)
-        for hook in self.flow._forward_pre_hooks.values():
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.flow)
-        if hasattr(self, "enc_q"):
-            for hook in self.enc_q._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(self.enc_q)
-        return self
-
-    @torch.jit.ignore
-    def forward(
-        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
-    ):  # ds is the speaker id, shape [bs, 1]
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-        z_slice, ids_slice = commons.rand_slice_segments(
-            z, y_lengths, self.segment_size
-        )
-        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
-        o = self.dec(z_slice, pitchf, g=g)
-        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    @torch.jit.export
-    def infer(
-        self,
-        phone: torch.Tensor,
-        phone_lengths: torch.Tensor,
-        pitch: torch.Tensor,
-        nsff0: torch.Tensor,
-        sid: torch.Tensor,
-        rate: Optional[torch.Tensor] = None,
-    ):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if rate is not None:
-            head = int(z_p.shape[2] * (1.0 - rate.item()))
-            z_p = z_p[:, :, head:]
-            x_mask = x_mask[:, :, head:]
-            nsff0 = nsff0[:, head:]
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec(z * x_mask, nsff0, g=g)
-        return o, x_mask, (z, z_p, m_p, logs_p)
-
-
-class SynthesizerTrnMs256NSFsid_nono(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - super(SynthesizerTrnMs256NSFsid_nono, self).__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = float(p_dropout) - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - float(p_dropout), - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - self.flow.remove_weight_norm() - self.enc_q.remove_weight_norm() - - def __prepare_scriptable__(self): - for hook in self.dec._forward_pre_hooks.values(): - # The hook we want to remove is an instance of WeightNorm class, so - # normally we would do `if isinstance(...)` but this class is not accessible - # because of shadowing, so we check the module name directly. 
-            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.dec)
-        for hook in self.flow._forward_pre_hooks.values():
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.flow)
-        if hasattr(self, "enc_q"):
-            for hook in self.enc_q._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(self.enc_q)
-        return self
-
-    @torch.jit.ignore
-    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-        z_slice, ids_slice = commons.rand_slice_segments(
-            z, y_lengths, self.segment_size
-        )
-        o = self.dec(z_slice, g=g)
-        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    @torch.jit.export
-    def infer(
-        self,
-        phone: torch.Tensor,
-        phone_lengths: torch.Tensor,
-        sid: torch.Tensor,
-        rate: Optional[torch.Tensor] = None,
-    ):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if rate is not None:
-            head = int(z_p.shape[2] * (1.0 - rate.item()))
-            z_p = z_p[:, :, head:]
-            x_mask = x_mask[:, :, head:]
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec(z * x_mask, g=g)
-        return o, x_mask, (z, z_p, m_p, logs_p)
-
-
-class SynthesizerTrnMs768NSFsid_nono(nn.Module):
-    def __init__(
-        self,
-        spec_channels,
-        segment_size,
-        inter_channels,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        resblock,
-        resblock_kernel_sizes,
-        resblock_dilation_sizes,
-        upsample_rates,
-        upsample_initial_channel,
-        upsample_kernel_sizes,
-        spk_embed_dim,
-        gin_channels,
-        sr=None,
-        **kwargs
-    ):
-        super(SynthesizerTrnMs768NSFsid_nono, self).__init__()
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = float(p_dropout)
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        # self.hop_length = hop_length
-        self.spk_embed_dim = spk_embed_dim
-        self.enc_p = TextEncoder768(
-            inter_channels,
-            hidden_channels,
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            float(p_dropout),
-            f0=False,
-        )
-        self.dec = Generator(
-            inter_channels,
-            resblock,
-            resblock_kernel_sizes,
-            resblock_dilation_sizes,
-            upsample_rates,
-            upsample_initial_channel,
-            upsample_kernel_sizes,
-            gin_channels=gin_channels,
-        )
-        self.enc_q = PosteriorEncoder(
-            spec_channels,
-            inter_channels,
-            hidden_channels,
-            5,
-            1,
-            16,
-            gin_channels=gin_channels,
-        )
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-
-    def remove_weight_norm(self):
-        self.dec.remove_weight_norm()
-        self.flow.remove_weight_norm()
-        self.enc_q.remove_weight_norm()
-
-    def __prepare_scriptable__(self):
-        for hook in self.dec._forward_pre_hooks.values():
-            # The hook we want to remove is an instance of the WeightNorm class, so
-            # normally we would do `if isinstance(...)`, but that class is not accessible
-            # because of shadowing, so we check the module name directly.
-            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.dec)
-        for hook in self.flow._forward_pre_hooks.values():
-            if (
-                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                and hook.__class__.__name__ == "WeightNorm"
-            ):
-                torch.nn.utils.remove_weight_norm(self.flow)
-        if hasattr(self, "enc_q"):
-            for hook in self.enc_q._forward_pre_hooks.values():
-                if (
-                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
-                    and hook.__class__.__name__ == "WeightNorm"
-                ):
-                    torch.nn.utils.remove_weight_norm(self.enc_q)
-        return self
-
-    @torch.jit.ignore
-    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
-        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis, broadcast over t
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-        z_p = self.flow(z, y_mask, g=g)
-        z_slice, ids_slice = commons.rand_slice_segments(
-            z, y_lengths, self.segment_size
-        )
-        o = self.dec(z_slice, g=g)
-        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-    @torch.jit.export
-    def infer(
-        self,
-        phone: torch.Tensor,
-        phone_lengths: torch.Tensor,
-        sid: torch.Tensor,
-        rate: Optional[torch.Tensor] = None,
-    ):
-        g = self.emb_g(sid).unsqueeze(-1)
-        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        if rate is not None:
-            head = int(z_p.shape[2] * (1.0 - rate.item()))
-            z_p = z_p[:, :, head:]
-            x_mask = x_mask[:, :, head:]
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec(z * x_mask, g=g)
-        return o, x_mask, (z, z_p, m_p, logs_p)
-
-
-class MultiPeriodDiscriminator(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(MultiPeriodDiscriminator, self).__init__()
-        periods = [2, 3, 5, 7, 11, 17]
-        # periods = [3, 5, 7, 11, 17, 23, 37]
-
-        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-        discs = discs + [
-            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-        ]
-        self.discriminators = nn.ModuleList(discs)
-
-    def forward(self, y, y_hat):
-        y_d_rs = []
-        y_d_gs = []
-        fmap_rs = []
-        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
-            y_d_r, fmap_r = d(y)
-            y_d_g, fmap_g = d(y_hat)
-            y_d_rs.append(y_d_r)
-            y_d_gs.append(y_d_g)
-            fmap_rs.append(fmap_r)
-            fmap_gs.append(fmap_g)
-
-        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
-class MultiPeriodDiscriminatorV2(torch.nn.Module):
-    def __init__(self, use_spectral_norm=False):
-        super(MultiPeriodDiscriminatorV2, self).__init__()
-        # periods = [2, 3, 5, 7, 11, 17]
-        periods =
[2, 3, 5, 7, 11, 17, 23, 37] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [ - DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods - ] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] # - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f( - Conv2d( - 1, - 32, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 32, - 128, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 128, - 512, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 512, - 1024, - (kernel_size, 1), - (stride, 1), - padding=(get_padding(kernel_size, 1), 0), - ) - ), - norm_f( - Conv2d( - 1024, - 1024, - (kernel_size, 1), - 1, - padding=(get_padding(kernel_size, 1), 0), - ) - ), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - if has_xpu and x.dtype == torch.bfloat16: - x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to( - dtype=torch.bfloat16 - ) - else: - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap diff --git a/rvc/lib/infer_pack/modules.py b/rvc/lib/infer_pack/modules.py deleted file mode 100644 index 7c18733..0000000 --- a/rvc/lib/infer_pack/modules.py +++ /dev/null @@ -1,521 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import functional as F - -from torch.nn import Conv1d -from torch.nn.utils import remove_weight_norm -from torch.nn.utils.parametrizations import weight_norm - - -from . 
import commons -from .commons import init_weights, get_padding -from .transforms import piecewise_rational_quadratic_transform - - -LRELU_SLOPE = 0.1 - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__( - self, - in_channels, - hidden_channels, - out_channels, - kernel_size, - n_layers, - p_dropout, - ): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." - - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append( - nn.Conv1d( - hidden_channels, - hidden_channels, - kernel_size, - padding=kernel_size // 2, - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - -class DDSConv(nn.Module): - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d( - channels, - channels, - kernel_size, - groups=channels, - dilation=dilation, - padding=padding, - ) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__( - self, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - p_dropout=0, - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = 
torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) - self.cond_layer = torch.nn.utils.parametrizations.weight_norm( - cond_layer, name="weight" - ) - - for i in range(n_layers): - dilation = dilation_rate**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, - 2 * hidden_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.parametrizations.weight_norm( - in_layer, name="weight" - ) - self.in_layers.append(in_layer) - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.parametrizations.weight_norm( - res_skip_layer, name="weight" - ) - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, : self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - -class ResBlock1(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - self.convs1.apply(init_weights) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - self.convs2.apply(init_weights) - - def forward(self, x, x_mask=None): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c2(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_weight_norm(l) - for l in self.convs2: - 
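-            # convs2 holds the dilation=1 convolutions; strip their weight_norm parametrization as well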
remove_weight_norm(l) - - -class ResBlock2(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - self.convs.apply(init_weights) - - def forward(self, x, x_mask=None): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_weight_norm(l) - - -class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=p_dropout, - gin_channels=gin_channels, - ) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class ConvFlow(nn.Module): - def __init__( - self, - in_channels, - filter_channels, - kernel_size, - n_layers, - num_bins=10, - tail_bound=5.0, - ): - super().__init__() - self.in_channels = in_channels - 
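-        # ConvFlow warps half the channels with a piecewise rational-quadratic spline
-        # whose parameters (num_bins * 3 - 1 per half-channel) are predicted from the other half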
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.num_bins = num_bins
-        self.tail_bound = tail_bound
-        self.half_channels = in_channels // 2
-
-        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
-        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
-        self.proj = nn.Conv1d(
-            filter_channels, self.half_channels * (num_bins * 3 - 1), 1
-        )
-        self.proj.weight.data.zero_()
-        self.proj.bias.data.zero_()
-
-    def forward(self, x, x_mask, g=None, reverse=False):
-        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
-        h = self.pre(x0)
-        h = self.convs(h, x_mask, g=g)
-        h = self.proj(h) * x_mask
-
-        b, c, t = x0.shape
-        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)
-
-        unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
-        unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
-            self.filter_channels
-        )
-        unnormalized_derivatives = h[..., 2 * self.num_bins :]
-
-        x1, logabsdet = piecewise_rational_quadratic_transform(
-            x1,
-            unnormalized_widths,
-            unnormalized_heights,
-            unnormalized_derivatives,
-            inverse=reverse,
-            tails="linear",
-            tail_bound=self.tail_bound,
-        )
-
-        x = torch.cat([x0, x1], 1) * x_mask
-        logdet = torch.sum(logabsdet * x_mask, [1, 2])
-        if not reverse:
-            return x, logdet
-        else:
-            return x
diff --git a/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py
deleted file mode 100644
index e95c0e9..0000000
--- a/rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import pyworld
-import numpy as np
-
-
-class DioF0Predictor(F0Predictor):
-    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100):
-        self.hop_length = hop_length
-        self.f0_min = f0_min
-        self.f0_max = f0_max
-        self.sample_rate = sample_rate
-
-    def interpolate_f0(self, f0):
-        data = np.reshape(f0, (f0.size, 1))
-
-        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-        vuv_vector[data > 0.0] = 1.0
-        vuv_vector[data <= 0.0] = 0.0
-
-        ip_data = data
-
-        frame_number = data.size
-        last_value = 0.0
-        for i in range(frame_number):
-            if data[i] <= 0.0:
-                j = i + 1
-                for j in range(i + 1, frame_number):
-                    if data[j] > 0.0:
-                        break
-                if j < frame_number - 1:
-                    if last_value > 0.0:
-                        step = (data[j] - data[i - 1]) / float(j - i)
-                        for k in range(i, j):
-                            ip_data[k] = data[i - 1] + step * (k - i + 1)
-                    else:
-                        for k in range(i, j):
-                            ip_data[k] = data[j]
-                else:
-                    for k in range(i, frame_number):
-                        ip_data[k] = last_value
-            else:
-                ip_data[i] = data[i]  # this copy may be unnecessary
-                last_value = data[i]
-
-        return ip_data[:, 0], vuv_vector[:, 0]
-
-    def resize_f0(self, x, target_len):
-        source = np.array(x)
-        source[source < 0.001] = np.nan
-        target = np.interp(
-            np.arange(0, len(source) * target_len, len(source)) / target_len,
-            np.arange(0, len(source)),
-            source,
-        )
-        res = np.nan_to_num(target)
-        return res
-
-    def compute_f0(self, wav, p_len=None):
-        if p_len is None:
-            p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.dio(
-            wav.astype(np.double),
-            fs=self.sample_rate,
-            f0_floor=self.f0_min,
-            f0_ceil=self.f0_max,
-            frame_period=1000 * self.hop_length / self.sample_rate,
-        )
-        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate)
-        for index, pitch in enumerate(f0):
-            f0[index] = round(pitch, 1)
-        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
-
-    def compute_f0_uv(self, wav, p_len=None):
-        if p_len is None:
-            p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.dio(
-            wav.astype(np.double),
-            fs=self.sample_rate,
-            f0_floor=self.f0_min,
-            f0_ceil=self.f0_max,
-            frame_period=1000 * self.hop_length / self.sample_rate,
-        )
-        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate)
-        for index, pitch in enumerate(f0):
-            f0[index] = round(pitch, 1)
-        return self.interpolate_f0(self.resize_f0(f0, p_len))
diff --git a/rvc/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/F0Predictor.py
deleted file mode 100644
index 384f43f..0000000
--- a/rvc/lib/infer_pack/modules/F0Predictor/F0Predictor.py
+++ /dev/null
@@ -1,6 +0,0 @@
-class F0Predictor(object):
-    def compute_f0(self, wav, p_len):
-        pass
-
-    def compute_f0_uv(self, wav, p_len):
-        pass
diff --git a/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py
deleted file mode 100644
index f30f61b..0000000
--- a/rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import pyworld
-import numpy as np
-
-
-class HarvestF0Predictor(F0Predictor):
-    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100):
-        self.hop_length = hop_length
-        self.f0_min = f0_min
-        self.f0_max = f0_max
-        self.sample_rate = sample_rate
-
-    def interpolate_f0(self, f0):
-        data = np.reshape(f0, (f0.size, 1))
-
-        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-        vuv_vector[data > 0.0] = 1.0
-        vuv_vector[data <= 0.0] = 0.0
-
-        ip_data = data
-
-        frame_number = data.size
-        last_value = 0.0
-        for i in range(frame_number):
-            if data[i] <= 0.0:
-                j = i + 1
-                for j in range(i + 1, frame_number):
-                    if data[j] > 0.0:
-                        break
-                if j < frame_number - 1:
-                    if last_value > 0.0:
-                        step = (data[j] - data[i - 1]) / float(j - i)
-                        for k in range(i, j):
-                            ip_data[k] = data[i - 1] + step * (k - i + 1)
-                    else:
-                        for k in range(i, j):
-                            ip_data[k] = data[j]
-                else:
-                    for k in range(i, frame_number):
-                        ip_data[k] = last_value
-            else:
-                ip_data[i] = data[i]
-                last_value = data[i]
-
-        return ip_data[:, 0], vuv_vector[:, 0]
-
-    def resize_f0(self, x, target_len):
-        source = np.array(x)
-        source[source < 0.001] = np.nan
-        target = np.interp(
-            np.arange(0, len(source) * target_len, len(source)) / target_len,
-            np.arange(0, len(source)),
-            source,
-        )
-        res = np.nan_to_num(target)
-        return res
-
-    def compute_f0(self, wav, p_len=None):
-        if p_len is None:
-            p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.harvest(
-            wav.astype(np.double),
-            fs=self.sample_rate,
-            f0_ceil=self.f0_max,
-            f0_floor=self.f0_min,
-            frame_period=1000 * self.hop_length / self.sample_rate,
-        )
-        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate)
-        return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
-
-    def compute_f0_uv(self, wav, p_len=None):
-        if p_len is None:
-            p_len = wav.shape[0] // self.hop_length
-        f0, t = pyworld.harvest(
-            wav.astype(np.double),
-            fs=self.sample_rate,
-            f0_floor=self.f0_min,
-            f0_ceil=self.f0_max,
-            frame_period=1000 * self.hop_length / self.sample_rate,
-        )
-        f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sample_rate)
-        return self.interpolate_f0(self.resize_f0(f0, p_len))
diff --git a/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py
deleted file mode 100644
index 1b81a9f..0000000
--- a/rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
-import parselmouth
-import numpy as np
-
-
-class PMF0Predictor(F0Predictor):
-    def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sample_rate=44100):
-        self.hop_length = hop_length
-        self.f0_min = f0_min
-        self.f0_max = f0_max
-        self.sample_rate = sample_rate
-
-    def interpolate_f0(self, f0):
-        data = np.reshape(f0, (f0.size, 1))
-
-        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
-        vuv_vector[data > 0.0] = 1.0
-        vuv_vector[data <= 0.0] = 0.0
-
-        ip_data = data
-
-        frame_number = data.size
-        last_value = 0.0
-        for i in range(frame_number):
-            if data[i] <= 0.0:
-                j = i + 1
-                for j in range(i + 1, frame_number):
-                    if data[j] > 0.0:
-                        break
-                if j < frame_number - 1:
-                    if last_value > 0.0:
-                        step = (data[j] - data[i - 1]) / float(j - i)
-                        for k in range(i, j):
-                            ip_data[k] = data[i - 1] + step * (k - i + 1)
-                    else:
-                        for k in range(i, j):
-                            ip_data[k] = data[j]
-                else:
-                    for k in range(i, frame_number):
-                        ip_data[k] = last_value
-            else:
-                ip_data[i] = data[i]  # this copy may be unnecessary
-                last_value = data[i]
-
-        return ip_data[:, 0], vuv_vector[:, 0]
-
-    def compute_f0(self, wav, p_len=None):
-        x = wav
-        if p_len is None:
-            p_len = x.shape[0] // self.hop_length
-        else:
-            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
-        time_step = self.hop_length / self.sample_rate * 1000
-        f0 = (
-            parselmouth.Sound(x, self.sample_rate)
-            .to_pitch_ac(
-                time_step=time_step / 1000,
-                voicing_threshold=0.6,
-                pitch_floor=self.f0_min,
-                pitch_ceiling=self.f0_max,
-            )
-            .selected_array["frequency"]
-        )
-
-        pad_size = (p_len - len(f0) + 1) // 2
-        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
-        f0, uv = self.interpolate_f0(f0)
-        return f0
-
-    def compute_f0_uv(self, wav, p_len=None):
-        x = wav
-        if p_len is None:
-            p_len = x.shape[0] // self.hop_length
-        else:
-            assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
-        time_step = self.hop_length / self.sample_rate * 1000
-        f0 = (
-            parselmouth.Sound(x, self.sample_rate)
-            .to_pitch_ac(
-                time_step=time_step / 1000,
-                voicing_threshold=0.6,
-                pitch_floor=self.f0_min,
-                pitch_ceiling=self.f0_max,
-            )
-            .selected_array["frequency"]
-        )
-
-        pad_size = (p_len - len(f0) + 1) // 2
-        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-            f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
-        f0, uv = self.interpolate_f0(f0)
-        return f0, uv
diff --git a/rvc/lib/infer_pack/modules/F0Predictor/__init__.py b/rvc/lib/infer_pack/modules/F0Predictor/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/rvc/lib/infer_pack/transforms.py b/rvc/lib/infer_pack/transforms.py
deleted file mode 100644
index a11f799..0000000
--- a/rvc/lib/infer_pack/transforms.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import torch
-from torch.nn import functional as F
-
-import numpy as np
-
-
-DEFAULT_MIN_BIN_WIDTH = 1e-3
-DEFAULT_MIN_BIN_HEIGHT = 1e-3
-DEFAULT_MIN_DERIVATIVE = 1e-3
-
-
-def piecewise_rational_quadratic_transform(
-    inputs,
-    unnormalized_widths,
-    unnormalized_heights,
-    unnormalized_derivatives,
-    inverse=False,
-    tails=None,
-    tail_bound=1.0,
-    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
-    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
-    min_derivative=DEFAULT_MIN_DERIVATIVE,
-):
-    if tails is None:
-        spline_fn = rational_quadratic_spline
-        spline_kwargs = {}
-    else:
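-        # 'linear' tails: the transform is the identity outside [-tail_bound, tail_bound]
-        # and a monotonic rational-quadratic spline inside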
spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = {"tails": tails, "tail_bound": tail_bound} - - outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs - ) - return outputs, logabsdet - - -def searchsorted(bin_locations, inputs, eps=1e-6): - bin_locations[..., -1] += eps - return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 - - -def unconstrained_rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails="linear", - tail_bound=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - logabsdet = torch.zeros_like(inputs) - - if tails == "linear": - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) - constant = np.log(np.exp(1 - min_derivative) - 1) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - logabsdet[outside_interval_mask] = 0 - else: - raise RuntimeError("{} tails are not implemented.".format(tails)) - - ( - outputs[inside_interval_mask], - logabsdet[inside_interval_mask], - ) = rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - inverse=inverse, - left=-tail_bound, - right=tail_bound, - bottom=-tail_bound, - top=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - ) - - return outputs, logabsdet - - -def rational_quadratic_spline( - inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0.0, - right=1.0, - bottom=0.0, - top=1.0, - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE, -): - if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError("Input to a transform is not within its domain") - - num_bins = unnormalized_widths.shape[-1] - - if min_bin_width * num_bins > 1.0: - raise ValueError("Minimal bin width too large for the number of bins") - if min_bin_height * num_bins > 1.0: - raise ValueError("Minimal bin height too large for the number of bins") - - widths = F.softmax(unnormalized_widths, dim=-1) - widths = min_bin_width + (1 - min_bin_width * num_bins) * widths - cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) - cumwidths = (right - left) * cumwidths + left - cumwidths[..., 0] = left - cumwidths[..., -1] = right - widths = cumwidths[..., 1:] - cumwidths[..., :-1] - - derivatives = min_derivative + F.softplus(unnormalized_derivatives) - - heights = F.softmax(unnormalized_heights, dim=-1) - heights = min_bin_height + (1 - min_bin_height * num_bins) * heights - cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), 
mode="constant", value=0.0) - cumheights = (top - bottom) * cumheights + bottom - cumheights[..., 0] = bottom - cumheights[..., -1] = top - heights = cumheights[..., 1:] - cumheights[..., :-1] - - if inverse: - bin_idx = searchsorted(cumheights, inputs)[..., None] - else: - bin_idx = searchsorted(cumwidths, inputs)[..., None] - - input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] - input_bin_widths = widths.gather(-1, bin_idx)[..., 0] - - input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] - delta = heights / widths - input_delta = delta.gather(-1, bin_idx)[..., 0] - - input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] - input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] - - input_heights = heights.gather(-1, bin_idx)[..., 0] - - if inverse: - a = (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) + input_heights * (input_delta - input_derivatives) - b = input_heights * input_derivatives - (inputs - input_cumheights) * ( - input_derivatives + input_derivatives_plus_one - 2 * input_delta - ) - c = -input_delta * (inputs - input_cumheights) - - discriminant = b.pow(2) - 4 * a * c - assert (discriminant >= 0).all() - - root = (2 * c) / (-b - torch.sqrt(discriminant)) - outputs = root * input_bin_widths + input_cumwidths - - theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, -logabsdet - else: - theta = (inputs - input_cumwidths) / input_bin_widths - theta_one_minus_theta = theta * (1 - theta) - - numerator = input_heights * ( - input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta - ) - denominator = input_delta + ( - (input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta - ) - outputs = input_cumheights + numerator / denominator - - derivative_numerator = input_delta.pow(2) * ( - input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2) - ) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, logabsdet diff --git a/rvc/lib/rmvpe.py b/rvc/lib/rmvpe.py deleted file mode 100644 index 89b984b..0000000 --- a/rvc/lib/rmvpe.py +++ /dev/null @@ -1,388 +0,0 @@ -import torch.nn as nn -import torch, numpy as np -import torch.nn.functional as F -from librosa.filters import mel - - -class BiGRU(nn.Module): - def __init__(self, input_features, hidden_features, num_layers): - super(BiGRU, self).__init__() - self.gru = nn.GRU( - input_features, - hidden_features, - num_layers=num_layers, - batch_first=True, - bidirectional=True, - ) - - def forward(self, x): - return self.gru(x)[0] - - -class ConvBlockRes(nn.Module): - def __init__(self, in_channels, out_channels, momentum=0.01): - super(ConvBlockRes, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - nn.Conv2d( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 
1),
-                padding=(1, 1),
-                bias=False,
-            ),
-            nn.BatchNorm2d(out_channels, momentum=momentum),
-            nn.ReLU(),
-        )
-        if in_channels != out_channels:
-            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
-            self.is_shortcut = True
-        else:
-            self.is_shortcut = False
-
-    def forward(self, x):
-        if self.is_shortcut:
-            return self.conv(x) + self.shortcut(x)
-        else:
-            return self.conv(x) + x
-
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        in_size,
-        n_encoders,
-        kernel_size,
-        n_blocks,
-        out_channels=16,
-        momentum=0.01,
-    ):
-        super(Encoder, self).__init__()
-        self.n_encoders = n_encoders
-        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
-        self.layers = nn.ModuleList()
-        self.latent_channels = []
-        for i in range(self.n_encoders):
-            self.layers.append(
-                ResEncoderBlock(
-                    in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
-                )
-            )
-            self.latent_channels.append([out_channels, in_size])
-            in_channels = out_channels
-            out_channels *= 2
-            in_size //= 2
-        self.out_size = in_size
-        self.out_channel = out_channels
-
-    def forward(self, x):
-        concat_tensors = []
-        x = self.bn(x)
-        for i in range(self.n_encoders):
-            _, x = self.layers[i](x)
-            concat_tensors.append(_)
-        return x, concat_tensors
-
-
-class ResEncoderBlock(nn.Module):
-    def __init__(
-        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
-    ):
-        super(ResEncoderBlock, self).__init__()
-        self.n_blocks = n_blocks
-        self.conv = nn.ModuleList()
-        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
-        for i in range(n_blocks - 1):
-            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
-        self.kernel_size = kernel_size
-        if self.kernel_size is not None:
-            self.pool = nn.AvgPool2d(kernel_size=kernel_size)
-
-    def forward(self, x):
-        for i in range(self.n_blocks):
-            x = self.conv[i](x)
-        if self.kernel_size is not None:
-            return x, self.pool(x)
-        else:
-            return x
-
-
-class Intermediate(nn.Module):  #
-    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
-        super(Intermediate, self).__init__()
-        self.n_inters = n_inters
-        self.layers = nn.ModuleList()
-        self.layers.append(
-            ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
-        )
-        for i in range(self.n_inters - 1):
-            self.layers.append(
-                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
-            )
-
-    def forward(self, x):
-        for i in range(self.n_inters):
-            x = self.layers[i](x)
-        return x
-
-
-class ResDecoderBlock(nn.Module):
-    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
-        super(ResDecoderBlock, self).__init__()
-        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
-        self.n_blocks = n_blocks
-        self.conv1 = nn.Sequential(
-            nn.ConvTranspose2d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=(3, 3),
-                stride=stride,
-                padding=(1, 1),
-                output_padding=out_padding,
-                bias=False,
-            ),
-            nn.BatchNorm2d(out_channels, momentum=momentum),
-            nn.ReLU(),
-        )
-        self.conv2 = nn.ModuleList()
-        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
-        for i in range(n_blocks - 1):
-            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
-
-    def forward(self, x, concat_tensor):
-        x = self.conv1(x)
-        x = torch.cat((x, concat_tensor), dim=1)
-        for i in range(self.n_blocks):
-            x = self.conv2[i](x)
-        return x
-
-
-class Decoder(nn.Module):
-    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
-        super(Decoder, self).__init__()
-        self.layers = nn.ModuleList()
-        self.n_decoders = n_decoders
-        for i in range(self.n_decoders):
-            out_channels = in_channels // 2
-            self.layers.append(
-                ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
-            )
-            in_channels = out_channels
-
-    def forward(self, x, concat_tensors):
-        for i in range(self.n_decoders):
-            x = self.layers[i](x, concat_tensors[-1 - i])
-        return x
-
-
-class DeepUnet(nn.Module):
-    def __init__(
-        self,
-        kernel_size,
-        n_blocks,
-        en_de_layers=5,
-        inter_layers=4,
-        in_channels=1,
-        en_out_channels=16,
-    ):
-        super(DeepUnet, self).__init__()
-        self.encoder = Encoder(
-            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
-        )
-        self.intermediate = Intermediate(
-            self.encoder.out_channel // 2,
-            self.encoder.out_channel,
-            inter_layers,
-            n_blocks,
-        )
-        self.decoder = Decoder(
-            self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
-        )
-
-    def forward(self, x):
-        x, concat_tensors = self.encoder(x)
-        x = self.intermediate(x)
-        x = self.decoder(x, concat_tensors)
-        return x
-
-
-class E2E(nn.Module):
-    def __init__(
-        self,
-        n_blocks,
-        n_gru,
-        kernel_size,
-        en_de_layers=5,
-        inter_layers=4,
-        in_channels=1,
-        en_out_channels=16,
-    ):
-        super(E2E, self).__init__()
-        self.unet = DeepUnet(
-            kernel_size,
-            n_blocks,
-            en_de_layers,
-            inter_layers,
-            in_channels,
-            en_out_channels,
-        )
-        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
-        if n_gru:
-            self.fc = nn.Sequential(
-                BiGRU(3 * 128, 256, n_gru),
-                nn.Linear(512, 360),
-                nn.Dropout(0.25),
-                nn.Sigmoid(),
-            )
-
-    def forward(self, mel):
-        mel = mel.transpose(-1, -2).unsqueeze(1)
-        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
-        x = self.fc(x)
-        return x
-
-
-class MelSpectrogram(torch.nn.Module):
-    def __init__(
-        self,
-        is_half,
-        n_mel_channels,
-        sample_rate,
-        win_length,
-        hop_length,
-        n_fft=None,
-        mel_fmin=0,
-        mel_fmax=None,
-        clamp=1e-5,
-    ):
-        super().__init__()
-        n_fft = win_length if n_fft is None else n_fft
-        self.hann_window = {}
-        mel_basis = mel(
-            sr=sample_rate,
-            n_fft=n_fft,
-            n_mels=n_mel_channels,
-            fmin=mel_fmin,
-            fmax=mel_fmax,
-            htk=True,
-        )
-        mel_basis = torch.from_numpy(mel_basis).float()
-        self.register_buffer("mel_basis", mel_basis)
-        self.n_fft = win_length if n_fft is None else n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.sample_rate = sample_rate
-        self.n_mel_channels = n_mel_channels
-        self.clamp = clamp
-        self.is_half = is_half
-
-    def forward(self, audio, keyshift=0, speed=1, center=True):
-        factor = 2 ** (keyshift / 12)
-        n_fft_new = int(np.round(self.n_fft * factor))
-        win_length_new = int(np.round(self.win_length * factor))
-        hop_length_new = int(np.round(self.hop_length * speed))
-        keyshift_key = str(keyshift) + "_" + str(audio.device)
-        if keyshift_key not in self.hann_window:
-            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
-                audio.device
-            )
-        fft = torch.stft(
-            audio,
-            n_fft=n_fft_new,
-            hop_length=hop_length_new,
-            win_length=win_length_new,
-            window=self.hann_window[keyshift_key],
-            center=center,
-            return_complex=True,
-        )
-        magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
-        if keyshift != 0:
-            size = self.n_fft // 2 + 1
-            resize = magnitude.size(1)
-            if resize < size:
-                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
-            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
-        mel_output = torch.matmul(self.mel_basis, magnitude)
-        if self.is_half == True:
-            mel_output = mel_output.half()
-        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
-        return log_mel_spec
-
-
-class RMVPE:
-    def __init__(self, model_path, is_half, device=None):
-        self.resample_kernel = {}
-        model = E2E(4, 1, (2, 2))
-        ckpt = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(ckpt)
-        model.eval()
-        if is_half == True:
-            model = model.half()
-        self.model = model
-        self.resample_kernel = {}
-        self.is_half = is_half
-        if device is None:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.device = device
-        self.mel_extractor = MelSpectrogram(
-            is_half, 128, 16000, 1024, 160, None, 30, 8000
-        ).to(device)
-        self.model = self.model.to(device)
-        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
-        self.cents_mapping = np.pad(cents_mapping, (4, 4))  # 368
-
-    def mel2hidden(self, mel):
-        with torch.no_grad():
-            n_frames = mel.shape[-1]
-            mel = F.pad(
-                mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
-            )
-            hidden = self.model(mel)
-            return hidden[:, :n_frames]
-
-    def decode(self, hidden, thred=0.03):
-        cents_pred = self.to_local_average_cents(hidden, thred=thred)
-        f0 = 10 * (2 ** (cents_pred / 1200))
-        f0[f0 == 10] = 0
-        return f0
-
-    def infer_from_audio(self, audio, thred=0.03):
-        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
-        mel = self.mel_extractor(audio, center=True)
-        hidden = self.mel2hidden(mel)
-        hidden = hidden.squeeze(0).cpu().numpy()
-        if self.is_half == True:
-            hidden = hidden.astype("float32")
-        f0 = self.decode(hidden, thred=thred)
-        return f0
-
-    def to_local_average_cents(self, salience, thred=0.05):
-        center = np.argmax(salience, axis=1)
-        salience = np.pad(salience, ((0, 0), (4, 4)))
-        center += 4
-        todo_salience = []
-        todo_cents_mapping = []
-        starts = center - 4
-        ends = center + 5
-        for idx in range(salience.shape[0]):
-            todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
-            todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
-        todo_salience = np.array(todo_salience)
-        todo_cents_mapping = np.array(todo_cents_mapping)
-        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
-        weight_sum = np.sum(todo_salience, 1)
-        devided = product_sum / weight_sum
-        maxx = np.max(salience, axis=1)
-        devided[maxx <= thred] = 0
-        return devided
diff --git a/rvc/pretraineds/.gitignore b/rvc/pretraineds/.gitignore
deleted file mode 100644
index e69de29..0000000
diff --git a/rvc/pretraineds/pretrained_v1/.gitignore b/rvc/pretraineds/pretrained_v1/.gitignore
deleted file mode 100644
index e69de29..0000000
diff --git a/rvc/pretraineds/pretrained_v2/.gitignore b/rvc/pretraineds/pretrained_v2/.gitignore
deleted file mode 100644
index e69de29..0000000