This repository has been archived by the owner on Nov 11, 2023. It is now read-only.

Feat loguru 4.1 stable #377

Closed
6 changes: 6 additions & 0 deletions .gitignore
@@ -165,3 +165,9 @@ pretrain/
.vscode/launch.json

trained/**/

configs/
filelists/*
filelists/val.txt
test_pipe.py
test_queue.py
2 changes: 2 additions & 0 deletions README.md
@@ -350,6 +350,8 @@ After completing the above steps, the dataset directory will contain the preproc
python train.py -c configs/config.json -m 44k
```

You can pause training and save the model at any time: simply create a `stop.txt` file in the running directory, and training will stop and save the model after the next step completes.
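
A minimal sketch of what such a check could look like inside a training loop; the callables and `max_steps` here are placeholders for illustration, not the repository's actual functions:

```python
import os

def train(train_step, save_checkpoint, run_dir=".", max_steps=100_000):
    """Run training, stopping gracefully once stop.txt appears in run_dir."""
    for step in range(max_steps):
        train_step(step)  # one optimization step (placeholder callable)
        if os.path.exists(os.path.join(run_dir, "stop.txt")):
            save_checkpoint(step)  # save the model before exiting
            break
```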

### Diffusion Model (optional)

If the shallow diffusion feature is needed, a diffusion model must be trained. The training method is as follows:
14 changes: 8 additions & 6 deletions README_zh_CN.md
@@ -1,7 +1,7 @@
<div align="center">
<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />

# SoftVC VITS Singing Voice Conversion
# SoftVC VITS Singing Voice Conversion(HuanLin Ver)

[**English**](./README.md) | [**中文简体**](./README_zh_CN.md)

@@ -353,6 +353,8 @@ python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
python train.py -c configs/config.json -m 44k
```

A method is provided here to pause training and save the model at any time: create a `stop.txt` file in the running directory, and training will stop and save the model after the next step completes.

### Diffusion Model (Optional)

If the shallow diffusion feature is needed, a diffusion model must be trained. The training method is as follows:
@@ -373,14 +375,14 @@ python inference_main.py -m "logs/44k/G_30400.pth" -c "configs/config.json" -n "
```

Required arguments:
+ `-m` | `--model_path`: model path
+ `-c` | `--config_path`: config file path
+ `-n` | `--clean_names`: list of wav file names, placed under the raw folder
+ `-m` | `--model_path`: model path (accepts the `{lastest}` placeholder, e.g. `"logs/44k/{lastest}"`, which resolves to the most recent checkpoint; a sketch of how that resolution might work follows the option lists)
+ `-c` | `--config_path`: config file path; defaults to `logs/44k/config.json` when omitted
+ `-t` | `--trans`: pitch shift, positive or negative (in semitones)
+ `-s` | `--spk_list`: name of the target speaker to synthesize
+ `-cl` | `--clip`: forced audio slicing; the default 0 means automatic slicing; unit: seconds
+ `-s` | `--spk_list`: name of the target speaker to synthesize; when omitted… I don't know what happens either
+ `-n` | `--clean_names`: list of wav file names, placed under the raw folder; when omitted, falls back to all wav files in the raw folder

Optional arguments (some are described in more detail in the next section):
+ `-cl` | `--clip`: forced audio slicing; the default 0 means automatic slicing; unit: seconds
+ `-lg` | `--linear_gradient`: crossfade length between two audio slices; adjust this if forced slicing makes the vocals sound discontinuous, and keep the default 0 if they sound continuous; unit: seconds
+ `-f0p` | `--f0_predictor`: F0 predictor, one of crepe, pm, dio, harvest, rmvpe, fcpe; default pm (note: crepe applies a mean filter to the raw F0)
+ `-a` | `--auto_predict_f0`: automatically predict pitch during voice conversion; do not enable this when converting singing voices, or the pitch will drift badly off-key
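
As referenced in the `--model_path` entry above, here is a minimal sketch of how a `{lastest}` placeholder might be resolved; the function name and the `G_<step>.pth` naming assumption are illustrative, not necessarily the repository's exact logic:

```python
import glob
import os

def resolve_latest(model_path: str) -> str:
    """Expand a literal "{lastest}" path component to the newest checkpoint."""
    if "{lastest}" not in model_path:
        return model_path
    ckpt_dir = os.path.dirname(model_path)
    ckpts = glob.glob(os.path.join(ckpt_dir, "G_*.pth"))
    # The checkpoint with the highest step number in its filename wins.
    return max(ckpts, key=lambda p: int(os.path.basename(p)[2:-4]))
```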
107 changes: 107 additions & 0 deletions configs/config.json
@@ -0,0 +1,107 @@
{
"train": {
"log_interval": 200,
"eval_interval": 800,
"seed": 1234,
"epochs": 10000,
"learning_rate": 0.0001,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 6,
"fp16_run": false,
"half_type": "fp16",
"lr_decay": 0.999875,
"segment_size": 10240,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0,
"use_sr": true,
"max_speclen": 512,
"port": "8001",
"keep_ckpts": 3,
"all_in_mem": false,
"vol_aug": false
},
"data": {
"training_files": "filelists/train.txt",
"validation_files": "filelists/val.txt",
"max_wav_value": 32768.0,
"sampling_rate": 44100,
"filter_length": 2048,
"hop_length": 512,
"win_length": 2048,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": 22050,
"unit_interpolate_mode": "nearest"
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
8,
8,
2,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4,
4
],
"n_layers_q": 3,
"n_layers_trans_flow": 3,
"n_flow_layer": 4,
"use_spectral_norm": false,
"gin_channels": 768,
"ssl_dim": 768,
"n_speakers": 1,
"vocoder_name": "nsf-hifigan",
"speech_encoder": "vec768l12",
"speaker_embedding": false,
"vol_embedding": false,
"use_depthwise_conv": false,
"flow_share_parameter": false,
"use_automatic_f0_prediction": true,
"use_transformer_flow": false
},
"spk": {
"TEST": 0
}
}
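
For orientation, a short sketch of reading a few of the keys above; the paths, key names, and values are taken directly from the file, while the snippet itself is illustrative:

```python
import json

with open("configs/config.json") as f:
    cfg = json.load(f)

print(cfg["train"]["batch_size"])    # 6
print(cfg["data"]["sampling_rate"])  # 44100
print(cfg["spk"])                    # {'TEST': 0}: speaker-name to id map
```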
51 changes: 51 additions & 0 deletions configs/diffusion.yaml
@@ -0,0 +1,51 @@
data:
block_size: 512
cnhubertsoft_gate: 10
duration: 2
encoder: vec768l12
encoder_hop_size: 320
encoder_out_channels: 768
encoder_sample_rate: 16000
extensions:
- wav
sampling_rate: 44100
training_files: filelists/train.txt
unit_interpolate_mode: nearest
validation_files: filelists/val.txt
device: cuda
env:
expdir: logs/44k/diffusion
gpu_id: 0
infer:
method: dpm-solver++
speedup: 10
model:
k_step_max: 0
n_chans: 512
n_hidden: 256
n_layers: 20
n_spk: 1
timesteps: 1000
type: Diffusion
use_pitch_aug: true
spk:
TEST: 0
train:
amp_dtype: fp32
batch_size: 48
cache_all_data: true
cache_device: cpu
cache_fp16: true
decay_step: 100000
epochs: 100000
gamma: 0.5
interval_force_save: 5000
interval_log: 10
interval_val: 2000
lr: 0.0001
num_workers: 4
save_opt: false
weight_decay: 0
vocoder:
ckpt: pretrain/nsf_hifigan/model
type: nsf-hifigan
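
A short sketch of reading this file and estimating the effective number of denoising steps; the assumption that `speedup` strides over `timesteps` (1000 / 10 = 100 steps) is a common scheme for accelerated diffusion sampling, not something this file states:

```python
import yaml  # pyyaml

with open("configs/diffusion.yaml") as f:
    cfg = yaml.safe_load(f)

# Assumed relationship: sampling visits roughly timesteps / speedup steps.
steps = cfg["model"]["timesteps"] // cfg["infer"]["speedup"]
print(steps)  # 100
```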
31 changes: 16 additions & 15 deletions filelists/train.txt
@@ -1,15 +1,16 @@
./dataset/44k/taffy/000549.wav
./dataset/44k/nyaru/000004.wav
./dataset/44k/nyaru/000006.wav
./dataset/44k/taffy/000551.wav
./dataset/44k/nyaru/000009.wav
./dataset/44k/taffy/000561.wav
./dataset/44k/nyaru/000001.wav
./dataset/44k/taffy/000553.wav
./dataset/44k/nyaru/000002.wav
./dataset/44k/taffy/000560.wav
./dataset/44k/taffy/000557.wav
./dataset/44k/nyaru/000005.wav
./dataset/44k/taffy/000554.wav
./dataset/44k/taffy/000550.wav
./dataset/44k/taffy/000559.wav
./dataset/44k/TEST/354440190 (9).wav
./dataset/44k/TEST/354440190 (6).wav
./dataset/44k/TEST/354440190 (1).wav
./dataset/44k/TEST/354440190 (11).wav
./dataset/44k/TEST/354440190 (10).wav
./dataset/44k/TEST/354440190 (15).wav
./dataset/44k/TEST/354440190 (12).wav
./dataset/44k/TEST/354440190 (2).wav
./dataset/44k/TEST/354440190 (7).wav
./dataset/44k/TEST/vo_tips_TCG001_4_littlePrince_01.wav
./dataset/44k/TEST/vo_tips_TCG001_37_littlePrince_01.wav
./dataset/44k/TEST/354440190 (5).wav
./dataset/44k/TEST/vo_tips_TCG001_4_littlePrince_02.wav
./dataset/44k/TEST/354440190 (4).wav
./dataset/44k/TEST/354440190 (14).wav
./dataset/44k/TEST/354440190 (13).wav
6 changes: 2 additions & 4 deletions filelists/val.txt
@@ -1,4 +1,2 @@
./dataset/44k/nyaru/000003.wav
./dataset/44k/nyaru/000007.wav
./dataset/44k/taffy/000558.wav
./dataset/44k/taffy/000556.wav
./dataset/44k/TEST/354440190 (3).wav
./dataset/44k/TEST/354440190 (8).wav
103 changes: 54 additions & 49 deletions inference/infer_tool.py
@@ -22,6 +22,8 @@
from inference import slicer
from models import SynthesizerTrn

import logger

Check warning on line 25 in inference/infer_tool.py (GitHub Actions / ruff): [ruff] reported by reviewdog 🐶 Raw Output: inference/infer_tool.py:25:-import logger inference/infer_tool.py:26:-

logging.getLogger('matplotlib').setLevel(logging.WARNING)


@@ -67,6 +69,7 @@
if Path(audio_path).suffix == '.wav':
return
raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)

soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)


@@ -443,56 +446,58 @@

global_frame = 0
audio = []
for (slice_tag, data) in audio_data:
print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
# padd
length = int(np.ceil(len(data) / audio_sr * self.target_sample))
if slice_tag:
print('jump empty segment')
_audio = np.zeros(length)
audio.extend(list(pad_array(_audio, length)))
global_frame += length // self.hop_size
continue
if per_size != 0:
datas = split_list_by_n(data, per_size,lg_size)
else:
datas = [data]
for k,dat in enumerate(datas):
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
if clip_seconds!=0:
print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
with logger.Progress() as progress:
for (slice_tag, data) in progress.track(audio_data):
logger.info(f'segment start, {round(len(data) / audio_sr, 3)}s')
# padd
pad_len = int(audio_sr * pad_seconds)
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
raw_path = io.BytesIO()
soundfile.write(raw_path, dat, audio_sr, format="wav")
raw_path.seek(0)
out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
f0_predictor = f0_predictor,
enhancer_adaptive_key = enhancer_adaptive_key,
cr_threshold = cr_threshold,
k_step = k_step,
frame = global_frame,
spk_mix = use_spk_mix,
second_encoding = second_encoding,
loudness_envelope_adjustment = loudness_envelope_adjustment
)
global_frame += out_frame
_audio = out_audio.cpu().numpy()
pad_len = int(self.target_sample * pad_seconds)
_audio = _audio[pad_len:-pad_len]
_audio = pad_array(_audio, per_length)
if lg_size!=0 and k!=0:
lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
lg_pre = lg1*(1-lg)+lg2*lg
audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
audio.extend(lg_pre)
_audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
audio.extend(list(_audio))
length = int(np.ceil(len(data) / audio_sr * self.target_sample))
if slice_tag:
logger.info('jump empty segment')
_audio = np.zeros(length)
audio.extend(list(pad_array(_audio, length)))
global_frame += length // self.hop_size
continue
if per_size != 0:
datas = split_list_by_n(data, per_size,lg_size)
else:
datas = [data]
for k,dat in enumerate(datas):
per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
if clip_seconds!=0:
logger.info(f'segment clip start, {round(len(dat) / audio_sr, 3)}s')
# padd
pad_len = int(audio_sr * pad_seconds)
dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
raw_path = io.BytesIO()
soundfile.write(raw_path, dat, audio_sr, format="wav")
raw_path.seek(0)
out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
f0_predictor = f0_predictor,
enhancer_adaptive_key = enhancer_adaptive_key,
cr_threshold = cr_threshold,
k_step = k_step,
frame = global_frame,
spk_mix = use_spk_mix,
second_encoding = second_encoding,
loudness_envelope_adjustment = loudness_envelope_adjustment
)
global_frame += out_frame
_audio = out_audio.cpu().numpy()
pad_len = int(self.target_sample * pad_seconds)
_audio = _audio[pad_len:-pad_len]
_audio = pad_array(_audio, per_length)
if lg_size!=0 and k!=0:
lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
lg_pre = lg1*(1-lg)+lg2*lg
audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
audio.extend(lg_pre)
_audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
audio.extend(list(_audio))

return np.array(audio)

class RealTimeVC:
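
The `lg1*(1-lg)+lg2*lg` blending in the loop above is a linear crossfade between adjacent slices, which is what the `-lg` | `--linear_gradient` option controls. A simplified, self-contained sketch of the same idea, ignoring the chunked left/right bookkeeping in the diff:

```python
import numpy as np

def linear_crossfade(a: np.ndarray, b: np.ndarray, overlap: int) -> np.ndarray:
    """Join two audio segments, linearly crossfading over `overlap` samples."""
    fade = np.linspace(0.0, 1.0, num=overlap)
    blended = a[-overlap:] * (1.0 - fade) + b[:overlap] * fade
    return np.concatenate([a[:-overlap], blended, b[overlap:]])
```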