diff --git a/checkpoints/bigvgan/config.json b/checkpoints/bigvgan/config.json deleted file mode 100644 index 635bd89..0000000 --- a/checkpoints/bigvgan/config.json +++ /dev/null @@ -1,63 +0,0 @@ -{ - "resblock": "1", - "num_gpus": 0, - "batch_size": 32, - "learning_rate": 0.0001, - "adam_b1": 0.8, - "adam_b2": 0.99, - "lr_decay": 0.9999996, - "seed": 1234, - - "upsample_rates": [4,4,2,2,2,2], - "upsample_kernel_sizes": [8,8,4,4,4,4], - "upsample_initial_channel": 1536, - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - - "use_tanh_at_final": false, - "use_bias_at_final": false, - - "activation": "snakebeta", - "snake_logscale": true, - - "use_cqtd_instead_of_mrd": true, - "cqtd_filters": 128, - "cqtd_max_filters": 1024, - "cqtd_filters_scale": 1, - "cqtd_dilations": [1, 2, 4], - "cqtd_hop_lengths": [512, 256, 256], - "cqtd_n_octaves": [9, 9, 9], - "cqtd_bins_per_octaves": [24, 36, 48], - - "mpd_reshapes": [2, 3, 5, 7, 11], - "use_spectral_norm": false, - "discriminator_channel_mult": 1, - - "use_multiscale_melloss": true, - "lambda_melloss": 15, - - "clip_grad_norm": 500, - - "segment_size": 65536, - "num_mels": 80, - "num_freq": 1025, - "n_fft": 1024, - "hop_size": 256, - "win_size": 1024, - - "sampling_rate": 22050, - - "fmin": 0, - "fmax": null, - "fmax_for_loss": null, - - "normalize_volume": true, - - "num_workers": 4, - - "dist_config": { - "dist_backend": "nccl", - "dist_url": "tcp://localhost:54321", - "world_size": 1 - } -} diff --git a/checkpoints/config.yaml b/checkpoints/config.yaml deleted file mode 100644 index 5067b2e..0000000 --- a/checkpoints/config.yaml +++ /dev/null @@ -1,120 +0,0 @@ -dataset: - bpe_model: bpe.model - sample_rate: 24000 - squeeze: false - mel: - sample_rate: 24000 - n_fft: 1024 - hop_length: 256 - win_length: 1024 - n_mels: 100 - mel_fmin: 0 - normalize: false - -gpt: - model_dim: 1280 - max_mel_tokens: 1815 - max_text_tokens: 600 - heads: 20 - use_mel_codes_as_input: true - mel_length_compression: 1024 - layers: 24 - number_text_tokens: 12000 - number_mel_codes: 8194 - start_mel_token: 8192 - stop_mel_token: 8193 - start_text_token: 0 - stop_text_token: 1 - train_solo_embeddings: false - condition_type: "conformer_perceiver" - condition_module: - output_size: 512 - linear_units: 2048 - attention_heads: 8 - num_blocks: 6 - input_layer: "conv2d2" - perceiver_mult: 2 - emo_condition_module: - output_size: 512 - linear_units: 1024 - attention_heads: 4 - num_blocks: 4 - input_layer: "conv2d2" - perceiver_mult: 2 - -semantic_codec: - codebook_size: 8192 - hidden_size: 1024 - codebook_dim: 8 - vocos_dim: 384 - vocos_intermediate_dim: 2048 - vocos_num_layers: 12 - -s2mel: - preprocess_params: - sr: 22050 - spect_params: - n_fft: 1024 - win_length: 1024 - hop_length: 256 - n_mels: 80 - fmin: 0 - fmax: "None" - - dit_type: "DiT" - reg_loss_type: "l1" - style_encoder: - dim: 192 - length_regulator: - channels: 512 - is_discrete: false - in_channels: 1024 - content_codebook_size: 2048 - sampling_ratios: [1, 1, 1, 1] - vector_quantize: false - n_codebooks: 1 - quantizer_dropout: 0.0 - f0_condition: false - n_f0_bins: 512 - DiT: - hidden_dim: 512 - num_heads: 8 - depth: 13 - class_dropout_prob: 0.1 - block_size: 8192 - in_channels: 80 - style_condition: true - final_layer_type: 'wavenet' - target: 'mel' - content_dim: 512 - content_codebook_size: 1024 - content_type: 'discrete' - f0_condition: false - n_f0_bins: 512 - content_codebooks: 1 - is_causal: false - long_skip_connection: true - zero_prompt_speech_token: false - time_as_token: false - style_as_token: false - uvit_skip_connection: true - add_resblock_in_transformer: false - wavenet: - hidden_dim: 512 - num_layers: 8 - kernel_size: 5 - dilation_rate: 1 - p_dropout: 0.2 - style_condition: true - -gpt_checkpoint: gpt.pth -w2v_stat: wav2vec2bert_stats.pt -s2mel_checkpoint: s2mel.pth -emo_matrix: feat2.pt -spk_matrix: feat1.pt -emo_num: [3, 17, 2, 8, 4, 5, 10, 24] -qwen_emo_path: qwen0.6bemo4-merge/ -vocoder: - type: "bigvgan" - name: "bigvgan" -version: 2.0