{ "architectures": [ "VocosEncodecModel" ], "bandwidths": [ 1.5, 3.0, 6.0, 12.0 ], "codebook_dim": 128, "dtype": "float32", "hidden_act": "gelu", "hidden_size": 384, "hop_length": 320, "intermediate_size": 1152, "istft_padding": "same", "kernel_size": 7, "layer_norm_eps": 1e-06, "layer_scale_init_value": 0.125, "model_type": "vocos_encodec", "n_fft": 1280, "num_layers": 8, "num_quantizers": 16384, "padding": 3, "sample_rate": 24000, "transformers_version": "4.57.0.dev0" }