diff --git a/config.py b/config.py index 5b72235b5..2e9a6e792 100644 --- a/config.py +++ b/config.py @@ -85,14 +85,17 @@ def __init__(self): self.noautoopen, self.paperspace, self.is_cli, + self.simple_cli, + self.simple_cli_args, ) = self.arg_parse() - + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() @staticmethod def arg_parse() -> tuple: exe = sys.executable or "python" parser = argparse.ArgumentParser() + subparser = parser.add_subparsers() parser.add_argument("--port", type=int, default=7865, help="Listen port") parser.add_argument("--pycmd", type=str, default=exe, help="Python command") parser.add_argument("--colab", action="store_true", help="Launch in colab") @@ -114,8 +117,231 @@ def arg_parse() -> tuple: action="store_true", help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!", ) + parser.add_argument( # Fork Feature. Embed a CLI into the infer-web.py + "--simple_cli", choices=["infer", "pre-process", "extract-feature", "train", "train-feature", "extract-model", "uvr", ""], default="", help="Use the simpler CLI instead of the cli interface. Choose from 1) pre-process 2) extract-feature 3) WIP." + ) + + # Arguments for simple cli usage. 
+ parser.add_argument( + "--exp_name", type=str, default="mi-test", help="Experiment name" + ) + parser.add_argument( + "--trainset_dir", + type=str, + default="", + help="Trainset directory", + ) + parser.add_argument( + "--sample_rate", choices=["32k", "40k", "48k"], default="40k", help="Sample rate: 40k (32k, 40k, 48k)" + ) + parser.add_argument( + "--n_workers", type=int, default=8, help="Number of cpu threads to work" + ) + parser.add_argument( + "--gpu", type=int, default=0, help="GPU device index to use" + ) + parser.add_argument( + "--is_pitch_guidance", + type=bool, + default=True, + help="Use pitch guidance (1 for True 0 for False)", + ) + parser.add_argument( + "--f0_method", + type=str, + default="crepe", + help="F0 extraction method", + ) + parser.add_argument( + "--crepe_hop_length", + type=int, + default=128, + help="Hop length for crepe", + ) + parser.add_argument( + "--rvc_version", + choices=["v1", "v2"], + default="v2", + help="RVC version", + ) + parser.add_argument( + "--speaker_id", + type=int, + default=0, + help="Speaker id for multi-speaker model", + ) + parser.add_argument( + "--save_epoch_iter", + type=int, + default=5, + help="Save model every n iterations", + ) + parser.add_argument( + "--epochs", type=int, default=20, help="Number of epochs to train" + ) + parser.add_argument( + "--batch_size", type=int, default=8, help="Batch size for training" + ) + parser.add_argument( + "--latest_ckpt_only", + type=bool, + default=False, + help="Save only the latest checkpoint", + ) + parser.add_argument( + "--cache_trainset", + type=bool, + default=False, + help="Whether to cache training set to vram", + ) + parser.add_argument( + "--save_small_model", + type=bool, + default=False, + help="Save extracted small model every generation?", + ) + + parser.add_argument( + "--model_file_name", + type=str, + default="", + help="Model name with .pth in ./weights", + ) + parser.add_argument( + "--source_audio_path", + type=str, + default="", + help="Source 
audio path for inference", + ) + parser.add_argument( + "--output_file_name", + type=str, + default="output.wav", + help="Output file name to be placed in './audio-outputs'", + ) + parser.add_argument( + "--feature_index_path", + type=str, + default="", + help="Feature index file path", + ) + parser.add_argument( + "--transposition", + type=int, + default=0, + help="Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12)", + ) + parser.add_argument( + "--infer_f0_method", + type=str, + default="crepe", + help="F0 extraction method for inference", + ) + parser.add_argument( + "--harvest_median_filter_radius", + type=int, + default=3, + help="Harvest median filter radius, default 3.", + ) + parser.add_argument( + "--post_sample_rate", + type=int, + default=0, + help="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling.", + ) + parser.add_argument( + "--mix_volume_envelope", + type=float, + default=0.25, + help="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used.", + ) + parser.add_argument( + "--feature_index_ratio", + type=float, + default=0.33, + help="Feature index ratio for inference.", + ) + parser.add_argument( + "--voiceless_consonant_protection", + type=float, + default=0.33, + help="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy." 
+ ) + parser.add_argument( + "--model_path", + type=str, + default="", + help="Model path for extract-model", + ) + parser.add_argument( + "--model_save_name", + type=str, + default="", + help="Model save name for extract-model", + ) + parser.add_argument( + "--model_info", + type=str, + default="", + help="Model info for extract-model", + ) + parser.add_argument( + "--cmd_help", + action="store_true", + help="Print help for simple cli", + ) + # Add --agg and --format + parser.add_argument( + "--agg", + type=int, + default=10, + help="Aggregation for uvr5", + ) + parser.add_argument( + "--format", + type=str, + default="flac", + help="Audio format", + ) + parser.add_argument( + "--uvr5_weight_name", + type=str, + default="", + help="UVR5 weight name", + ) + parser.add_argument( + "--formant_shift", + action="store_true", + help="Whether to formant shift the inference audio before conversion: False (if set to false, you can ignore setting the quefrency and timbre values for formanting)", + ) + parser.add_argument( + "--formant_quefrency", + type=float, + default=8.0, + help="Quefrency for formanting: 8.0 (no need to set if arg14 is False/false)", + ) + parser.add_argument( + "--formant_timbre", + type=float, + default=1.2, + help="Timbre for formanting: 1.2 (no need to set if arg14 is False/false)", + ) + cmd_opts = parser.parse_args() + args_to_assign = ['exp_name', 'trainset_dir', 'sample_rate', 'n_workers', 'gpu', + 'is_pitch_guidance', 'f0_method', 'crepe_hop_length', 'rvc_version', + 'speaker_id', 'save_epoch_iter', 'epochs', 'batch_size', + 'latest_ckpt_only', 'cache_trainset', 'save_small_model', + 'model_file_name', 'source_audio_path', 'output_file_name', + 'feature_index_path', 'transposition', 'infer_f0_method', + 'harvest_median_filter_radius', 'post_sample_rate', + 'mix_volume_envelope', 'feature_index_ratio', + 'voiceless_consonant_protection', 'model_path', + 'model_save_name', 'model_info', 'cmd_help', 'agg', 'format', 'uvr5_weight_name', + 
'formant_shift', 'formant_quefrency', 'formant_timbre'] + simple_cli_args = argparse.Namespace(**{arg: getattr(cmd_opts, arg) for arg in args_to_assign}) + cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 return ( @@ -126,6 +352,8 @@ def arg_parse() -> tuple: cmd_opts.noautoopen, cmd_opts.paperspace, cmd_opts.is_cli, + cmd_opts.simple_cli, + simple_cli_args, ) # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+. diff --git a/infer-web.py b/infer-web.py index f27da985e..5f1c22786 100644 --- a/infer-web.py +++ b/infer-web.py @@ -39,7 +39,7 @@ from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM from infer_uvr5 import _audio_pre_, _audio_pre_new from MDXNet import MDXNetDereverb -from my_utils import load_audio, CSVutil +from my_utils import load_audio, CSVutil, get_folder_name from train.process_ckpt import change_info, extract_small_model, merge, show_info from vc_infer_pipeline import VC from sklearn.cluster import MiniBatchKMeans @@ -157,6 +157,8 @@ def load_hubert(): weight_root = "weights" weight_uvr5_root = "uvr5_weights" +uvr5_vocal_root = os.path.join("uvr5_outputs", "vocal") +uvr5_inst_root = os.path.join("uvr5_outputs", "inst") index_root = "./logs/" audio_root = "audios" names = [] @@ -402,6 +404,7 @@ def vc_multi( yield "\n".join(infos) yield "\n".join(infos) except: + traceback.print_exc() yield traceback.format_exc() @@ -426,11 +429,13 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format is_half=config.is_half, ) if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + if os.path.isdir(inp_root): + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [inp_root] else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) + paths = [os.path.join(inp_root, path.name) for path in paths] + for inp_path in paths: need_reformat = 1 done = 0 try: @@ -1547,6 
+1552,7 @@ def execute_generator_function(genObject): def cli_infer(com): # get VC first com = cli_split_command(com) + print(com) model_name = com[0] source_audio_path = com[1] output_file_name = com[2] @@ -1565,14 +1571,13 @@ def cli_infer(com): protection_amnt = float(com[12]) protect1 = 0.5 - if com[14] == "False" or com[14] == "false": + if com[14] == "False" or com[14] == "false" or com[14] == False: DoFormant = False Quefrency = 0.0 Timbre = 0.0 CSVutil( "csvdb/formanting.csv", "w+", "formanting", DoFormant, Quefrency, Timbre ) - else: DoFormant = True Quefrency = float(com[15]) @@ -1585,39 +1590,61 @@ def cli_infer(com): vc_data = get_vc(model_name, protection_amnt, protect1) print(vc_data) print("Mangio-RVC-Fork Infer-CLI: Performing inference...") - conversion_data = vc_single( - speaker_id, - source_audio_path, - source_audio_path, - transposition, - f0_file, - f0_method, - feature_index_path, - feature_index_path, - feature_ratio, - harvest_median_filter, - resample, - mix, - protection_amnt, - crepe_hop_length, - ) - if "Success." in conversion_data[0]: - print( - "Mangio-RVC-Fork Infer-CLI: Inference succeeded. Writing to %s/%s..." - % ("audio-outputs", output_file_name) - ) - wavfile.write( - "%s/%s" % ("audio-outputs", output_file_name), - conversion_data[1][0], - conversion_data[1][1], - ) - print( - "Mangio-RVC-Fork Infer-CLI: Finished! Saved output to %s/%s" - % ("audio-outputs", output_file_name) - ) + # Check if source_audio_path is a folder, if so, use vc_multi instead. 
+ if os.path.isdir(source_audio_path): + opt_root = os.path.abspath("multi-audio-outputs/") + # Get the folder name of source_audio_path + source_audio_path_folder_name = get_folder_name(source_audio_path) + opt_root = os.path.join(opt_root, source_audio_path_folder_name) + os.makedirs(opt_root, exist_ok=True) + for res in vc_multi( + sid=speaker_id, + dir_path=source_audio_path, + opt_root=opt_root, + paths=None, + f0_up_key=transposition, + f0_method=f0_method, + file_index=feature_index_path, + file_index2=feature_index_path, + index_rate=feature_ratio, + filter_radius=harvest_median_filter, + resample_sr=resample, + rms_mix_rate=mix, + protect=protection_amnt, + format1="flac", + crepe_hop_length=crepe_hop_length, + ): + if "Success." in res: + print("Mangio-RVC-Fork Infer-CLI: Inference succeeded. Writing to %s" % opt_root) + else: + print("Mangio-RVC-Fork Infer-CLI: Inference failed. Here's the traceback: ") + print(res) else: - print("Mangio-RVC-Fork Infer-CLI: Inference failed. Here's the traceback: ") - print(conversion_data[0]) + print("Mangio-RVC-Fork Infer-CLI: Detected file. Using vc_single...") + conversion_data = vc_single( + sid=speaker_id, + input_audio_path0=source_audio_path, + input_audio_path1=None, + f0_up_key=transposition, + f0_file=f0_file, + f0_method=f0_method, + file_index=feature_index_path, + file_index2=feature_index_path, + index_rate=feature_ratio, + filter_radius=harvest_median_filter, + resample_sr=resample, + rms_mix_rate=mix, + protect=protection_amnt, + crepe_hop_length=crepe_hop_length, + ) + + if "Success." in conversion_data[0]: + print("Mangio-RVC-Fork Infer-CLI: Inference succeeded. Writing to %s/%s..." % ('audio-outputs', output_file_name)) + wavfile.write('%s/%s' % ('audio-outputs', output_file_name), conversion_data[1][0], conversion_data[1][1]) + print("Mangio-RVC-Fork Infer-CLI: Finished! Saved output to %s/%s" % ('audio-outputs', output_file_name)) + else: + print("Mangio-RVC-Fork Infer-CLI: Inference failed. 
def cli_uvr(com):
    """Run UVR5 separation from a single CLI command string.

    The command is tokenised with ``cli_split_command`` and the tokens are
    used positionally:

    * token 0 -> ``model_name`` passed to ``uvr``
    * token 1 -> ``inp_root`` passed to ``uvr``
    * token 2 -> ``agg`` passed to ``uvr`` (forwarded as-is, no conversion here)
    * token 3 -> ``format0`` passed to ``uvr``

    Output locations are fixed to the module-level ``uvr5_vocal_root`` and
    ``uvr5_inst_root``; ``paths`` is always ``None``.

    Every value produced by iterating ``uvr(...)`` is reported on stdout:
    values containing the substring "Success" are reported as a success,
    anything else is printed back as a failure message.
    """
    tokens = cli_split_command(com)
    print("Mangio-RVC-Fork UVR: Starting... Please wait")

    progress = uvr(
        model_name=tokens[0],
        inp_root=tokens[1],
        save_root_vocal=uvr5_vocal_root,
        paths=None,
        save_root_ins=uvr5_inst_root,
        agg=tokens[2],
        format0=tokens[3],
    )
    for status in progress:
        # NOTE(review): the check is a plain substring match on whatever uvr
        # yields; confirm against uvr() that a failure can never contain
        # the word "Success".
        if "Success" not in status:
            print(f"Mangio-RVC-Fork UVR: Failed! Please check: {status}")
            continue
        print("Mangio-RVC-Fork UVR: Success!")
+ command = f"{config.simple_cli_args.model_file_name} \ + {config.simple_cli_args.source_audio_path} \ + {config.simple_cli_args.output_file_name} \ + {config.simple_cli_args.feature_index_path} \ + {config.simple_cli_args.speaker_id} \ + {config.simple_cli_args.transposition} \ + {config.simple_cli_args.infer_f0_method} \ + {config.simple_cli_args.crepe_hop_length} \ + {config.simple_cli_args.harvest_median_filter_radius} \ + {config.simple_cli_args.post_sample_rate} \ + {config.simple_cli_args.mix_volume_envelope} \ + {config.simple_cli_args.feature_index_ratio} \ + {config.simple_cli_args.voiceless_consonant_protection} \ + 0.45 \ + {config.simple_cli_args.formant_shift} \ + {config.simple_cli_args.formant_quefrency} \ + {config.simple_cli_args.formant_timbre}" + func = cli_infer + elif config.simple_cli == "pre-process": + command = f"{config.simple_cli_args.exp_name} \ + {config.simple_cli_args.trainset_dir} \ + {config.simple_cli_args.sample_rate} \ + {config.simple_cli_args.n_workers}" + func = cli_pre_process + elif config.simple_cli == "extract-feature": + command = f"{config.simple_cli_args.exp_name} \ + {config.simple_cli_args.gpu} \ + {config.simple_cli_args.n_workers} \ + {int(config.simple_cli_args.is_pitch_guidance)} \ + {config.simple_cli_args.f0_method} \ + {config.simple_cli_args.crepe_hop_length} \ + {config.simple_cli_args.rvc_version}" + func = cli_extract_feature + elif config.simple_cli == "train": + command = f"{config.simple_cli_args.exp_name} \ + {config.simple_cli_args.sample_rate} \ + {int(config.simple_cli_args.is_pitch_guidance)} \ + {config.simple_cli_args.speaker_id} \ + {config.simple_cli_args.save_epoch_iter} \ + {config.simple_cli_args.epochs} \ + {config.simple_cli_args.batch_size} \ + {config.simple_cli_args.gpu} \ + {int(config.simple_cli_args.latest_ckpt_only)} \ + {int(config.simple_cli_args.cache_trainset)} \ + {int(config.simple_cli_args.save_small_model)} \ + {config.simple_cli_args.rvc_version}" + func = cli_train + elif 
def get_folder_name(path):
    """Return the last component of a directory path.

    The path is normalised first, so a trailing separator is ignored
    (``"a/b/c/"`` and ``"a/b/c"`` both give ``"c"``).

    Args:
        path: Path to a directory (it does not have to exist unless its
            final component contains a dot, see below).

    Returns:
        The final path component as a string.

    Raises:
        ValueError: If the final component contains a ``.`` (i.e. it looks
            like a file name) and ``path`` is not an existing directory.
    """
    folder_name = os.path.basename(os.path.normpath(path))

    # A dot in the last component is treated as "this is a file name", but a
    # real directory may legitimately contain one (e.g. "take.01"), so an
    # existing directory is always accepted.
    if "." in folder_name and not os.path.isdir(path):
        # The exception must be raised, not returned: a returned ValueError
        # instance would be passed on by callers as if it were a folder name.
        raise ValueError("The path is not a directory path")

    return folder_name