diff --git a/denoiser.hpp b/denoiser.hpp
index 32f402786..b0b9391f1 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -481,6 +481,17 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
+// x0-prediction variant of CompVisDenoiser: get_scalings() ignores sigma and
+// always returns {c_skip, c_out, c_in} = {0, 1, 1}.
+struct ComVisX0Denoiser : public CompVisDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out  = 1.0f;
+        float c_in   = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 struct EDMVDenoiser : public CompVisVDenoiser {
     float min_sigma = 0.002;
     float max_sigma = 120.0;
@@ -568,6 +579,17 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
+// x0-prediction variant of DiscreteFlowDenoiser: get_scalings() ignores sigma
+// and always returns {c_skip, c_out, c_in} = {0, 1, 1}.
+struct DiscreteFlowX0Denoiser : public DiscreteFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out  = 1.0f;
+        float c_in   = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 float flux_time_shift(float mu, float sigma, float t) {
     return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));
 }
@@ -631,6 +653,17 @@ struct FluxFlowDenoiser : public Denoiser {
     }
 };
 
+// x0-prediction variant of FluxFlowDenoiser: get_scalings() ignores sigma and
+// always returns {c_skip, c_out, c_in} = {0, 1, 1}.
+struct FluxFlowX0Denoiser : public FluxFlowDenoiser {
+    std::vector<float> get_scalings(float sigma) override {
+        float c_skip = 0.0f;
+        float c_out  = 1.0f;
+        float c_in   = 1.0f;
+        return {c_skip, c_out, c_in};
+    }
+};
+
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
     Flux2FlowDenoiser() = default;
 
diff --git a/flux.hpp b/flux.hpp
index 1df2874ae..6e56bd648 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -781,7 +781,9 @@ namespace Flux {
         Flux(FluxParams params)
             : params(params) {
             if (params.version == VERSION_CHROMA_RADIANCE) {
-                std::pair kernel_size = {(int)params.patch_size, (int)params.patch_size};
+                // The patch-embedding conv is always 16x16; for larger patch sizes the
+                // image is downscaled by patch_size / 16 before the conv is applied.
+                std::pair kernel_size = {16, 16};
                 std::pair stride = kernel_size;
 
                 blocks["img_in_patch"] = std::make_shared(params.in_channels,
@@ -1068,6 +1070,14 @@ namespace Flux {
             auto img = pad_to_patch_size(ctx->ggml_ctx, x);
             auto orig_img = img;
 
+            if (patch_size != 16) {
+                int ratio = patch_size / 16;
+                // It's supposed to be using GGML_SCALE_MODE_NEAREST, but this seems more stable
+                // Maybe the implementation of nearest-neighbor interpolation in ggml behaves differently than the one in PyTorch?
+                // img = F.interpolate(img, size=(H//2, W//2), mode="nearest")
+                img = ggml_interpolate(ctx->ggml_ctx, img, W / ratio, H / ratio, C, x->ne[3], GGML_SCALE_MODE_BILINEAR);
+            }
+
             auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]);
             img = img_in_patch->forward(ctx, img);  // [N, hidden_size, H/patch_size, W/patch_size]
 
@@ -1290,6 +1300,10 @@ namespace Flux {
                     // not schnell
                     flux_params.guidance_embed = true;
                 }
+                // "__32x32__" in a tensor name selects a 32x32 patch size
+                if (tensor_name.find("__32x32__") != std::string::npos) {
+                    flux_params.patch_size = 32;
+                }
                 if (tensor_name.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                     // Chroma
                     flux_params.is_chroma = true;
diff --git a/model.cpp b/model.cpp
index 0480efefb..131de3c32 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1731,7 +1731,14 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         // tensor_storage.n_dims,
         // tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
         // tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
-
+
+        if (!tensor->data) {
+            GGML_ASSERT(ggml_nelements(tensor) == 0);
+            // avoid crashing the gguf writer by setting a dummy pointer for zero-sized tensors
+            LOG_DEBUG("setting dummy pointer for zero-sized tensor %s", name.c_str());
+            tensor->data = ggml_get_mem_buffer(ggml_ctx);
+        }
+
         *dst_tensor = tensor;
 
         gguf_add_tensor(gguf_ctx, tensor);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2cb588213..324a47205 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -828,7 +828,13 @@ class StableDiffusionGGML {
                     }
                 }
             } else if (sd_version_is_flux(version)) {
-                pred_type = FLUX_FLOW_PRED;
+                // a "model.diffusion_model.__x0__" tensor marks an x0-prediction checkpoint
+                if (tensor_storage_map.find("model.diffusion_model.__x0__") != tensor_storage_map.end()) {
+                    pred_type = FLUX_FLOW_X0_PRED;
+                } else {
+                    pred_type = FLUX_FLOW_PRED;
+                }
+
                 if (flow_shift == INFINITY) {
                     flow_shift = 1.0f;  // TODO: validate
                     for (const auto& [name, tensor_storage] : tensor_storage_map) {
@@ -871,6 +877,11 @@ class StableDiffusionGGML {
                     denoiser = std::make_shared();
                     break;
                 }
+                case FLUX_FLOW_X0_PRED: {
+                    LOG_INFO("running in x0-prediction Flux FLOW mode");
+                    denoiser = std::make_shared<FluxFlowX0Denoiser>();
+                    break;
+                }
                 default: {
                     LOG_ERROR("Unknown predition type %i", pred_type);
                     ggml_free(ctx);
@@ -1316,9 +1327,9 @@ class StableDiffusionGGML {
         uint32_t dim = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
-            int64_t patch_sz                       = 1;
-            const float(*latent_rgb_proj)[channel] = nullptr;
-            float* latent_rgb_bias                 = nullptr;
+            int64_t patch_sz                        = 1;
+            const float (*latent_rgb_proj)[channel] = nullptr;
+            float* latent_rgb_bias                  = nullptr;
 
             if (dim == 128) {
                 if (sd_version_is_flux2(version)) {
@@ -2424,6 +2435,7 @@ const char* prediction_to_str[] = {
     "edm_v",
     "sd3_flow",
     "flux_flow",
+    "flux_flow_x0",
     "flux2_flow",
 };
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e4abc8dcd..ee56099ba 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -70,6 +70,7 @@ enum prediction_t {
     EDM_V_PRED,
     FLOW_PRED,
     FLUX_FLOW_PRED,
+    FLUX_FLOW_X0_PRED,
     FLUX2_FLOW_PRED,
     PREDICTION_COUNT
 };