diff --git a/rules/cre-2025-0140/sd-webui-oom.yaml b/rules/cre-2025-0140/sd-webui-oom.yaml
new file mode 100644
index 0000000..8bfb98f
--- /dev/null
+++ b/rules/cre-2025-0140/sd-webui-oom.yaml
@@ -0,0 +1,72 @@
+rules:
+  - cre:
+      id: CRE-2025-0140
+      severity: 1
+      title: Stable Diffusion WebUI CUDA Out of Memory during batch processing
+      category: memory-problem
+      author: Prequel
+      description: |
+        Stable Diffusion WebUI encounters a CUDA out of memory (OOM) error during batch image generation,
+        causing the entire generation pipeline to crash. This typically occurs when the batch size or image
+        dimensions exceed available VRAM capacity, particularly on consumer GPUs with limited memory.
+      cause: |
+        The root cause is attempting to allocate more VRAM than is available for tensor operations during
+        the cross-attention computation phase. Common triggers include:
+        - Batch size set too high for available VRAM
+        - High-resolution images (1024x1024 or larger) without optimization flags
+        - Multiple ControlNet or LoRA models loaded simultaneously
+        - Insufficient use of memory optimization options (--medvram, --lowvram)
+      impact: |
+        - Immediate termination of the generation pipeline
+        - Loss of all in-progress image generations in the batch
+        - Potential corruption of partially generated images
+        - API endpoints return error responses, breaking automation workflows
+        - System requires manual intervention to recover
+      impactScore: 8
+      mitigation: |
+        Immediate fixes:
+        1. Reduce batch size (set to 1-2 for consumer GPUs)
+        2. Enable memory optimization flags:
+           - Use --medvram for 6-8GB VRAM GPUs
+           - Use --lowvram for 4-6GB VRAM GPUs
+           - Use --use-cpu all to fall back to CPU generation (needs no VRAM, but is much slower)
+        3. Reduce image dimensions to 512x512 or 768x768
+        4. Enable memory-efficient cross-attention with --xformers or --opt-split-attention
+        5. Clear VRAM before generation: torch.cuda.empty_cache()
+
+        Long-term solutions:
+        - Implement dynamic batch size adjustment based on available VRAM
+        - Add pre-flight VRAM checks before starting generation
+        - Upgrade to a GPU with more VRAM (16GB+ recommended)
+      mitigationScore: 3
+      tags:
+        - stable-diffusion
+        - cuda
+        - gpu
+        - memory-leak
+        - batch-processing
+        - pytorch
+        - webui
+        - automatic1111
+      references:
+        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Optimizations
+        - https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/8035
+        - https://pytorch.org/docs/stable/notes/cuda.html#memory-management
+      applications:
+        - name: stable-diffusion-webui
+          version: ">= 1.0.0"
+        - name: pytorch
+          version: ">= 2.0.0"
+    metadata:
+      kind: prequel
+      id: SEKbLKEnpyQo2mkKvFZYXo
+      gen: 1
+    rule:
+      set:
+        window: 10s
+        event:
+          source: cre.log.sdwebui
+        match:
+          - regex: 'CUDA out of memory\. Tried to allocate .* GiB'
+          - regex: 'torch\.cuda\.OutOfMemoryError: CUDA out of memory'
+          - value: 'Generation failed: Out of memory error'
\ No newline at end of file
diff --git a/rules/cre-2025-0140/test.log b/rules/cre-2025-0140/test.log
new file mode 100644
index 0000000..00d053f
--- /dev/null
+++ b/rules/cre-2025-0140/test.log
@@ -0,0 +1,36 @@
+2025-08-27 22:44:48,804 - INFO - [modules.shared] - Loading config from: /home/user/stable-diffusion-webui/config.json
+2025-08-27 22:44:48,804 - INFO - [modules.shared] - Command line args: Namespace(batch_size=8, medvram=False, lowvram=False)
+2025-08-27 22:44:48,804 - INFO - [modules.sd_models] - Loading model from cache: v1-5-pruned-emaonly.safetensors
+2025-08-27 22:44:48,804 - INFO - [modules.devices] - CUDA available: True, Device: cuda:0 (NVIDIA GeForce RTX 3090)
+2025-08-27 22:44:48,804 - INFO - [modules.devices] - Total VRAM: 24576 MB, Available: 18432 MB
+2025-08-27 22:44:48,804 - INFO - [modules.txt2img] - Starting batch generation: 8 images, 512x512, steps: 20
+2025-08-27 22:44:48,904 - WARNING - [modules.devices] - High VRAM usage detected: 22528 MB / 24576 MB (91.7%)
+2025-08-27 22:44:48,905 - INFO - [modules.processing] - Processing batch 1/8...
+2025-08-27 22:44:48,905 - WARNING - [torch.cuda] - Allocated memory: 23.5 GB, Reserved: 24.0 GB
+2025-08-27 22:44:48,905 - INFO - [modules.processing] - Processing batch 2/8...
+2025-08-27 22:44:48,905 - WARNING - [modules.devices] - VRAM usage critical: 24064 MB / 24576 MB (97.9%)
+2025-08-27 22:44:49,005 - INFO - [modules.processing] - Processing batch 3/8...
+2025-08-27 22:44:49,005 - ERROR - [torch.cuda] - CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 24.00 GiB total capacity; 22.50 GiB already allocated; 512.00 MiB free; 23.00 GiB reserved in total by PyTorch)
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] - RuntimeError: CUDA out of memory during cross-attention computation
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] - Traceback (most recent call last):
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] -   File "modules/processing.py", line 732, in process_images_inner
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] -     samples = sampler.sample(p, x, conditioning, unconditional_conditioning)
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] -   File "modules/sd_samplers_kdiffusion.py", line 234, in sample
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] -     samples = self.launch_sampling(steps, lambda: self.func)
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] - torch.cuda.OutOfMemoryError: CUDA out of memory
+2025-08-27 22:44:49,005 - CRITICAL - [modules.shared] - Generation failed: Out of memory error
+2025-08-27 22:44:49,005 - ERROR - [modules.api.api] - API error: {'error': 'OutOfMemoryError', 'detail': 'CUDA out of memory. Try reducing batch size or image dimensions'}
+2025-08-27 22:44:49,005 - WARNING - [modules.devices] - Attempting VRAM cleanup...
+2025-08-27 22:44:49,005 - INFO - [torch.cuda] - Clearing cache and collecting garbage
+2025-08-27 22:44:49,005 - ERROR - [modules.processing] - Batch processing interrupted at item 3 of 8
+2025-08-27 22:44:49,005 - ERROR - [modules.ui] - Generation failed after processing 2 images
+2025-08-27 22:44:49,105 - INFO - [modules.devices] - Attempting automatic recovery...
+2025-08-27 22:44:49,105 - INFO - [torch.cuda] - torch.cuda.empty_cache() called
+2025-08-27 22:44:49,105 - INFO - [modules.shared] - Reducing batch size from 8 to 4
+2025-08-27 22:44:49,105 - WARNING - [modules.processing] - Restarting generation with reduced settings
+2025-08-27 22:44:49,105 - INFO - [modules.devices] - VRAM after cleanup: 8192 MB available
+2025-08-27 22:44:49,205 - INFO - [modules.processing] - Processing batch 1/2 (4 images)...
+2025-08-27 22:44:49,505 - INFO - [modules.processing] - Processing batch 2/2 (4 images)...
+2025-08-27 22:44:49,805 - WARNING - [modules.devices] - VRAM usage: 20480 MB / 24576 MB (83.3%)
+2025-08-27 22:44:50,005 - INFO - [modules.processing] - Batch processing completed: 8 of 8 images
+2025-08-27 22:44:50,006 - INFO - [modules.txt2img] - Generation complete: 8 images generated
\ No newline at end of file
diff --git a/rules/tags/tags.yaml b/rules/tags/tags.yaml
index 1acb1dc..2633f96 100644
--- a/rules/tags/tags.yaml
+++ b/rules/tags/tags.yaml
@@ -844,4 +844,25 @@ tags:
     description: Issues with Kubernetes pod scheduling due to resource constraints or networking problems
   - name: cluster-scaling
     displayName: Cluster Scaling
-    description: Problems related to Kubernetes cluster scaling operations and capacity management
\ No newline at end of file
+    description: Problems related to Kubernetes cluster scaling operations and capacity management
+  - name: stable-diffusion
+    displayName: Stable Diffusion
+    description: Issues related to Stable Diffusion image generation models
+  - name: automatic1111
+    displayName: AUTOMATIC1111 WebUI
+    description: Problems specific to the AUTOMATIC1111 Stable Diffusion WebUI
+  - name: webui
+    displayName: Web UI
+    description: Issues with web user interface components
+  - name: batch-processing
+    displayName: Batch Processing
+    description: Problems occurring during batch job processing
+  - name: cuda
+    displayName: CUDA
+    description: NVIDIA CUDA GPU computation issues
+  - name: gpu
+    displayName: GPU
+    description: Graphics Processing Unit related problems
+  - name: pytorch
+    displayName: PyTorch
+    description: Issues related to PyTorch deep learning framework
\ No newline at end of file
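---

Two illustrative sketches follow; neither is part of the patch above.

The rule's match block needs all three patterns to fire inside the 10s window (this assumes the set matcher ANDs its entries and that a value: entry is a literal substring match — both assumptions about Prequel semantics, not confirmed by the patch). A quick way to sanity-check the patterns against the test.log fixture is plain Python:

import re

# The three matchers from sd-webui-oom.yaml. The 'value:' entry is treated
# here as a literal substring (an assumption about matcher semantics), so
# re.escape() keeps it exact.
PATTERNS = [
    re.compile(r'CUDA out of memory\. Tried to allocate .* GiB'),
    re.compile(r'torch\.cuda\.OutOfMemoryError: CUDA out of memory'),
    re.compile(re.escape('Generation failed: Out of memory error')),
]

with open('rules/cre-2025-0140/test.log') as fh:
    log = fh.read()

# Every matcher must hit at least once, or the rule cannot fire on this fixture.
for pattern in PATTERNS:
    assert pattern.search(log), f'no match for {pattern.pattern!r}'
print('all three matchers hit test.log')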
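The long-term mitigations (pre-flight VRAM checks, dynamic batch size adjustment) can be sketched with public PyTorch APIs. torch.cuda.mem_get_info, torch.cuda.empty_cache, and torch.cuda.OutOfMemoryError exist in PyTorch 2.x; the function names and the per-image memory estimate below are hypothetical, not WebUI code:

import torch

def preflight_batch_size(requested: int, per_image_bytes: int,
                         headroom: float = 0.9, device: int = 0) -> int:
    """Clamp the requested batch size to what free VRAM can plausibly hold.

    per_image_bytes is an empirical estimate (e.g. measured from a single
    512x512 generation); headroom leaves a margin for fragmentation and
    activation spikes during cross-attention.
    """
    free, _total = torch.cuda.mem_get_info(device)  # (free, total) in bytes
    fits = max(1, int(free * headroom) // per_image_bytes)
    return min(requested, fits)

def generate_with_fallback(run_batch, batch_size: int):
    """Retry with a halved batch size on CUDA OOM, mirroring the automatic
    recovery in test.log (batch size 8 -> 4)."""
    while True:
        try:
            return run_batch(batch_size)
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()  # release cached blocks before retrying
            if batch_size <= 1:
                raise  # nothing left to reduce; surface the OOM
            batch_size //= 2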