72 changes: 72 additions & 0 deletions rules/cre-2025-0140/sd-webui-oom.yaml
@@ -0,0 +1,72 @@
rules:
- cre:
id: CRE-2025-0140
severity: 1
title: Stable Diffusion WebUI CUDA Out of Memory during batch processing
category: memory-problem
author: Prequel
description: |
Stable Diffusion WebUI encounters a CUDA out of memory (OOM) error during batch image generation,
causing the entire generation pipeline to crash. This typically occurs when the batch size or image
dimensions exceed available VRAM capacity, particularly on consumer GPUs with limited memory.
cause: |
The root cause is attempting to allocate more VRAM than available for tensor operations during
the cross-attention computation phase. Common triggers include:
- Batch size set too high for available VRAM
- High resolution images (1024x1024 or larger) without optimization flags
- Multiple ControlNet or LoRA models loaded simultaneously
- Insufficient use of memory optimization options (--medvram, --lowvram)
impact: |
- Immediate termination of the generation pipeline
- Loss of all in-progress image generations in the batch
- Potential corruption of partially generated images
- API endpoints return error responses, breaking automation workflows
- System requires manual intervention to recover
impactScore: 8
mitigation: |
Immediate fixes:
1. Reduce batch size (set to 1-2 for consumer GPUs)
2. Enable memory optimization flags:
- Use --medvram for 6-8GB VRAM GPUs
- Use --lowvram for 4-6GB VRAM GPUs
- Use --use-cpu all to fall back to CPU when VRAM is exhausted (very slow); note that --no-half is not a CPU fallback and increases VRAM use by keeping weights in full precision
3. Reduce image dimensions to 512x512 or 768x768
4. Enable attention slicing with --xformers or --opt-split-attention
5. Clear VRAM before generation: torch.cuda.empty_cache()

Long-term solutions:
- Implement dynamic batch size adjustment based on available VRAM
- Add pre-flight VRAM checks before starting generation
- Upgrade to GPU with more VRAM (16GB+ recommended)
mitigationScore: 3
tags:
- stable-diffusion
- cuda
- gpu
- memory-leak
- batch-processing
- pytorch
- webui
- automatic1111
references:
- https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Optimizations
- https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/8035
- https://pytorch.org/docs/stable/notes/cuda.html#memory-management
applications:
- name: stable-diffusion-webui
version: ">= 1.0.0"
- name: pytorch
version: ">= 2.0.0"
metadata:
kind: prequel
id: SEKbLKEnpyQo2mkKvFZYXo
gen: 1
rule:
set:
window: 10s
event:
source: cre.log.sdwebui
match:
- regex: 'CUDA out of memory\. Tried to allocate .* GiB'
- regex: 'torch\.cuda\.OutOfMemoryError: CUDA out of memory'
- value: 'Generation failed: Out of memory error'
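
A back-of-the-envelope check on the cause described above: with naive (unsliced) self-attention, the score matrix in the UNet's highest-resolution block grows with the square of the latent token count, so resolution and batch size compound quickly. The figures below are illustrative assumptions (SD 1.x latent scale of 1/8, 8 attention heads, fp16), not measurements.

```python
# Rough estimate of naive self-attention score memory per batch item.
# Illustrative assumptions only: SD 1.x latent = 1/8 of pixel resolution,
# 8 heads in the highest-resolution block, fp16 (2 bytes per element).

def attention_score_bytes(height, width, heads=8, bytes_per_elem=2):
    """Bytes for one attention score matrix for a single image in the batch."""
    tokens = (height // 8) * (width // 8)
    return heads * tokens * tokens * bytes_per_elem

for res in (512, 768, 1024):
    gib = attention_score_bytes(res, res) / 2**30
    print(f"{res}x{res}: ~{gib:.2f} GiB per image for one score matrix")

# 512x512 -> ~0.25 GiB, 768x768 -> ~1.27 GiB, 1024x1024 -> ~4.00 GiB.
# Multiply by batch size (plus softmax temporaries) and a consumer GPU's
# VRAM is exhausted quickly, which is the failure mode this rule detects.
```

The mitigation block above also suggests pre-flight VRAM checks and dynamic batch-size adjustment. A minimal sketch of that idea follows, assuming a caller-supplied generate(batch_size) callable and a ~2 GiB headroom figure; both are assumptions for illustration, not WebUI internals.

```python
# Minimal sketch: check free VRAM before generating and halve the batch
# size on CUDA OOM instead of losing the whole run.
import torch

HEADROOM_BYTES = 2 * 1024**3  # assumed headroom for activations/temporaries

def free_vram_bytes(device=0):
    free, _total = torch.cuda.mem_get_info(device)
    return free

def generate_with_backoff(generate, batch_size, min_batch=1):
    """Retry generation, halving the batch size on CUDA out-of-memory."""
    while batch_size >= min_batch:
        if free_vram_bytes() < HEADROOM_BYTES:
            torch.cuda.empty_cache()  # best-effort cleanup before starting
        try:
            return generate(batch_size)
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            batch_size //= 2
    raise RuntimeError("Out of VRAM even at the minimum batch size")
```

Catching torch.cuda.OutOfMemoryError and retrying at a smaller batch keeps the rest of the run alive, rather than losing every in-progress image as described in the impact section.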
36 changes: 36 additions & 0 deletions rules/cre-2025-0140/test.log
@@ -0,0 +1,36 @@
2025-08-27 22:44:48,804 - INFO - [modules.shared] - Loading config from: /home/user/stable-diffusion-webui/config.json
2025-08-27 22:44:48,804 - INFO - [modules.shared] - Command line args: Namespace(batch_size=8, medvram=False, lowvram=False)
2025-08-27 22:44:48,804 - INFO - [modules.sd_models] - Loading model from cache: v1-5-pruned-emaonly.safetensors
2025-08-27 22:44:48,804 - INFO - [modules.devices] - CUDA available: True, Device: cuda:0 (NVIDIA GeForce RTX 3090)
2025-08-27 22:44:48,804 - INFO - [modules.devices] - Total VRAM: 24576 MB, Available: 18432 MB
2025-08-27 22:44:48,804 - INFO - [modules.txt2img] - Starting batch generation: 8 images, 512x512, steps: 20
2025-08-27 22:44:48,904 - WARNING - [modules.devices] - High VRAM usage detected: 22528 MB / 24576 MB (91.7%)
2025-08-27 22:44:48,905 - INFO - [modules.processing] - Processing batch 1/8...
2025-08-27 22:44:48,905 - WARNING - [torch.cuda] - Allocated memory: 23.5 GB, Reserved: 24.0 GB
2025-08-27 22:44:48,905 - INFO - [modules.processing] - Processing batch 2/8...
2025-08-27 22:44:48,905 - WARNING - [modules.devices] - VRAM usage critical: 24064 MB / 24576 MB (97.9%)
2025-08-27 22:44:49,005 - INFO - [modules.processing] - Processing batch 3/8...
2025-08-27 22:44:49,005 - ERROR - [torch.cuda] - CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 24.00 GiB total capacity; 22.50 GiB already allocated; 512.00 MiB free; 23.00 GiB reserved in total by PyTorch)
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - RuntimeError: CUDA out of memory during cross-attention computation
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - Traceback (most recent call last):
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - File "modules/processing.py", line 732, in process_images_inner
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - samples = sampler.sample(p, x, conditioning, unconditional_conditioning)
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - File "modules/sd_samplers_kdiffusion.py", line 234, in sample
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - samples = self.launch_sampling(steps, lambda: self.func)
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - torch.cuda.OutOfMemoryError: CUDA out of memory
2025-08-27 22:44:49,005 - CRITICAL - [modules.shared] - Generation failed: Out of memory error
2025-08-27 22:44:49,005 - ERROR - [modules.api.api] - API error: {'error': 'OutOfMemoryError', 'detail': 'CUDA out of memory. Try reducing batch size or image dimensions'}
2025-08-27 22:44:49,005 - WARNING - [modules.devices] - Attempting VRAM cleanup...
2025-08-27 22:44:49,005 - INFO - [torch.cuda] - Clearing cache and collecting garbage
2025-08-27 22:44:49,005 - ERROR - [modules.processing] - Batch processing interrupted at item 3 of 8
2025-08-27 22:44:49,005 - ERROR - [modules.ui] - Generation failed after processing 2 images
2025-08-27 22:44:49,105 - INFO - [modules.devices] - Attempting automatic recovery...
2025-08-27 22:44:49,105 - INFO - [torch.cuda] - torch.cuda.empty_cache() called
2025-08-27 22:44:49,105 - INFO - [modules.shared] - Reducing batch size from 8 to 4
2025-08-27 22:44:49,105 - WARNING - [modules.processing] - Restarting generation with reduced settings
2025-08-27 22:44:49,105 - INFO - [modules.devices] - VRAM after cleanup: 8192 MB available
2025-08-27 22:44:49,105 - INFO - [modules.devices] - CUDA available: False, Using CPU mode
2025-08-27 22:44:49,105 - INFO - [modules.processing] - Processing on CPU, no VRAM limitations
2025-08-27 22:44:49,105 - WARNING - [modules.shared] - High RAM usage: 28 GB / 32 GB
2025-08-27 22:44:49,105 - INFO - [modules.processing] - Successfully completed batch 8/8
2025-08-27 22:44:49,106 - INFO - [modules.txt2img] - Generation complete: 8 images generated
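
A quick way to sanity-check the rule against the fixture above is to run its three match conditions over test.log. The snippet below assumes the value: condition behaves as a substring match; that is an assumption about the rule engine, not something confirmed here.

```python
# Verify that each match condition from sd-webui-oom.yaml fires on at
# least one line of the test.log fixture.
import re

PATTERNS = [
    r'CUDA out of memory\. Tried to allocate .* GiB',
    r'torch\.cuda\.OutOfMemoryError: CUDA out of memory',
]
LITERAL = 'Generation failed: Out of memory error'

with open('rules/cre-2025-0140/test.log') as f:
    lines = f.readlines()

for pattern in PATTERNS:
    assert any(re.search(pattern, line) for line in lines), pattern
assert any(LITERAL in line for line in lines), LITERAL
print("all three match conditions hit the fixture")
```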
23 changes: 22 additions & 1 deletion rules/tags/tags.yaml
@@ -844,4 +844,25 @@ tags:
description: Issues with Kubernetes pod scheduling due to resource constraints or networking problems
- name: cluster-scaling
displayName: Cluster Scaling
description: Problems related to Kubernetes cluster scaling operations and capacity management
- name: stable-diffusion
displayName: Stable Diffusion
description: Issues related to Stable Diffusion image generation models
- name: automatic1111
displayName: AUTOMATIC1111 WebUI
description: Problems specific to the AUTOMATIC1111 Stable Diffusion WebUI
- name: webui
displayName: Web UI
description: Issues with web user interface components
- name: batch-processing
displayName: Batch Processing
description: Problems occurring during batch job processing
- name: cuda
displayName: CUDA
description: NVIDIA CUDA GPU computation issues
- name: gpu
displayName: GPU
description: Graphics Processing Unit related problems
- name: pytorch
displayName: PyTorch
description: Issues related to PyTorch deep learning framework