From 6f6ac066c9d07ccb5ebabac8223a380deaf2899e Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sat, 6 Dec 2025 14:27:15 +0100
Subject: [PATCH 1/4] replaced blur-rendering with ellipse rendering

- better size control as size is now exactly mappable to pixels so it can be matched exactly to the collision distance
- no more gaps due to collision distance mismatch
- much faster: saw up to 30% improvement in FPS
- also adjusted some of the FX to make better use of the new rendering
---
 wled00/FX.cpp               |  58 ++++---
 wled00/FXparticleSystem.cpp | 319 +++++++++++++++++++-----------------
 wled00/FXparticleSystem.h   |  21 ++-
 3 files changed, 221 insertions(+), 177 deletions(-)

diff --git a/wled00/FX.cpp b/wled00/FX.cpp
index eb72ff4f9d..83e6785492 100644
--- a/wled00/FX.cpp
+++ b/wled00/FX.cpp
@@ -8386,11 +8386,12 @@ uint16_t mode_particlepit(void) {
         PartSys->particles[i].sat = ((SEGMENT.custom3) << 3) + 7;
         // set particle size
         if (SEGMENT.custom1 == 255) {
-          PartSys->setParticleSize(1); // set global size to 1 for advanced rendering (no single pixel particles)
+          PartSys->perParticleSize = true;
           PartSys->advPartProps[i].size = hw_random16(SEGMENT.custom1); // set each particle to random size
         } else {
+          PartSys->perParticleSize = false;
           PartSys->setParticleSize(SEGMENT.custom1); // set global size
-          PartSys->advPartProps[i].size = 0; // use global size
+          PartSys->advPartProps[i].size = SEGMENT.custom1; // also set individual size for consistency
         }
         break; // emit only one particle per round
       }
@@ -8408,7 +8409,7 @@ uint16_t mode_particlepit(void) {
 
   return FRAMETIME;
 }
-static const char _data_FX_MODE_PARTICLEPIT[] PROGMEM = "PS Ballpit@Speed,Intensity,Size,Hardness,Saturation,Cylinder,Walls,Ground;;!;2;pal=11,sx=100,ix=220,c1=120,c2=130,c3=31,o3=1";
+static const char _data_FX_MODE_PARTICLEPIT[] PROGMEM = "PS Ballpit@Speed,Intensity,Size,Hardness,Saturation,Cylinder,Walls,Ground;;!;2;pal=11,sx=100,ix=220,c1=70,c2=180,c3=31,o3=1";
 
 /*
   Particle Waterfall
@@ -8492,7 +8493,7 @@ uint16_t mode_particlebox(void) {
   uint32_t i;
 
   if (SEGMENT.call == 0) { // initialization
-    if (!initParticleSystem2D(PartSys, 1)) // init
+    if (!initParticleSystem2D(PartSys, 1, 0, true)) // init
       return mode_static(); // allocation failed or not 2D
     PartSys->setBounceX(true);
     PartSys->setBounceY(true);
@@ -8505,19 +8506,24 @@ uint16_t mode_particlebox(void) {
     return mode_static(); // something went wrong, no data!
 
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
-  PartSys->setParticleSize(SEGMENT.custom3<<3);
   PartSys->setWallHardness(min(SEGMENT.custom2, (uint8_t)200)); // wall hardness is 200 or more
   PartSys->enableParticleCollisions(true, max(2, (int)SEGMENT.custom2)); // enable collisions and set particle collision hardness
   PartSys->setUsedParticles(map(SEGMENT.intensity, 0, 255, 2, 153)); // 1% - 60%
+  if (SEGMENT.custom3 < 31)
+    PartSys->setParticleSize(SEGMENT.custom3<<3); // set global size if not max (resets perParticleSize)
+  else
+    PartSys->perParticleSize = true; // per particle size, uses advPartProps.size (randomized below)
+
   // add in new particles if amount has changed
   for (i = 0; i < PartSys->usedParticles; i++) {
-    if (PartSys->particles[i].ttl < 260) { // initialize handed over particles and dead particles
+    if (PartSys->particles[i].ttl < 260) { // initialize dead particles
       PartSys->particles[i].ttl = 260; // full brigthness
       PartSys->particles[i].x = hw_random16(PartSys->maxX);
       PartSys->particles[i].y = hw_random16(PartSys->maxY);
       PartSys->particles[i].hue = hw_random8(); // make it colorful
       PartSys->particleFlags[i].perpetual = true; // never die
       PartSys->particleFlags[i].collide = true; // all particles colllide
+      PartSys->advPartProps[i].size = hw_random8(); // random size, used only if size is set to max (SEGMENT.custom3=31)
       break; // only spawn one particle per frame for less chaotic transitions
     }
   }
@@ -8773,22 +8779,10 @@ uint16_t mode_particleattractor(void) {
 
   // Particle System settings
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
-  attractor = reinterpret_cast<PSparticle *>(PartSys->PSdataEnd);
-
   PartSys->setColorByAge(SEGMENT.check1);
   PartSys->setParticleSize(SEGMENT.custom1 >> 1); //set size globally
   PartSys->setUsedParticles(map(SEGMENT.intensity, 0, 255, 25, 190));
-
-  if (SEGMENT.custom2 > 0) // collisions enabled
-    PartSys->enableParticleCollisions(true, map(SEGMENT.custom2, 1, 255, 120, 255)); // enable collisions and set particle collision hardness
-  else
-    PartSys->enableParticleCollisions(false);
-
-  if (SEGMENT.call == 0) {
-    attractor->vx = PartSys->sources[0].source.vy; // set to spray movemement but reverse x and y
-    attractor->vy = PartSys->sources[0].source.vx;
-  }
-
+  attractor = reinterpret_cast<PSparticle *>(PartSys->PSdataEnd);
   // set attractor properties
   attractor->ttl = 100; // never dies
   if (SEGMENT.check2) {
@@ -8799,6 +8793,15 @@ uint16_t mode_particleattractor(void) {
     attractor->x = PartSys->maxX >> 1; // set to center
     attractor->y = PartSys->maxY >> 1;
   }
+  if (SEGMENT.call == 0) {
+    attractor->vx = PartSys->sources[0].source.vy; // set to spray movemement but reverse x and y
+    attractor->vy = PartSys->sources[0].source.vx;
+  }
+
+  if (SEGMENT.custom2 > 0) // collisions enabled
+    PartSys->enableParticleCollisions(true, map(SEGMENT.custom2, 1, 255, 120, 255)); // enable collisions and set particle collision hardness
+  else
+    PartSys->enableParticleCollisions(false);
 
   if (SEGMENT.call % 5 == 0)
     PartSys->sources[0].source.hue++;
@@ -8828,6 +8831,7 @@ uint16_t mode_particleattractor(void) {
   PartSys->update(); // update and render
   return FRAMETIME;
 }
+//static const char _data_FX_MODE_PARTICLEATTRACTOR[] PROGMEM = "PS Attractor@Mass,Particles,Size,Collide,Friction,AgeColor,Move,Swallow;;!;2;pal=9,sx=100,ix=82,c1=1,c2=0";
 static const char _data_FX_MODE_PARTICLEATTRACTOR[] PROGMEM = "PS Attractor@Mass,Particles,Size,Collide,Friction,AgeColor,Move,Swallow;;!;2;pal=9,sx=100,ix=82,c1=2,c2=0";
 
 /*
@@ -9157,6 +9161,7 @@ uint16_t mode_particleblobs(void) {
     PartSys->setWallHardness(255);
     PartSys->setWallRoughness(255);
     PartSys->setCollisionHardness(255);
+    PartSys->perParticleSize = true; // enable per particle size control
   }
   else
     PartSys = reinterpret_cast<ParticleSystem2D *>(SEGENV.data); // if not first call, just set the pointer to the PS
@@ -9247,8 +9252,6 @@ uint16_t mode_particlegalaxy(void) {
   // Particle System settings
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
   uint8_t particlesize = SEGMENT.custom1;
-  if(SEGMENT.check3)
-    particlesize =  SEGMENT.custom1 ? 1 : 0; // set size to 0 (single pixel) or 1 (quad pixel) so motion blur works and adds streaks
   PartSys->setParticleSize(particlesize); // set size globally
   PartSys->setMotionBlur(250 * SEGMENT.check3); // adds trails to single/quad pixel particles, no effect if size > 1
 
@@ -9316,7 +9319,7 @@ uint16_t mode_particlegalaxy(void) {
   PartSys->update(); // update and render
   return FRAMETIME;
 }
-static const char _data_FX_MODE_PARTICLEGALAXY[] PROGMEM = "PS Galaxy@!,!,Size,,Color,,Starfield,Trace;;!;2;pal=59,sx=80,c1=2,c3=4";
+static const char _data_FX_MODE_PARTICLEGALAXY[] PROGMEM = "PS Galaxy@!,!,Size,,Color,,Starfield,Trace;;!;2;pal=59,sx=80,c1=1,c3=4";
 
 #endif //WLED_DISABLE_PARTICLESYSTEM2D
 #endif // WLED_DISABLE_2D
@@ -9463,6 +9466,13 @@ uint16_t mode_particlePinball(void) {
   PartSys->enableParticleCollisions(SEGMENT.check1, 255); // enable collisions and set particle collision to high hardness
   PartSys->setUsedParticles(SEGMENT.intensity);
   PartSys->setColorByPosition(SEGMENT.check3);
+  /*
+  // TODO: update 1D system to use the same logic for per particle size as 2D system
+  if (SEGMENT.custom1 < 255)
+    PartSys->setParticleSize(SEGMENT.custom1); // set size globally
+  else
+    PartSys->perParticleSize = true;
+  */ 
 
   bool updateballs = false;
   if (SEGENV.aux1 != SEGMENT.speed + SEGMENT.intensity + SEGMENT.check2 + SEGMENT.custom1 + PartSys->usedParticles) { // user settings change or more particles are available
@@ -9486,7 +9496,7 @@ uint16_t mode_particlePinball(void) {
         }
         PartSys->particles[i].hue = hw_random8(); //set ball colors to random
         PartSys->advPartProps[i].sat = 255;
-        PartSys->advPartProps[i].size = SEGMENT.custom1;
+        PartSys->advPartProps[i].size = SEGMENT.custom1 < 255 ? SEGMENT.custom1 : hw_random8(); //set ball size
       }
       speedsum += abs(PartSys->particles[i].vx);
     }
@@ -9523,7 +9533,7 @@ uint16_t mode_particlePinball(void) {
       SEGENV.step += interval + hw_random16(interval);
       PartSys->sources[0].source.hue = hw_random16(); //set ball color
       PartSys->sources[0].sat = 255;
-      PartSys->sources[0].size = SEGMENT.custom1;
+      PartSys->sources[0].size = SEGMENT.custom1 < 255 ? SEGMENT.custom1 : hw_random8(); //set ball size
       PartSys->sprayEmit(PartSys->sources[0]);
     }
   }
diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index 1a1ed08850..ea31430ef4 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -33,7 +33,7 @@ ParticleSystem2D::ParticleSystem2D(uint32_t width, uint32_t height, uint32_t num
   setWallHardness(255); // set default wall hardness to max
   setWallRoughness(0); // smooth walls by default
   setGravity(0); //gravity disabled by default
-  setParticleSize(1); // 2x2 rendering size by default
+  setParticleSize(1); // 2x2 rendering size by default (disables per particle size control by default)
   motionBlur = 0; //no fading by default
   smearBlur = 0; //no smearing by default
   emitIndex = 0;
@@ -58,7 +58,7 @@ void ParticleSystem2D::update(void) {
     applyGravity();
 
   //update size settings before handling collisions
-  if (advPartSize) {
+  if (advPartSize != nullptr) {
     for (uint32_t i = 0; i < usedParticles; i++) {
       if (updateSize(&advPartProps[i], &advPartSize[i]) == false) { // if particle shrinks to 0 size
         particles[i].ttl = 0; // kill particle
@@ -139,7 +139,6 @@ void ParticleSystem2D::setColorByAge(bool enable) {
 }
 
 void ParticleSystem2D::setMotionBlur(uint8_t bluramount) {
-  if (particlesize < 2) // only allow motion blurring on default particle sizes or advanced size (cannot combine motion blur with normal blurring used for particlesize, would require another buffer)
     motionBlur = bluramount;
 }
 
@@ -148,13 +147,13 @@ void ParticleSystem2D::setSmearBlur(uint8_t bluramount) {
 }
 
 
-// render size using smearing (see blur function)
+// set global particle size
 void ParticleSystem2D::setParticleSize(uint8_t size) {
   particlesize = size;
   particleHardRadius = PS_P_MINHARDRADIUS; // ~1 pixel
+  perParticleSize = false; // disable per particle size control if global size is set
   if (particlesize > 1) {
-    particleHardRadius = max(particleHardRadius, (uint32_t)particlesize); // radius used for wall collisions & particle collisions
-    motionBlur = 0; // disable motion blur if particle size is set
+    particleHardRadius = PS_P_MINHARDRADIUS + ((particlesize * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float" and nicer stacking)
   }
   else if (particlesize == 0)
     particleHardRadius = particleHardRadius >> 1; // single pixel particles have half the radius (i.e. 1/2 pixel)
@@ -194,7 +193,7 @@ int32_t ParticleSystem2D::sprayEmit(const PSsource &emitter) {
       particles[emitIndex].sat = emitter.source.sat;
       particleFlags[emitIndex].collide = emitter.sourceFlags.collide;
       particles[emitIndex].ttl = hw_random16(emitter.minLife, emitter.maxLife);
-      if (advPartProps)
+      if (advPartProps != nullptr)
         advPartProps[emitIndex].size = emitter.size;
       break;
     }
@@ -231,17 +230,14 @@ void ParticleSystem2D::particleMoveUpdate(PSparticle &part, PSparticleFlags &par
     if (options->colorByAge)
       part.hue = min(part.ttl, (uint16_t)255); //set color to ttl
 
-    int32_t renderradius = PS_P_HALFRADIUS; // used to check out of bounds
+    int32_t renderradius = PS_P_HALFRADIUS - 1 + particlesize; // used to check out of bounds, if its more than half a radius out of bounds, it will render to x = -2/-1 or x=max/max+1 in standard 2x2 rendering
     int32_t newX = part.x + (int32_t)part.vx;
     int32_t newY = part.y + (int32_t)part.vy;
     partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view) note: moving this to checks below adds code and is not faster
 
-    if (advancedproperties) { //using individual particle size?
-      setParticleSize(particlesize); // updates default particleHardRadius
-      if (advancedproperties->size > PS_P_MINHARDRADIUS) {
-        particleHardRadius += (advancedproperties->size - PS_P_MINHARDRADIUS); // update radius
-        renderradius = particleHardRadius;
-      }
+    if (perParticleSize && advancedproperties != nullptr) { // using individual particle size
+      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size;
+      particleHardRadius = PS_P_MINHARDRADIUS + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float")
     }
     // note: if wall collisions are enabled, bounce them before they reach the edge, it looks much nicer if the particle does not go half out of view
     if (options->bounceY) {
@@ -347,7 +343,7 @@ bool ParticleSystem2D::updateSize(PSadvancedParticle *advprops, PSsizeControl *a
     if (newsize > advsize->minsize) {
       newsize -= increment;
       if (newsize <= advsize->minsize) {
-        if (advsize->minsize == 0) 
+        if (advsize->minsize == 0)
           return false; // particle shrunk to zero
         advsize->shrink = false; // disable shrinking
         newsize = advsize->minsize; // limit
@@ -556,7 +552,7 @@ void ParticleSystem2D::pointAttractor(const uint32_t particleindex, PSparticle &
 // warning: do not render out of bounds particles or system will crash! rendering does not check if particle is out of bounds
 // firemode is only used for PS Fire FX
 void ParticleSystem2D::render() {
-  if(framebuffer == nullptr) {
+  if (framebuffer == nullptr) {
     PSPRINTLN(F("PS render: no framebuffer!"));
     return;
   }
@@ -600,33 +596,114 @@ void ParticleSystem2D::render() {
         hsv2rgb(baseHSV, baseRGB.color32); // convert back to RGB
       }
     }
-    if(gammaCorrectCol) brightness = gamma8(brightness); // apply gamma correction, used for gamma-inverted brightness distribution
+    if (gammaCorrectCol) brightness = gamma8(brightness); // apply gamma correction, used for gamma-inverted brightness distribution
     renderParticle(i, brightness, baseRGB, particlesettings.wrapX, particlesettings.wrapY);
   }
 
-  // apply global size rendering
-  if (particlesize > 1) {
-    uint32_t passes = particlesize / 64 + 1; // number of blur passes, four passes max
-    uint32_t bluramount = particlesize;
-    uint32_t bitshift = 0;
-    for (uint32_t i = 0; i < passes; i++) {
-      if (i == 2) // for the last two passes, use higher amount of blur (results in a nicer brightness gradient with soft edges)
-        bitshift = 1;
-      blur2D(framebuffer, maxXpixel + 1, maxYpixel + 1, bluramount << bitshift, bluramount << bitshift);
-      bluramount -= 64;
-    }
-  }
-
   // apply 2D blur to rendered frame
   if (smearBlur) {
     blur2D(framebuffer, maxXpixel + 1, maxYpixel + 1, smearBlur, smearBlur);
   }
 }
 
+// Render one particle as an ellipse/circle with linear brightness falloff and sub-pixel precision.
+// particleindex: index into particles[] (also used for advPartProps[] / advPartSize[] when those pointers are set)
+// brightness: brightness handed to calculateEllipseBrightness(); color: color added to the framebuffer with fast_color_scaleAdd()
+// wrapX / wrapY: pixels outside the matrix on that axis are wrapped to the opposite side if set, skipped otherwise
+void WLED_O2_ATTR ParticleSystem2D::renderParticleEllipse(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
+  uint32_t size = particlesize;
+  if (perParticleSize && advPartProps != nullptr) // individual particle size
+    size = advPartProps[particleindex].size;
+
+  // particle position with sub-pixel precision
+  int32_t x_subcenter = particles[particleindex].x;
+  int32_t y_subcenter = particles[particleindex].y;
+
+  // example: for x = 128, a particle is exactly between pixel 1 and 2, with a radius of 2 pixels, we draw pixels 0-3
+  // integer center jumps when x = 127 -> pixel 1 goes to x = 128 -> pixel 2
+  // when calculating the dx, we need to take this into account: at x = 128 the x offset is 1, the pixel center is at pixel 2:
+  // for pixel 1, dx = 1 * PS_P_RADIUS - 128 = -64 but the center of the pixel is actually only -32 from the particle center so need to add half a radius:
+  // dx = pixel_x * PS_P_RADIUS - x_subcenter + PS_P_HALFRADIUS
+
+  // integer pixel position, this is rounded down
+  int32_t x_center = (x_subcenter) >> PS_P_RADIUS_SHIFT;
+  int32_t y_center = (y_subcenter) >> PS_P_RADIUS_SHIFT;
+
+  // ellipse size per axis (same units as size, turned into sub-pixel radii below); only differs per axis if asymmetry is set
+  uint32_t xsize = size;
+  uint32_t ysize = size;
+  if (advPartSize != nullptr && advPartSize[particleindex].asymmetry > 0) {
+    getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
+  }
+
+  int32_t rx_subpixel = xsize+65; // size = 1 means radius of just over 1 pixel
+  int32_t ry_subpixel = ysize+65; // size = 255 is radius of 5, so add 65 -> 65+255=320, 320>>6=5 pixels
+
+  // rendering bounding box in pixels
+  int32_t rx_pixels = (rx_subpixel >> PS_P_RADIUS_SHIFT);
+  int32_t ry_pixels = (ry_subpixel >> PS_P_RADIUS_SHIFT);
+
+  int32_t x_min = x_center - rx_pixels;
+  int32_t x_max = x_center + rx_pixels;
+  int32_t y_min = y_center - ry_pixels;
+  int32_t y_max = y_center + ry_pixels;
+
+  // cache for speed
+  uint32_t matrixX = maxXpixel + 1;
+  uint32_t matrixY = maxYpixel + 1;
+  uint32_t rx_sq = rx_subpixel * rx_subpixel;
+  uint32_t ry_sq = ry_subpixel * ry_subpixel;
+
+  // iterate over bounding box and render each pixel
+  for (int32_t py = y_min; py <= y_max; py++) {
+    for (int32_t px = x_min; px <= x_max; px++) {
+      // distance from particle center, explanation see above
+      int32_t dx_subpixel = (px << PS_P_RADIUS_SHIFT) - x_subcenter + PS_P_HALFRADIUS;
+      int32_t dy_subpixel = (py << PS_P_RADIUS_SHIFT) - y_subcenter + PS_P_HALFRADIUS;
+
+      // calculate brightness based on squared distance to ellipse center
+      uint8_t pixel_brightness = calculateEllipseBrightness(dx_subpixel, dy_subpixel, rx_sq, ry_sq, brightness);
+
+      if (pixel_brightness == 0) continue; // Skip fully transparent pixels
+
+      // apply inverse gamma correction if needed, if this is skipped, particles flicker due to changing total brightness
+      if (gammaCorrectCol) {
+        pixel_brightness = gamma8inv(pixel_brightness); // invert brightness so brightness distribution is linear after gamma correction
+      }
+
+      // Handle wrapping and bounds
+      int32_t render_x = px;
+      int32_t render_y = py;
+
+      // Check bounds and apply wrapping (only a single wrap step per axis is applied)
+      if (render_x < 0) {
+        if (!wrapX) continue;
+        render_x += matrixX;
+      } else if (render_x > maxXpixel) {
+        if (!wrapX) continue;
+        render_x -= matrixX;
+      }
+
+      if (render_y < 0) {
+        if (!wrapY) continue;
+        render_y += matrixY;
+      } else if (render_y > maxYpixel) {
+        if (!wrapY) continue;
+        render_y -= matrixY;
+      }
+      if (render_x < 0 || render_x > maxXpixel || render_y < 0 || render_y > maxYpixel) continue; // bounding box larger than the matrix: still outside after one wrap, skip so idx stays inside the framebuffer
+      uint32_t idx = render_x + (maxYpixel - render_y) * matrixX; // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
+      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pixel_brightness);
+    }
+  }
+}
+
+
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
   uint32_t size = particlesize;
-  if (advPartProps && advPartProps[particleindex].size > 0) // use advanced size properties (0 means use global size including single pixel rendering)
+
+  if (perParticleSize && advPartProps != nullptr) // use advanced size properties
     size = advPartProps[particleindex].size;
 
   if (size == 0) { // single pixel rendering
@@ -638,6 +715,13 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
     }
     return;
   }
+
+  if (size > 1) { // size > 1: render as ellipse
+    renderParticleEllipse(particleindex, brightness, color, wrapX, wrapY); // larger size rendering
+    return;
+  }
+
+  // size = 1: standard 2x2 pixel rendering using bilinear interpolation (20% faster than ellipse rendering)
   uint8_t pxlbrightness[4]; // brightness values for the four pixels representing a particle
   struct {
     int32_t x,y;
@@ -645,6 +729,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
   bool pixelvalid[4] = {true, true, true, true}; // is set to false if pixel is out of bounds
 
   // add half a radius as the rendering algorithm always starts at the bottom left, this leaves things positive, so shifts can be used, then shift coordinate by a full pixel (x--/y-- below)
+  // if sub-pixel position is 0-PS_P_HALFRADIUS it will render to x>>PS_P_RADIUS_SHIFT as the right pixel
   int32_t xoffset = particles[particleindex].x + PS_P_HALFRADIUS;
   int32_t yoffset = particles[particleindex].y + PS_P_HALFRADIUS;
   int32_t dx = xoffset & (PS_P_RADIUS - 1); // relativ particle position in subpixel space
@@ -662,7 +747,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
 
   // calculate brightness values for all four pixels representing a particle using linear interpolation
   // could check for out of frame pixels here but calculating them is faster (very few are out)
-  // precalculate values for speed optimization
+  // precalculate values for speed optimization. Note: rounding is not perfect but close enough, some inaccuracy is traded for speed
   int32_t precal1 = (int32_t)PS_P_RADIUS - dx;
   int32_t precal2 = ((int32_t)PS_P_RADIUS - dy) * brightness;
   int32_t precal3 = dy * brightness;
@@ -674,118 +759,48 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
   // - scale brigthness with gamma correction (done in render())
   // - apply inverse gamma correction to brightness values
   // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
-  if(gammaCorrectCol) {
+  if (gammaCorrectCol) {
     pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
     pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
     pxlbrightness[2] = gamma8inv(pxlbrightness[2]);
     pxlbrightness[3] = gamma8inv(pxlbrightness[3]);
   }
 
-  if (advPartProps && advPartProps[particleindex].size > 1) { //render particle to a bigger size
-    uint32_t renderbuffer[100]; // 10x10 pixel buffer
-    memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
-    //particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10
-    //first, render the pixel to the center of the renderbuffer, then apply 2D blurring
-    renderbuffer[4 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
-    renderbuffer[5 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
-    renderbuffer[5 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
-    renderbuffer[4 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
-    uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented int he loop below to start with 4
-    uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
-    uint32_t maxsize = advPartProps[particleindex].size;
-    uint32_t xsize = maxsize;
-    uint32_t ysize = maxsize;
-    if (advPartSize) { // use advanced size control
-      if (advPartSize[particleindex].asymmetry > 0)
-        getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
-      maxsize = (xsize > ysize) ? xsize : ysize; // choose the bigger of the two
-    }
-    maxsize = maxsize/64 + 1; // number of blur passes depends on maxsize, four passes max
-    uint32_t bitshift = 0;
-    for (uint32_t i = 0; i < maxsize; i++) {
-      if (i == 2) //for the last two passes, use higher amount of blur (results in a nicer brightness gradient with soft edges)
-        bitshift = 1;
-      rendersize += 2;
-      offset--;
-      blur2D(renderbuffer, rendersize, rendersize, xsize << bitshift, ysize << bitshift, offset, offset, true);
-      xsize = xsize > 64 ? xsize - 64 : 0;
-      ysize = ysize > 64 ? ysize - 64 : 0;
-    }
-
-    // calculate origin coordinates to render the particle to in the framebuffer
-    uint32_t xfb_orig = x - (rendersize>>1) + 1 - offset;
-    uint32_t yfb_orig = y - (rendersize>>1) + 1 - offset;
-    uint32_t xfb, yfb; // coordinates in frame buffer to write to note: by making this uint, only overflow has to be checked (spits a warning though)
-
-    //note on y-axis flip: WLED has the y-axis defined from top to bottom, so y coordinates must be flipped. doing this in the buffer xfer clashes with 1D/2D combined rendering, which does not invert y
-    //                     transferring the 1D buffer in inverted fashion will flip the x-axis of overlaid 2D FX, so the y-axis flip is done here so the buffer is flipped in y, giving correct results
-
-    // transfer particle renderbuffer to framebuffer
-    for (uint32_t xrb = offset; xrb < rendersize + offset; xrb++) {
-      xfb = xfb_orig + xrb;
-      if (xfb > (uint32_t)maxXpixel) {
-      if (wrapX) { // wrap x to the other side if required
-        if (xfb > (uint32_t)maxXpixel << 1) // xfb is "negative", handle it
-          xfb = (maxXpixel + 1) + (int32_t)xfb; // this always overflows to within bounds
-        else
-          xfb = xfb % (maxXpixel + 1); // note: without the above "negative" check, this works only for powers of 2
-      }
-      else
-        continue;
-      }
-
-      for (uint32_t yrb = offset; yrb < rendersize + offset; yrb++) {
-        yfb = yfb_orig + yrb;
-        if (yfb > (uint32_t)maxYpixel) {
-          if (wrapY) {// wrap y to the other side if required
-            if (yfb > (uint32_t)maxYpixel << 1) // yfb is "negative", handle it
-              yfb = (maxYpixel + 1) + (int32_t)yfb; // this always overflows to within bounds
-            else
-              yfb = yfb % (maxYpixel + 1); // note: without the above "negative" check, this works only for powers of 2
-          }
-          else
-          continue;
-        }
-        uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
-      }
-    }
-    } else { // standard rendering (2x2 pixels)
-    // check for out of frame pixels and wrap them if required: x,y is bottom left pixel coordinate of the particle
-    if (x < 0) { // left pixels out of frame
-      if (wrapX) { // wrap x to the other side if required
-        pixco[0].x = pixco[3].x = maxXpixel;
-      } else {
-        pixelvalid[0] = pixelvalid[3] = false; // out of bounds
-      }
+  // standard rendering (2x2 pixels)
+  // check for out of frame pixels and wrap them if required: x,y is bottom left pixel coordinate of the particle
+  if (x < 0) { // left pixels out of frame
+    if (wrapX) { // wrap x to the other side if required
+      pixco[0].x = pixco[3].x = maxXpixel;
+    } else {
+      pixelvalid[0] = pixelvalid[3] = false; // out of bounds
     }
-    else if (pixco[1].x > (int32_t)maxXpixel) { // right pixels, only has to be checked if left pixel is in frame
-      if (wrapX) { // wrap y to the other side if required
-        pixco[1].x = pixco[2].x = 0;
-      } else {
-        pixelvalid[1] = pixelvalid[2] = false; // out of bounds
-      }
+  }
+  else if (pixco[1].x > (int32_t)maxXpixel) { // right pixels, only has to be checked if left pixel is in frame
+    if (wrapX) { // wrap y to the other side if required
+      pixco[1].x = pixco[2].x = 0;
+    } else {
+      pixelvalid[1] = pixelvalid[2] = false; // out of bounds
     }
+  }
 
-    if (y < 0) { // bottom pixels out of frame
-      if (wrapY) { // wrap y to the other side if required
-        pixco[0].y = pixco[1].y = maxYpixel;
-      } else {
-        pixelvalid[0] = pixelvalid[1] = false; // out of bounds
-      }
+  if (y < 0) { // bottom pixels out of frame
+    if (wrapY) { // wrap y to the other side if required
+      pixco[0].y = pixco[1].y = maxYpixel;
+    } else {
+      pixelvalid[0] = pixelvalid[1] = false; // out of bounds
     }
-    else if (pixco[2].y > maxYpixel) { // top pixels
-      if (wrapY) { // wrap y to the other side if required
-        pixco[2].y = pixco[3].y = 0;
-      } else {
-        pixelvalid[2] = pixelvalid[3] = false; // out of bounds
-      }
+  }
+  else if (pixco[2].y > maxYpixel) { // top pixels
+    if (wrapY) { // wrap y to the other side if required
+      pixco[2].y = pixco[3].y = 0;
+    } else {
+      pixelvalid[2] = pixelvalid[3] = false; // out of bounds
     }
-    for (uint32_t i = 0; i < 4; i++) {
-      if (pixelvalid[i]) {
-        uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
-      }
+  }
+  for (uint32_t i = 0; i < 4; i++) {
+    if (pixelvalid[i]) {
+      uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
+      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
     }
   }
 }
@@ -795,14 +810,15 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
 // for code simplicity, no y slicing is done, making very tall matrix configurations less efficient
 // note: also tested adding y slicing, it gives diminishing returns, some FX even get slower. FX not using gravity would benefit with a 10% FPS improvement
 void ParticleSystem2D::handleCollisions() {
+  if (perParticleSize && advPartProps != nullptr)
+    particleHardRadius = 255; // max radius for collision detection if using per-particle size TODO: could optimize by fetching max size from advPartProps
+
   uint32_t collDistSq = particleHardRadius << 1; // distance is double the radius note: particleHardRadius is updated when setting global particle size
   collDistSq = collDistSq * collDistSq; // square it for faster comparison (square is one operation)
   // note: partices are binned in x-axis, assumption is that no more than half of the particles are in the same bin
   // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
   constexpr int BIN_WIDTH = 6 * PS_P_RADIUS; // width of a bin in sub-pixels
   int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
-  if (advPartProps) //may be using individual particle size
-    overlap += 512; // add 2 * max radius (approximately)
   uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 2); // assume no more than half of the particles are in the same bin, do not bin small amounts of particles
   uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // number of bins in x direction
   uint16_t binIndices[maxBinParticles]; // creat array on stack for indices, 2kB max for 1024 particles (ESP32_MAXPARTICLES/2)
@@ -820,7 +836,7 @@ void ParticleSystem2D::handleCollisions() {
     for (uint32_t i = 0; i < usedParticles; i++) {
       if (particles[pidx].ttl > 0) { // is alive
         if (particles[pidx].x >= binStart && particles[pidx].x <= binEnd) { // >= and <= to include particles on the edge of the bin (overlap to ensure boarder particles collide with adjacent bins)
-          if(particleFlags[pidx].outofbounds == 0 && particleFlags[pidx].collide) { // particle is in frame and does collide note: checking flags is quite slow and usually these are set, so faster to check here
+          if (particleFlags[pidx].outofbounds == 0 && particleFlags[pidx].collide) { // particle is in frame and does collide note: checking flags is quite slow and usually these are set, so faster to check here
             if (binParticleCount >= maxBinParticles) { // bin is full, more particles in this bin so do the rest next frame
               nextFrameStartIdx = pidx; // bin overflow can only happen once as bin size is at least half of the particles (or half +1)
               break;
@@ -837,9 +853,8 @@ void ParticleSystem2D::handleCollisions() {
       uint32_t idx_i = binIndices[i];
       for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
         uint32_t idx_j = binIndices[j];
-        if (advPartProps) { //may be using individual particle size
-          setParticleSize(particlesize); // updates base particleHardRadius
-          collDistSq = (particleHardRadius << 1) + (((uint32_t)advPartProps[idx_i].size + (uint32_t)advPartProps[idx_j].size) >> 1); // collision distance note: not 100% clear why the >> 1 is needed, but it is.
+        if (perParticleSize && advPartProps != nullptr) { // using individual particle size
+          collDistSq = (PS_P_MINHARDRADIUS << 1) + ((((uint32_t)advPartProps[idx_i].size + (uint32_t)advPartProps[idx_j].size) * 52) >> 6); // collision distance, use 80% of size for tighter stacking (slight overlap)
           collDistSq = collDistSq * collDistSq; // square it for faster comparison
         }
         int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance with lookahead
@@ -1103,6 +1118,8 @@ bool initParticleSystem2D(ParticleSystem2D *&PartSys, uint32_t requestedsources,
   uint32_t cols = SEGMENT.virtualWidth();
   uint32_t rows = SEGMENT.virtualHeight();
   uint32_t pixels = cols * rows;
+  if (sizecontrol)
+    advanced = true; // size control needs advanced properties, prevent wrong usage
 
   uint32_t numparticles = calculateNumberOfParticles2D(pixels, advanced, sizecontrol);
   PSPRINT(" segmentsize:" + String(cols) + " x " + String(rows));
@@ -1418,7 +1435,7 @@ void ParticleSystem1D::applyFriction(int32_t coefficient) {
 // if wrap is set, particles half out of bounds are rendered to the other side of the matrix
 // warning: do not render out of bounds particles or system will crash! rendering does not check if particle is out of bounds
 void ParticleSystem1D::render() {
-  if(framebuffer == nullptr) {
+  if (framebuffer == nullptr) {
     PSPRINTLN(F("PS render: no framebuffer!"));
     return;
   }
@@ -1447,7 +1464,7 @@ void ParticleSystem1D::render() {
     brightness = min(particles[i].ttl << 1, (int)255);
     baseRGB = ColorFromPaletteWLED(SEGPALETTE, particles[i].hue, 255, blend);
 
-    if (advPartProps) { //saturation is advanced property in 1D system
+    if (advPartProps != nullptr) { //saturation is advanced property in 1D system
       if (advPartProps[i].sat < 255) {
         CHSV32 baseHSV;
         rgb2hsv(baseRGB.color32, baseHSV); // convert to HSV
@@ -1455,7 +1472,7 @@ void ParticleSystem1D::render() {
         hsv2rgb(baseHSV, baseRGB.color32); // convert back to RGB
       }
     }
-    if(gammaCorrectCol) brightness = gamma8(brightness); // apply gamma correction, used for gamma-inverted brightness distribution
+    if (gammaCorrectCol) brightness = gamma8(brightness); // apply gamma correction, used for gamma-inverted brightness distribution
     renderParticle(i, brightness, baseRGB, particlesettings.wrap);
   }
   // apply smear-blur to rendered frame
@@ -1472,7 +1489,7 @@ void ParticleSystem1D::render() {
   }
 #ifndef WLED_DISABLE_2D
   // transfer local buffer to segment if using 1D->2D mapping
-  if(SEGMENT.is2D() && SEGMENT.map1D2D) {
+  if (SEGMENT.is2D() && SEGMENT.map1D2D) {
     for (int x = 0; x <= maxXpixel; x++) {
     //for (int x = 0; x < SEGMENT.vLength(); x++) {
       SEGMENT.setPixelColor(x, framebuffer[x]); // this applies the mapping
@@ -1484,7 +1501,7 @@ void ParticleSystem1D::render() {
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
   uint32_t size = particlesize;
-  if (advPartProps) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
+  if (advPartProps != nullptr) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
     size = advPartProps[particleindex].size;
 
   if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
@@ -1516,12 +1533,12 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
   // - scale brigthness with gamma correction (done in render())
   // - apply inverse gamma correction to brightness values
   // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
-  if(gammaCorrectCol) {
+  if (gammaCorrectCol) {
     pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
     pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
   }
   // check if particle has advanced size properties and buffer is available
-  if (advPartProps && advPartProps[particleindex].size > 1) {
+  if (advPartProps != nullptr && advPartProps[particleindex].size > 1) {
     uint32_t renderbuffer[10]; // 10 pixel buffer
     memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
     //render particle to a bigger size
@@ -1596,7 +1613,7 @@ void ParticleSystem1D::handleCollisions() {
   // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
   constexpr int BIN_WIDTH = 32 * PS_P_RADIUS_1D; // width of each bin, a compromise between speed and accuracy (larger bins are faster but collapse more)
   int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
-  if (advPartProps) //may be using individual particle size
+  if (advPartProps != nullptr) //may be using individual particle size
     overlap += 256; // add 2 * max radius (approximately)
   uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 4); // do not bin small amounts, limit max to 1/4 of particles
   uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // calculate number of bins
@@ -1613,7 +1630,7 @@ void ParticleSystem1D::handleCollisions() {
     for (uint32_t i = 0; i < usedParticles; i++) {
       if (particles[pidx].ttl > 0) { // alivee
         if (particles[pidx].x >= binStart && particles[pidx].x <= binEnd) { // >= and <= to include particles on the edge of the bin (overlap to ensure boarder particles collide with adjacent bins)
-          if(particleFlags[pidx].outofbounds == 0 && particleFlags[pidx].collide) { // particle is in frame and does collide note: checking flags is quite slow and usually these are set, so faster to check here
+          if (particleFlags[pidx].outofbounds == 0 && particleFlags[pidx].collide) { // particle is in frame and does collide note: checking flags is quite slow and usually these are set, so faster to check here
             if (binParticleCount >= maxBinParticles) { // bin is full, more particles in this bin so do the rest next frame
               nextFrameStartIdx = pidx; // bin overflow can only happen once as bin size is at least half of the particles (or half +1)
               break;
@@ -1630,7 +1647,7 @@ void ParticleSystem1D::handleCollisions() {
       uint32_t idx_i = binIndices[i];
       for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
         uint32_t idx_j = binIndices[j];
-        if (advPartProps) { // use advanced size properties
+        if (advPartProps != nullptr) { // use advanced size properties
           collisiondistance = (PS_P_MINHARDRADIUS_1D << particlesize) + ((advPartProps[idx_i].size + advPartProps[idx_j].size) >> 1);
         }
         int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance between particles with lookahead
@@ -1735,7 +1752,7 @@ void ParticleSystem1D::updatePSpointers(bool isadvanced) {
   sources = reinterpret_cast<PSsource1D *>(particleFlags + numParticles); // pointer to source(s)
   PSdataEnd = reinterpret_cast<uint8_t *>(sources + numSources);   // pointer to first available byte after the PS for FX additional data (already aligned to 4 byte boundary)
 #ifndef WLED_DISABLE_2D
-  if(SEGMENT.is2D() && SEGMENT.map1D2D) {
+  if (SEGMENT.is2D() && SEGMENT.map1D2D) {
     framebuffer = reinterpret_cast<uint32_t *>(sources + numSources); // use local framebuffer for 1D->2D mapping
     PSdataEnd = reinterpret_cast<uint8_t *>(framebuffer + SEGMENT.maxMappingLength()); // pointer to first available byte after the PS for FX additional data (still aligned to 4 byte boundary)
   }
@@ -1790,7 +1807,7 @@ bool allocateParticleSystemMemory1D(const uint32_t numparticles, const uint32_t
   requiredmemory += sizeof(PSparticle1D) * numparticles;
   requiredmemory += sizeof(PSsource1D) * numsources;
 #ifndef WLED_DISABLE_2D
-  if(SEGMENT.is2D())
+  if (SEGMENT.is2D())
     requiredmemory += sizeof(uint32_t) * SEGMENT.maxMappingLength(); // need local buffer for mapped rendering
 #endif
   requiredmemory += additionalbytes;
diff --git a/wled00/FXparticleSystem.h b/wled00/FXparticleSystem.h
index 7503cad93e..0ff510c41a 100644
--- a/wled00/FXparticleSystem.h
+++ b/wled00/FXparticleSystem.h
@@ -103,7 +103,7 @@ typedef union {
 
 // struct for additional particle settings (option)
 typedef struct { // 2 bytes
-  uint8_t size; // particle size, 255 means 10 pixels in diameter, 0  means use global size (including single pixel rendering)
+  uint8_t size; // particle size, 255 means 10 pixels in diameter, set perParticleSize = true to enable
   uint8_t forcecounter; // counter for applying forces to individual particles
 } PSadvancedParticle;
 
@@ -190,11 +190,13 @@ class ParticleSystem2D {
   int32_t maxXpixel, maxYpixel; // last physical pixel that can be drawn to (FX can read this to read segment size if required), equal to width-1 / height-1
   uint32_t numSources; // number of sources
   uint32_t usedParticles; // number of particles used in animation, is relative to 'numParticles'
+  bool perParticleSize; // if true, uses individual particle sizes from advPartProps if available (disabled when calling setParticleSize())
   //note: some variables are 32bit for speed and code size at the cost of ram
 
 private:
   //rendering functions
   void render();
+  void renderParticleEllipse(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY);
   [[gnu::hot]] void renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY);
   //paricle physics applied by system if flags are set
   void applyGravity(); // applies gravity to all particles
@@ -232,6 +234,21 @@ bool initParticleSystem2D(ParticleSystem2D *&PartSys, const uint32_t requestedso
 uint32_t calculateNumberOfParticles2D(const uint32_t pixels, const bool advanced, const bool sizecontrol);
 uint32_t calculateNumberOfSources2D(const uint32_t pixels, const uint32_t requestedsources);
 bool allocateParticleSystemMemory2D(const uint32_t numparticles, const uint32_t numsources, const bool advanced, const bool sizecontrol, const uint32_t additionalbytes);
+
+// distance-based brightness for ellipse rendering: returns maxBrightness at the ellipse center, fading linearly to 0 at the ellipse edge
+// dx, dy: offset from the ellipse center; rxsq, rysq: squared radii in the same units as dx, dy; a degenerate ellipse (squared radius <= 0) returns 0
+inline uint8_t calculateEllipseBrightness(int32_t dx, int32_t dy, int32_t rxsq, int32_t rysq, uint8_t maxBrightness) {
+  if (rxsq <= 0 || rysq <= 0) return 0; // degenerate ellipse: nothing to draw, also prevents a division by zero (or by a wrapped negative value) below
+  uint32_t dx_sq = dx * dx; // square the distances
+  uint32_t dy_sq = dy * dy;
+  if (dx_sq >= (uint32_t)rxsq || dy_sq >= (uint32_t)rysq) return 0; // outside on one axis alone -> 0 without dividing (the sum below would be >= 256 anyway)
+  uint32_t dist_sq = ((dx_sq << 8) / (uint32_t)rxsq) + ((dy_sq << 8) / (uint32_t)rysq); // normalized squared distance in fixed point: (dx²/rx²) * 256 + (dy²/ry²) * 256
+  if (dist_sq >= 256) return 0;  // pixel is outside the ellipse, unit radius in fixed point: 256 = 1.0
+  //if (dist_sq <= 96) return maxBrightness; // core at full brightness
+  int32_t falloff = 256 - dist_sq; // 1..256, largest at the center
+  return (maxBrightness * falloff) >> 8; // linear falloff
+  //return (maxBrightness * falloff * falloff) >> 16; // squared falloff for even softer edges
+}
 #endif // WLED_DISABLE_PARTICLESYSTEM2D
 
 ////////////////////////
@@ -346,7 +363,7 @@ class ParticleSystem1D
   void setColorByPosition(const bool enable);
   void setMotionBlur(const uint8_t bluramount); // note: motion blur can only be used if 'particlesize' is set to zero
   void setSmearBlur(const uint8_t bluramount); // enable 1D smeared blurring of full frame
-  void setParticleSize(const uint8_t size); //size 0 = 1 pixel, size 1 = 2 pixels, is overruled if advanced particle is used
+  void setParticleSize(const uint8_t size); // particle diameter: size 0 = 1 pixel, size 1 = 2 pixels, size = 255 = 10 pixels, disables per particle size control if called
   void setGravity(int8_t force = 8);
   void enableParticleCollisions(bool enable, const uint8_t hardness = 255);
 

From a421cfeabe23d27189c8c8893dbda15c51f6dbe8 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sat, 6 Dec 2025 16:31:09 +0100
Subject: [PATCH 2/4] adding mass-ratio to collisions for different sized
 particles

---
 wled00/FXparticleSystem.cpp | 34 +++++++++++++++++++++++++---------
 wled00/FXparticleSystem.h   |  2 +-
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index ea31430ef4..1a9bd74b47 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -849,6 +849,8 @@ void ParticleSystem2D::handleCollisions() {
       if (pidx >= usedParticles) pidx = 0; // wrap around
     }
 
+    uint32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
+    uint32_t massratio2 = 0;
     for (uint32_t i = 0; i < binParticleCount; i++) { // go though all 'higher number' particles in this bin and see if any of those are in close proximity and if they are, make them collide
       uint32_t idx_i = binIndices[i];
       for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
@@ -856,12 +858,18 @@ void ParticleSystem2D::handleCollisions() {
         if (perParticleSize && advPartProps != nullptr) { // using individual particle size
           collDistSq = (PS_P_MINHARDRADIUS << 1) + ((((uint32_t)advPartProps[idx_i].size + (uint32_t)advPartProps[idx_j].size) * 52) >> 6); // collision distance, use 80% of size for tighter stacking (slight overlap)
           collDistSq = collDistSq * collDistSq; // square it for faster comparison
+          // calculate mass ratio for collision response
+          uint32_t mass1 = 1 + ((uint32_t)advPartProps[idx_i].size * advPartProps[idx_i].size); // +1 to avoid division by zero
+          uint32_t mass2 = ((uint32_t)advPartProps[idx_j].size * advPartProps[idx_j].size);
+          uint32_t totalmass = mass1 + mass2;
+          massratio1 = (mass2 << 8) / totalmass; // massratio 1 depends on mass of particle 2, i.e. if 2 is heavier -> higher velocity impact on 1
+          massratio2 = (mass1 << 8) / totalmass;
         }
         int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance with lookahead
         if (dx * dx < collDistSq) { // check x direction, if close, check y direction (squaring is faster than abs() or dual compare)
           int32_t dy = (particles[idx_j].y + particles[idx_j].vy)  - (particles[idx_i].y + particles[idx_i].vy); // distance with lookahead
           if (dy * dy < collDistSq) // particles are close
-            collideParticles(particles[idx_i], particles[idx_j], dx, dy, collDistSq);
+            collideParticles(particles[idx_i], particles[idx_j], dx, dy, collDistSq, massratio1, massratio2);
         }
       }
     }
@@ -871,7 +879,7 @@ void ParticleSystem2D::handleCollisions() {
 
 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
+void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, uint32_t massratio1, uint32_t massratio2) {
   int32_t distanceSquared = dx * dx + dy * dy;
   // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
   int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
@@ -899,11 +907,11 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
   int32_t dotProduct = (dx * relativeVx + dy * relativeVy); // is always negative if moving towards each other
 
   if (dotProduct < 0) {// particles are moving towards each other
-    // integer math used to avoid floats.
+    // integer math is much faster than using floats (float divisions are slow on all ESPs)
     // overflow check: dx/dy are 7bit, relativV are 8bit -> dotproduct is 15bit, dotproduct/distsquared ist 8b, multiplied by collisionhardness of 8bit. so a 16bit shift is ok, make it 15 to be sure no overflows happen
     // note: cannot use right shifts as bit shifting in right direction is asymmetrical for positive and negative numbers and this needs to be accurate! the trick is: only shift positive numers
     // Calculate new velocities after collision
-    int32_t surfacehardness = 1 + max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS); // if particles are soft, the impulse must stay above a limit or collisions slip through at higher speeds, 170 seems to be a good value
+    int32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS >> 1); // if particles are soft, the impulse must stay above a limit or collisions slip through at higher speeds, 170 seems to be a good value
     int32_t impulse = (((((-dotProduct) << 15) / distanceSquared) * surfacehardness) >> 8); // note: inverting before bitshift corrects for asymmetry in right-shifts (is slightly faster)
 
     #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
@@ -913,11 +921,19 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
     int32_t ximpulse = (impulse * dx) / 32767;
     int32_t yimpulse = (impulse * dy) / 32767;
     #endif
-    particle1.vx -= ximpulse; // note: impulse is inverted, so subtracting it
-    particle1.vy -= yimpulse;
-    particle2.vx += ximpulse;
-    particle2.vy += yimpulse;
-
+    // if particles are not the same size, use a mass ratio. mass ratio is set to 0 if particles are the same size
+    if (massratio1) {
+      particle1.vx -= (ximpulse * massratio1) >> 7; // mass ratio is in fixed point 8bit, multiply by two to account for the fact that we distribute the impulse to both particles
+      particle1.vy -= (yimpulse * massratio1) >> 7;
+      particle2.vx += (ximpulse * massratio2) >> 7;
+      particle2.vy += (yimpulse * massratio2) >> 7;
+    }
+    else {
+      particle1.vx -= ximpulse; // note: impulse is inverted, so subtracting it
+      particle1.vy -= yimpulse;
+      particle2.vx += ximpulse;
+      particle2.vy += yimpulse;
+    }
     if (collisionHardness < PS_P_MINSURFACEHARDNESS && (SEGMENT.call & 0x07) == 0) { // if particles are soft, they become 'sticky' i.e. apply some friction (they do pile more nicely and stop sloshing around)
       const uint32_t coeff = collisionHardness + (255 - PS_P_MINSURFACEHARDNESS);
       // Note: could call applyFriction, but this is faster and speed is key here
diff --git a/wled00/FXparticleSystem.h b/wled00/FXparticleSystem.h
index 0ff510c41a..e4f203ee30 100644
--- a/wled00/FXparticleSystem.h
+++ b/wled00/FXparticleSystem.h
@@ -201,7 +201,7 @@ class ParticleSystem2D {
   //paricle physics applied by system if flags are set
   void applyGravity(); // applies gravity to all particles
   void handleCollisions();
-  [[gnu::hot]] void collideParticles(PSparticle &particle1, PSparticle &particle2, const int32_t dx, const int32_t dy, const uint32_t collDistSq);
+  void collideParticles(PSparticle &particle1, PSparticle &particle2, const int32_t dx, const int32_t dy, const uint32_t collDistSq, uint32_t massratio1, uint32_t massratio2);
   void fireParticleupdate();
   //utility functions
   void updatePSpointers(const bool isadvanced, const bool sizecontrol); // update the data pointers to current segment data space

From 19bc3c513a69a69c39a0b324bc8af97c18366f72 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sat, 13 Dec 2025 19:05:21 +0100
Subject: [PATCH 3/4] lots of tweaks, updated 1D rendering and collisions

- bugfix in mass based 2D collisions
- added improved and faster large size rendering to 1D system
- added per-particle size rendering to 1D system
- improved and simplified collision handling in 1D system
- removed local blurring functions in PS as they are not needed anymore for particle rendering
- adapted FX to work with the new rendering
- fixed outdated AR handling in PS FX
- fixed infinite loop if not enough memory
- updated PS Hourglass drop interval to simpler math: speed / 10 = time in seconds and improved particle handling
- reduced speed in PS Pinball to fix collision slip-through
- PS Box now auto-adjusts number of particles based on matrix size and particle size
- added safety check to 2D particle rendering to not crash if something goes wrong with out-of-bounds particle rendering
- improved binning for particle collisions: don't use binning for a small number of particles (faster)
- Some code cleanup
---
 wled00/FX.cpp               | 157 ++++----
 wled00/FXparticleSystem.cpp | 687 ++++++++++++++++++------------------
 wled00/FXparticleSystem.h   |  20 +-
 3 files changed, 421 insertions(+), 443 deletions(-)

diff --git a/wled00/FX.cpp b/wled00/FX.cpp
index 83e6785492..09343902f3 100644
--- a/wled00/FX.cpp
+++ b/wled00/FX.cpp
@@ -8243,7 +8243,7 @@ uint16_t mode_particlefire(void) {
   uint32_t numFlames; // number of flames: depends on fire width. for a fire width of 16 pixels, about 25-30 flames give good results
 
   if (SEGMENT.call == 0) { // initialization
-    if (!initParticleSystem2D(PartSys, SEGMENT.virtualWidth(), 4)) //maximum number of source (PS may limit based on segment size); need 4 additional bytes for time keeping (uint32_t lastcall)
+    if (!initParticleSystem2D(PartSys, SEGMENT.vWidth(), 4)) //maximum number of source (PS may limit based on segment size); need 4 additional bytes for time keeping (uint32_t lastcall)
       return mode_static(); // allocation failed or not 2D
     SEGENV.aux0 = hw_random16(); // aux0 is wind position (index) in the perlin noise
   }
@@ -8283,10 +8283,10 @@ uint16_t mode_particlefire(void) {
       PartSys->sources[i].source.x = (PartSys->maxX >> 1) - (spread >> 1) + hw_random(spread); // change flame position: distribute randomly on chosen width
       PartSys->sources[i].source.y = -(PS_P_RADIUS << 2); // set the source below the frame
       PartSys->sources[i].source.ttl = 20 + hw_random16((SEGMENT.custom1 * SEGMENT.custom1) >> 8) / (1 + (firespeed >> 5)); //'hotness' of fire, faster flames reduce the effect or flame height will scale too much with speed
-      PartSys->sources[i].maxLife = hw_random16(SEGMENT.virtualHeight() >> 1) + 16; // defines flame height together with the vy speed, vy speed*maxlife/PS_P_RADIUS is the average flame height
+      PartSys->sources[i].maxLife = hw_random16(SEGMENT.vHeight() >> 1) + 16; // defines flame height together with the vy speed, vy speed*maxlife/PS_P_RADIUS is the average flame height
       PartSys->sources[i].minLife = PartSys->sources[i].maxLife >> 1;
       PartSys->sources[i].vx = hw_random16(5) - 2; // emitting speed (sideways)
-      PartSys->sources[i].vy = (SEGMENT.virtualHeight() >> 1) + (firespeed >> 4) + (SEGMENT.custom1 >> 4); // emitting speed (upwards)
+      PartSys->sources[i].vy = (SEGMENT.vHeight() >> 1) + (firespeed >> 4) + (SEGMENT.custom1 >> 4); // emitting speed (upwards)
       PartSys->sources[i].var = 2 + hw_random16(2 + (firespeed >> 4)); // speed variation around vx,vy (+/- var)
     }
   }
@@ -8316,11 +8316,11 @@ uint16_t mode_particlefire(void) {
   if(hw_random8() < 10 + (SEGMENT.intensity >> 2)) {
     for (i = 0; i < PartSys->usedParticles; i++) {
       if (PartSys->particles[i].ttl == 0) { // find a dead particle
-        PartSys->particles[i].ttl = hw_random16(SEGMENT.virtualHeight()) + 30;
+        PartSys->particles[i].ttl = hw_random16(SEGMENT.vHeight()) + 30;
         PartSys->particles[i].x = PartSys->sources[0].source.x;
         PartSys->particles[i].y = PartSys->sources[0].source.y;
         PartSys->particles[i].vx = PartSys->sources[0].source.vx;
-        PartSys->particles[i].vy = (SEGMENT.virtualHeight() >> 1) + (firespeed >> 4) + ((30 + (SEGMENT.intensity >> 1) + SEGMENT.custom1) >> 4); // emitting speed (upwards)
+        PartSys->particles[i].vy = (SEGMENT.vHeight() >> 1) + (firespeed >> 4) + ((30 + (SEGMENT.intensity >> 1) + SEGMENT.custom1) >> 4); // emitting speed (upwards)
         break; // emit only one particle
       }
     }
@@ -8508,9 +8508,11 @@ uint16_t mode_particlebox(void) {
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
   PartSys->setWallHardness(min(SEGMENT.custom2, (uint8_t)200)); // wall hardness is 200 or more
   PartSys->enableParticleCollisions(true, max(2, (int)SEGMENT.custom2)); // enable collisions and set particle collision hardness
-  PartSys->setUsedParticles(map(SEGMENT.intensity, 0, 255, 2, 153)); // 1% - 60%
+  int maxParticleSize = min(((SEGMENT.vWidth() * SEGMENT.vHeight()) >> 2), 255U); // max particle size based on matrix size
+  unsigned currentParticleSize = map(SEGMENT.custom3, 0, 31, 0, maxParticleSize);
+  PartSys->setUsedParticles(map(SEGMENT.intensity, 0, 255, 2, 153) / (1 + (currentParticleSize >> 4))); // 1% - 60%, reduce if using larger size
   if (SEGMENT.custom3 < 31)
-    PartSys->setParticleSize(SEGMENT.custom3<<3); // set global size if not max (resets perParticleSize)
+    PartSys->setParticleSize(currentParticleSize); // set global size if not max (resets perParticleSize)
   else
     PartSys->perParticleSize = true; // per particle size, uses advPartProps.size (randomized below)
 
@@ -8523,7 +8525,7 @@ uint16_t mode_particlebox(void) {
       PartSys->particles[i].hue = hw_random8(); // make it colorful
       PartSys->particleFlags[i].perpetual = true; // never die
       PartSys->particleFlags[i].collide = true; // all particles colllide
-      PartSys->advPartProps[i].size = hw_random8(); // random size, used only if size is set to max (SEGMENT.custom3=31)
+      PartSys->advPartProps[i].size = hw_random8(maxParticleSize); // random size, used only if size is set to max (SEGMENT.custom3=31)
       break; // only spawn one particle per frame for less chaotic transitions
     }
   }
@@ -8813,13 +8815,11 @@ uint16_t mode_particleattractor(void) {
     PartSys->angleEmit(PartSys->sources[0], SEGENV.aux0 + 0x7FFF, 12); // emit at 180° as well
   // apply force
   uint32_t strength = SEGMENT.speed;
-  #ifdef USERMOD_AUDIOREACTIVE
   um_data_t *um_data;
   if (UsermodManager::getUMData(&um_data, USERMOD_ID_AUDIOREACTIVE)) { // AR active, do not use simulated data
     uint32_t volumeSmth = (uint32_t)(*(float*) um_data->u_data[0]); // 0-255
     strength = (SEGMENT.speed * volumeSmth) >> 8;
   }
-  #endif
   for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
     PartSys->pointAttractor(i, *attractor, strength, SEGMENT.check3);
   }
@@ -8878,7 +8878,6 @@ uint16_t mode_particlespray(void) {
   PartSys->sources[0].source.y = map(SEGMENT.custom2, 0, 255, 0, PartSys->maxY);
   uint16_t angle = (256 - (((int32_t)SEGMENT.custom3 + 1) << 3)) << 8;
 
-  #ifdef USERMOD_AUDIOREACTIVE
   um_data_t *um_data;
   if (UsermodManager::getUMData(&um_data, USERMOD_ID_AUDIOREACTIVE)) { // get AR data, do not use simulated data
     uint32_t volumeSmth  = (uint8_t)(*(float*)   um_data->u_data[0]); //0 to 255
@@ -8902,17 +8901,6 @@ uint16_t mode_particlespray(void) {
       PartSys->angleEmit(PartSys->sources[0], angle, SEGMENT.speed >> 2);
     }
   }
-  #else
-  // change source properties
-  if (SEGMENT.call % (11 - (SEGMENT.intensity / 25)) == 0) { // every nth frame, cycle color and emit particles
-    PartSys->sources[0].maxLife = 300; // lifetime in frames. note: could be done in init part, but AR moderequires this to be dynamic
-    PartSys->sources[0].minLife = 100;
-    PartSys->sources[0].source.hue++; // = hw_random16(); //change hue of spray source
-    // PartSys->sources[i].var = SEGMENT.custom3; // emiting variation = nozzle size (custom 3 goes from 0-32)
-    // spray[j].source.hue = hw_random16(); //set random color for each particle (using palette)
-    PartSys->angleEmit(PartSys->sources[0], angle, SEGMENT.speed >> 2);
-  }
-  #endif
 
   PartSys->update(); // update and render
   return FRAMETIME;
@@ -9204,16 +9192,14 @@ uint16_t mode_particleblobs(void) {
   SEGENV.aux0 = SEGMENT.speed; //write state back
   SEGENV.aux1 = SEGMENT.custom1;
 
-  #ifdef USERMOD_AUDIOREACTIVE
   um_data_t *um_data;
-  if (UsermodManager::getUMData(&um_data, USERMOD_ID_AUDIOREACTIVE)) { // get AR data, do not use simulated data
+  if (UsermodManager::getUMData(&um_data, USERMOD_ID_AUDIOREACTIVE)) { // get AR data if available, do not use simulated data
     uint8_t volumeSmth = (uint8_t)(*(float*)um_data->u_data[0]);
     for (uint32_t i = 0; i < PartSys->usedParticles; i++) { // update particles
       if (SEGMENT.check3) //pulsate selected
         PartSys->advPartProps[i].size = volumeSmth;
     }
   }
-  #endif
 
   PartSys->setMotionBlur(((SEGMENT.custom3) << 3) + 7);
   PartSys->update(); // update and render
@@ -9422,7 +9408,7 @@ uint16_t mode_particleDrip(void) {
       if (PartSys->particles[i].hue < 245)
         PartSys->particles[i].hue += 8;
     }
-    //increase speed on high settings by calling the move function twice
+    //increase speed on high settings by calling the move function twice note: this can lead to missed collisions
     if (SEGMENT.speed > 200)
       PartSys->particleMoveUpdate(PartSys->particles[i], PartSys->particleFlags[i]);
   }
@@ -9434,8 +9420,8 @@ static const char _data_FX_MODE_PARTICLEDRIP[] PROGMEM = "PS DripDrop@Speed,!,Sp
 
 
 /*
-  Particle Replacement for "Bbouncing Balls by Aircoookie"
-  Also replaces rolling balls and juggle (and maybe popcorn)
+  Particle Version of "Bouncing Balls by Aircoookie"
+  Also does rolling balls and juggle (and popcorn)
   Uses palette for particle color
   by DedeHai (Damian Schneider)
 */
@@ -9446,10 +9432,10 @@ uint16_t mode_particlePinball(void) {
     if (!initParticleSystem1D(PartSys, 1, 128, 0, true)) // init
       return mode_static(); // allocation failed or is single pixel
     PartSys->sources[0].sourceFlags.collide = true; // seeded particles will collide (if enabled)
-    PartSys->sources[0].source.x = PS_P_RADIUS_1D; //emit at bottom
-    PartSys->setKillOutOfBounds(true); // out of bounds particles dont return
+    PartSys->sources[0].source.x = -1000; // shoot up from below
+    //PartSys->setKillOutOfBounds(true); // out of bounds particles dont return (except on top, taken care of by gravity setting)
     SEGENV.aux0 = 1;
-    SEGENV.aux1 = 5000; //set out of range to ensure uptate on first call
+    SEGENV.aux1 = 5000; // set settings out of range to ensure update on first call
   }
   else
     PartSys = reinterpret_cast<ParticleSystem1D *>(SEGENV.data); // if not first call, just set the pointer to the PS
@@ -9460,69 +9446,71 @@ uint16_t mode_particlePinball(void) {
   // Particle System settings
   //uint32_t hardness = 240 + (SEGMENT.custom1>>4);
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
-  PartSys->setGravity(map(SEGMENT.custom3, 0 , 31, 0 , 16)); // set gravity (8 is default strength)
+  PartSys->setGravity(map(SEGMENT.custom3, 0 , 31, 0 , 8)); // set gravity (8 is default strength)
   PartSys->setBounce(SEGMENT.custom3); // disables bounce if no gravity is used
   PartSys->setMotionBlur(SEGMENT.custom2); // anable motion blur
   PartSys->enableParticleCollisions(SEGMENT.check1, 255); // enable collisions and set particle collision to high hardness
-  PartSys->setUsedParticles(SEGMENT.intensity);
   PartSys->setColorByPosition(SEGMENT.check3);
-  /*
-  // TODO: update 1D system to use the same logic for per particle size as 2D system
+  uint32_t maxParticles = max(20, SEGMENT.intensity / (1 + (SEGMENT.check2 * (SEGMENT.custom1 >> 5)))); // max particles depends on intensity and rolling balls mode + size
   if (SEGMENT.custom1 < 255)
     PartSys->setParticleSize(SEGMENT.custom1); // set size globally
-  else
-    PartSys->perParticleSize = true;
-  */ 
+  else {
+    PartSys->perParticleSize = true; // use random individual particle size (see below)
+    maxParticles *= 2; // use more particles if individual size is used as there is more space
+  }
+  PartSys->setUsedParticles(maxParticles); // reduce if using larger size and rolling balls mode
 
   bool updateballs = false;
   if (SEGENV.aux1 != SEGMENT.speed + SEGMENT.intensity + SEGMENT.check2 + SEGMENT.custom1 + PartSys->usedParticles) { // user settings change or more particles are available
     SEGENV.step = SEGMENT.call; // reset delay
     updateballs = true;
-    PartSys->sources[0].maxLife = SEGMENT.custom3 ? 5000 : 0xFFFF; // maximum lifetime in frames/2 (very long if not using gravity, this is enough to travel 4000 pixels at min speed)
+    PartSys->sources[0].maxLife = SEGMENT.custom3 ? 1000 : 0xFFFF; // maximum lifetime in frames/2 (very long if not using gravity, this is enough to travel 4000 pixels at min speed)
     PartSys->sources[0].minLife = PartSys->sources[0].maxLife >> 1;
   }
 
-  if (SEGMENT.check2) { //rolling balls
+  if (SEGMENT.check2) { // rolling balls
     PartSys->setGravity(0);
     PartSys->setWallHardness(255);
     int speedsum = 0;
     for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
-        PartSys->particles[i].ttl = 260; // keep particles alive
-      if (updateballs) { //speed changed or particle is dead, set particle properties
+        PartSys->particles[i].ttl = 500; // keep particles alive
+      if (updateballs) { // speed changed or particle is dead, set particle properties
         PartSys->particleFlags[i].collide = true;
-        if (PartSys->particles[i].x == 0) { // still at initial position (when not switching from a PS)
+        if (PartSys->particles[i].x == 0) { // still at initial position
           PartSys->particles[i].x = hw_random16(PartSys->maxX); // random initial position for all particles
           PartSys->particles[i].vx = (hw_random16() & 0x01) ? 1 : -1; // random initial direction
         }
         PartSys->particles[i].hue = hw_random8(); //set ball colors to random
         PartSys->advPartProps[i].sat = 255;
-        PartSys->advPartProps[i].size = SEGMENT.custom1 < 255 ? SEGMENT.custom1 : hw_random8(); //set ball size
+        PartSys->advPartProps[i].size = hw_random8(); // set ball size for individual size mode
       }
       speedsum += abs(PartSys->particles[i].vx);
     }
     int32_t avgSpeed = speedsum / PartSys->usedParticles;
-    int32_t setSpeed = 2 + (SEGMENT.speed >> 3);
+    int32_t setSpeed = 2 + (SEGMENT.speed >> 2);
     if (avgSpeed < setSpeed) { // if balls are slow, speed up some of them at random to keep the animation going
       for (int i = 0; i < setSpeed - avgSpeed; i++) {
         int idx = hw_random16(PartSys->usedParticles);
-        PartSys->particles[idx].vx += PartSys->particles[idx].vx >= 0 ? 1 : -1; // add 1, keep direction
+        if (abs(PartSys->particles[idx].vx) < PS_P_MAXSPEED)
+          PartSys->particles[idx].vx += PartSys->particles[idx].vx >= 0 ? 1 : -1; // add 1, keep direction
       }
     }
     else if (avgSpeed > setSpeed + 8) // if avg speed is too high, apply friction to slow them down
       PartSys->applyFriction(1);
   }
-  else { //bouncing balls
+  else { // bouncing balls
     PartSys->setWallHardness(220);
     PartSys->sources[0].var = SEGMENT.speed >> 3;
     int32_t newspeed = 2 + (SEGMENT.speed >> 1) - (SEGMENT.speed >> 3);
     PartSys->sources[0].v = newspeed;
     //check for balls that are 'laying on the ground' and remove them
     for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
-      if (PartSys->particles[i].vx == 0 && PartSys->particles[i].x < (PS_P_RADIUS_1D + SEGMENT.custom1))
-        PartSys->particles[i].ttl = 0;
+      if (PartSys->particles[i].ttl < 50) PartSys->particles[i].ttl = 0; // no dark particles
+      else if (PartSys->particles[i].vx == 0 && PartSys->particles[i].x < (PS_P_RADIUS_1D + SEGMENT.custom1))
+        PartSys->particles[i].ttl -= 50; // age fast
+
       if (updateballs) {
-        PartSys->advPartProps[i].size = SEGMENT.custom1;
-        if (SEGMENT.custom3 == 0) //gravity off, update speed
+        if (SEGMENT.custom3 == 0) // gravity off, update speed
           PartSys->particles[i].vx = PartSys->particles[i].vx > 0 ? newspeed : -newspeed; //keep the direction
       }
     }
@@ -9533,14 +9521,14 @@ uint16_t mode_particlePinball(void) {
       SEGENV.step += interval + hw_random16(interval);
       PartSys->sources[0].source.hue = hw_random16(); //set ball color
       PartSys->sources[0].sat = 255;
-      PartSys->sources[0].size = SEGMENT.custom1 < 255 ? SEGMENT.custom1 : hw_random8(); //set ball size
+      PartSys->sources[0].size = hw_random8(); //set ball size
       PartSys->sprayEmit(PartSys->sources[0]);
     }
   }
   SEGENV.aux1 = SEGMENT.speed + SEGMENT.intensity + SEGMENT.check2 + SEGMENT.custom1 + PartSys->usedParticles;
-  for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
-    PartSys->particleMoveUpdate(PartSys->particles[i], PartSys->particleFlags[i]); // double the speed
-  }
+  //for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
+  //  PartSys->particleMoveUpdate(PartSys->particles[i], PartSys->particleFlags[i]); // double the speed  note: this leads to bad collisions, also need to run collision detection before
+  //}
 
   PartSys->update(); // update and render
   return FRAMETIME;
@@ -9888,7 +9876,7 @@ uint16_t mode_particleHourglass(void) {
   PartSys->setUsedParticles(1 + ((SEGMENT.intensity * 255) >> 8));
   PartSys->setMotionBlur(SEGMENT.custom2); // anable motion blur
   PartSys->setGravity(map(SEGMENT.custom3, 0, 31, 1, 30));
-  PartSys->enableParticleCollisions(true, 32); // hardness value found by experimentation on different settings
+  PartSys->enableParticleCollisions(true, 64); // hardness value (found by experimentation on different settings)
 
   uint32_t colormode = SEGMENT.custom1 >> 5; // 0-7
 
@@ -9902,6 +9890,12 @@ uint16_t mode_particleHourglass(void) {
     SEGENV.aux0 = PartSys->usedParticles - 1; // initial state, start with highest number particle
   }
 
+  // re-order particles in case heavy collisions flipped particles (highest number index particle is on the "bottom")
+  for (uint32_t i = 0; i < PartSys->usedParticles - 1; i++) {
+    if (PartSys->particles[i].x < PartSys->particles[i+1].x && PartSys->particleFlags[i].fixed == false && PartSys->particleFlags[i+1].fixed == false) {
+      std::swap(PartSys->particles[i].x, PartSys->particles[i+1].x);
+    }
+  }
   // calculate target position depending on direction
   auto calcTargetPos = [&](size_t i) {
     return PartSys->particleFlags[i].reversegrav ?
@@ -9909,12 +9903,12 @@ uint16_t mode_particleHourglass(void) {
         : (PartSys->usedParticles - i) * PS_P_RADIUS_1D - positionOffset;
   };
 
-
   for (uint32_t i = 0; i < PartSys->usedParticles; i++) { // check if particle reached target position after falling
     if (PartSys->particleFlags[i].fixed == false && abs(PartSys->particles[i].vx) < 5) {
       int32_t targetposition = calcTargetPos(i);
-      bool closeToTarget = abs(targetposition - PartSys->particles[i].x) < 3 * PS_P_RADIUS_1D;
-      if (closeToTarget) { // close to target and slow speed
+      bool belowtarget = PartSys->particleFlags[i].reversegrav ? (PartSys->particles[i].x > targetposition) : (PartSys->particles[i].x < targetposition);
+      bool closeToTarget = abs(targetposition - PartSys->particles[i].x) < PS_P_RADIUS_1D;
+      if (belowtarget || closeToTarget) { // overshot target or close to target and slow speed
         PartSys->particles[i].x = targetposition; // set exact position
         PartSys->particleFlags[i].fixed = true;   // pin particle
       }
@@ -9928,25 +9922,17 @@ uint16_t mode_particleHourglass(void) {
         case 0: PartSys->particles[i].hue = 120; break; // fixed at 120, if flip is activated, this can make red and green (use palette 34)
         case 1: PartSys->particles[i].hue = basehue; break; // fixed selectable color
         case 2: // 2 colors inverleaved (same code as 3)
-        case 3: PartSys->particles[i].hue = ((SEGMENT.custom1 & 0x1F) << 1) + (i % colormode)*74; break; // interleved colors (every 2 or 3 particles)
+        case 3: PartSys->particles[i].hue = ((SEGMENT.custom1 & 0x1F) << 1) + (i % 3)*74; break; // 3 interleaved colors
         case 4: PartSys->particles[i].hue = basehue + (i * 255) / PartSys->usedParticles;  break; // gradient palette colors
         case 5: PartSys->particles[i].hue = basehue + (i * 1024) / PartSys->usedParticles;  break; // multi gradient palette colors
         case 6: PartSys->particles[i].hue = i + (strip.now >> 3);  break; // disco! moving color gradient
-        default: break;
+        default: break; // use color by position
       }
     }
     if (SEGMENT.check1 && !PartSys->particleFlags[i].reversegrav) // flip color when fallen
       PartSys->particles[i].hue += 120;
   }
 
-  // re-order particles in case collisions flipped particles (highest number index particle is on the "bottom")
-  for (uint32_t i = 0; i < PartSys->usedParticles - 1; i++) {
-    if (PartSys->particles[i].x < PartSys->particles[i+1].x && PartSys->particleFlags[i].fixed == false && PartSys->particleFlags[i+1].fixed == false) {
-      std::swap(PartSys->particles[i].x, PartSys->particles[i+1].x);
-    }
-  }
-
-
   if (SEGENV.aux1 == 1) { // last countdown call before dropping starts, reset all particles
     for (uint32_t i = 0; i < PartSys->usedParticles; i++) {
       PartSys->particleFlags[i].collide = true;
@@ -9958,19 +9944,19 @@ uint16_t mode_particleHourglass(void) {
   }
 
   if (SEGENV.aux1 == 0) { // countdown passed, run
-    if (strip.now >= SEGENV.step) { // drop a particle, do not drop more often than every second frame or particles tangle up quite badly
+    if (strip.now >= SEGENV.step) { // drop a particle
       // set next drop time
       if (SEGMENT.check3 && *direction) // fast reset
         SEGENV.step = strip.now + 100; // drop one particle every 100ms
       else // normal interval
-        SEGENV.step = strip.now + max(20, SEGMENT.speed * 20); // map speed slider from 0.1s to 5s
+        SEGENV.step = strip.now + max(100, SEGMENT.speed * 100); // map speed slider from 0.1s to 25.5s
       if (SEGENV.aux0 < PartSys->usedParticles) {
         PartSys->particleFlags[SEGENV.aux0].reversegrav = *direction; // let this particle fall or rise
         PartSys->particleFlags[SEGENV.aux0].fixed = false; // unpin
       }
       else { // overflow
         *direction = !(*direction); // flip direction
-        SEGENV.aux1 = SEGMENT.virtualLength() + 100; // set countdown
+        SEGENV.aux1 = (SEGMENT.check2) * SEGMENT.vLength() + 100; // set restart countdown, make it short if auto start is unchecked
       }
       if (*direction == 0) // down, start dropping the highest number particle
         SEGENV.aux0--; // next particle
@@ -9978,14 +9964,14 @@ uint16_t mode_particleHourglass(void) {
         SEGENV.aux0++;
     }
   }
-  else if (SEGMENT.check2) // auto reset
+  else if (SEGMENT.check2) // auto start/reset
     SEGENV.aux1--; // countdown
 
   PartSys->update(); // update and render
 
   return FRAMETIME;
 }
-static const char _data_FX_MODE_PS_HOURGLASS[] PROGMEM = "PS Hourglass@Interval,!,Color,Blur,Gravity,Colorflip,Start,Fast Reset;,!;!;1;pal=34,sx=50,ix=200,c1=140,c2=80,c3=4,o1=1,o2=1,o3=1";
+static const char _data_FX_MODE_PS_HOURGLASS[] PROGMEM = "PS Hourglass@Interval,!,Color,Blur,Gravity,Colorflip,Start,Fast Reset;,!;!;1;pal=34,sx=5,ix=200,c1=140,c2=80,c3=4,o1=1,o2=1,o3=1";
 
 /*
   Particle based Spray effect (like a volcano, possible replacement for popcorn)
@@ -10102,7 +10088,7 @@ uint16_t mode_particleBalance(void) {
   }
 
   uint32_t randomindex = hw_random16(PartSys->usedParticles);
-  PartSys->particles[randomindex].vx = ((int32_t)PartSys->particles[randomindex].vx * 200) / 255;  // apply friction to random particle to reduce clumping (without collisions)
+  PartSys->particles[randomindex].vx = ((int32_t)PartSys->particles[randomindex].vx * 200) / 255;  // apply friction to random particle to reduce clumping
 
   //if (SEGMENT.check2 && (SEGMENT.call & 0x07) == 0) // no walls, apply friction to smooth things out
   if ((SEGMENT.call & 0x0F) == 0 && SEGMENT.custom3 > 4) // apply friction every 16th frame to smooth things out (except for low tilt)
@@ -10128,7 +10114,7 @@ by DedeHai (Damian Schneider)
 uint16_t mode_particleChase(void) {
   ParticleSystem1D *PartSys = nullptr;
   if (SEGMENT.call == 0) { // initialization
-    if (!initParticleSystem1D(PartSys, 1, 255, 2, true)) // init
+    if (!initParticleSystem1D(PartSys, 1, 191, 2, true)) // init
       return mode_static(); // allocation failed or is single pixel
     SEGENV.aux0 = 0xFFFF; // invalidate
     *PartSys->PSdataEnd = 1; // huedir
@@ -10142,15 +10128,17 @@ uint16_t mode_particleChase(void) {
   PartSys->updateSystem(); // update system properties (dimensions and data pointers)
   PartSys->setColorByPosition(SEGMENT.check3);
   PartSys->setMotionBlur(7 + ((SEGMENT.custom3) << 3)); // anable motion blur
-  uint32_t numParticles = 1 + map(SEGMENT.intensity, 0, 255, 2, 255 / (1 + (SEGMENT.custom1 >> 6))); // depends on intensity and particle size (custom1), minimum 1
+  uint32_t numParticles = 1 + map(SEGMENT.intensity, 0, 255, 0, PartSys->usedParticles / (1 + (SEGMENT.custom1 >> 5))); // depends on intensity and particle size (custom1), minimum 1
   numParticles = min(numParticles, PartSys->usedParticles); // limit to available particles
   int32_t huestep = 1 + ((((uint32_t)SEGMENT.custom2 << 19) / numParticles) >> 16); // hue increment
   uint32_t settingssum = SEGMENT.speed + SEGMENT.intensity + SEGMENT.custom1 + SEGMENT.custom2 + SEGMENT.check1 + SEGMENT.check2 + SEGMENT.check3;
   if (SEGENV.aux0 != settingssum) { // settings changed changed, update
     if (SEGMENT.check1)
       SEGENV.step = PartSys->advPartProps[0].size / 2 + (PartSys->maxX / numParticles);
-    else
-      SEGENV.step = (PartSys->maxX + (PS_P_RADIUS_1D << 5)) / numParticles; // spacing between particles
+    else {
+      SEGENV.step = (PartSys->maxX + (PS_P_RADIUS_1D << 6)) / numParticles; // spacing between particles
+      SEGENV.step = (SEGENV.step / PS_P_RADIUS_1D) * PS_P_RADIUS_1D; // round down to nearest multiple of particle subpixel unit to align to pixel grid (makes them move in unison)
+    }
     for (int32_t i = 0; i < (int32_t)PartSys->usedParticles; i++) {
       PartSys->advPartProps[i].sat = 255;
       PartSys->particles[i].x = (i - 1) * SEGENV.step; // distribute evenly (starts out of frame for i=0)
@@ -10616,7 +10604,7 @@ by DedeHai (Damian Schneider)
 uint16_t mode_particleSpringy(void) {
   ParticleSystem1D *PartSys = nullptr;
   if (SEGMENT.call == 0) { // initialization
-    if (!initParticleSystem1D(PartSys, 1, 128, 0, true)) // init
+    if (!initParticleSystem1D(PartSys, 1, 128, 0, true)) // init with advanced properties (used for spring forces)
       return mode_static(); // allocation failed or is single pixel
     SEGENV.aux0 = SEGENV.aux1 = 0xFFFF; // invalidate settings
   }
@@ -10629,18 +10617,20 @@ uint16_t mode_particleSpringy(void) {
   PartSys->setMotionBlur(220 * SEGMENT.check1); // anable motion blur
   PartSys->setSmearBlur(50); // smear a little
   PartSys->setUsedParticles(map(SEGMENT.custom1, 0, 255, 30 >> SEGMENT.check2, 255  >> (SEGMENT.check2*2))); // depends on density and particle size
- // PartSys->enableParticleCollisions(true, 140); // enable particle collisions, can not be set too hard or impulses will not strech the springs if soft.
+  //PartSys->enableParticleCollisions(true, 140); // enable particle collisions, can not be set too hard or impulses will not strech the springs if soft.
   int32_t springlength = PartSys->maxX / (PartSys->usedParticles); // spring length (spacing between particles)
   int32_t springK = map(SEGMENT.speed, 0, 255, 5, 35); // spring constant (stiffness)
 
   uint32_t settingssum = SEGMENT.custom1 + SEGMENT.check2;
+  PartSys->setParticleSize(SEGMENT.check2 ? 120 : 1); // large or small particles
+
   if (SEGENV.aux0 != settingssum) { // number of particles changed, update distribution
     for (int32_t i = 0; i < (int32_t)PartSys->usedParticles; i++) {
       PartSys->advPartProps[i].sat = 255; // full saturation
-      //PartSys->particleFlags[i].collide = true; // enable collision for particles
+      //PartSys->particleFlags[i].collide = true; // enable collision for particles -> results in chaos, removed for now
       PartSys->particles[i].x = (i+1) * ((PartSys->maxX) / (PartSys->usedParticles)); // distribute
       //PartSys->particles[i].vx = 0; //reset speed
-      PartSys->advPartProps[i].size = SEGMENT.check2 ? 190 : 2; // set size, small or big
+      //PartSys->advPartProps[i].size = SEGMENT.check2 ? 190 : 2; // set size, small or big -> use global size
     }
     SEGENV.aux0 = settingssum;
   }
@@ -10732,7 +10722,6 @@ uint16_t mode_particleSpringy(void) {
       int speed = SEGMENT.custom3 - 10 - (index ? 10 : 0); // map 11-20 and 21-30 to 1-10
       int phase = strip.now * ((1 + (SEGMENT.speed >> 4)) * speed);
       if (SEGMENT.check2) amplitude <<= 1; // double amplitude for XL particles
-      //PartSys->applyForce(PartSys->particles[index], (sin16_t(phase) * amplitude) >> 15, PartSys->advPartProps[index].forcecounter); // apply acceleration
       PartSys->particles[index].x = restposition + ((sin16_t(phase) * amplitude) >> 12); // apply position
     }
     else {
diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index 1a9bd74b47..9aff84b83b 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -88,7 +88,7 @@ void ParticleSystem2D::updateFire(const uint8_t intensity,const bool renderonly)
 
 // set percentage of used particles as uint8_t i.e 127 means 50% for example
 void ParticleSystem2D::setUsedParticles(uint8_t percentage) {
-  usedParticles = (numParticles * ((int)percentage+1)) >> 8; // number of particles to use (percentage is 0-255, 255 = 100%)
+  usedParticles = max((uint32_t)1, (numParticles * ((int)percentage+1)) >> 8); // number of particles to use (percentage is 0-255, 255 = 100%)
   PSPRINT(" SetUsedpaticles: allocated particles: ");
   PSPRINT(numParticles);
   PSPRINT(" ,used particles: ");
@@ -214,7 +214,7 @@ void ParticleSystem2D::flameEmit(const PSsource &emitter) {
 // angle = 0 means in positive x-direction (i.e. to the right)
 int32_t ParticleSystem2D::angleEmit(PSsource &emitter, const uint16_t angle, const int32_t speed) {
   emitter.vx = ((int32_t)cos16_t(angle) * speed) / (int32_t)32600; // cos16_t() and sin16_t() return signed 16bit, division should be 32767 but 32600 gives slightly better rounding
-  emitter.vy = ((int32_t)sin16_t(angle) * speed) / (int32_t)32600; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  emitter.vy = ((int32_t)sin16_t(angle) * speed) / (int32_t)32600; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
   return sprayEmit(emitter);
 }
 
@@ -236,8 +236,11 @@ void ParticleSystem2D::particleMoveUpdate(PSparticle &part, PSparticleFlags &par
     partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view) note: moving this to checks below adds code and is not faster
 
     if (perParticleSize && advancedproperties != nullptr) { // using individual particle size
-      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size;
-      particleHardRadius = PS_P_MINHARDRADIUS + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float")
+      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size; // note: single pixel particles should be zero but OOB checks in rendering function handle this
+      if (advancedproperties->size > 0)
+        particleHardRadius = PS_P_MINHARDRADIUS + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with borders so they do not "float")
+      else // single pixel particles use half the collision distance for walls
+        particleHardRadius = PS_P_MINHARDRADIUS >> 1;
     }
     // note: if wall collisions are enabled, bounce them before they reach the edge, it looks much nicer if the particle does not go half out of view
     if (options->bounceY) {
@@ -446,7 +449,7 @@ void ParticleSystem2D::applyForce(const int8_t xforce, const int8_t yforce) {
 // force is in 3.4 fixed point notation so force=16 means apply v+1 each frame (useful force range is +/- 127)
 void ParticleSystem2D::applyAngleForce(PSparticle &part, const int8_t force, const uint16_t angle, uint8_t &counter) {
   int8_t xforce = ((int32_t)force * cos16_t(angle)) / 32767; // force is +/- 127
-  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
   applyForce(part, xforce, yforce, counter);
 }
 
@@ -460,7 +463,7 @@ void ParticleSystem2D::applyAngleForce(const uint32_t particleindex, const int8_
 // angle is from 0-65535 (=0-360deg) angle = 0 means in positive x-direction (i.e. to the right)
 void ParticleSystem2D::applyAngleForce(const int8_t force, const uint16_t angle) {
   int8_t xforce = ((int32_t)force * cos16_t(angle)) / 32767; // force is +/- 127
-  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = ((int32_t)force * sin16_t(angle)) / 32767; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
   applyForce(xforce, yforce);
 }
 
@@ -543,7 +546,7 @@ void ParticleSystem2D::pointAttractor(const uint32_t particleindex, PSparticle &
 
   int32_t force = ((int32_t)strength << 16) / distanceSquared;
   int8_t xforce = (force * dx) / 1024; // scale to a lower value, found by experimenting
-  int8_t yforce = (force * dy) / 1024; // note: cannot use bit shifts as bit shifting is asymmetrical for positive and negative numbers and this needs to be accurate!
+  int8_t yforce = (force * dy) / 1024; // note: cannot use bit shifts as bit shifting is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate!
   applyForce(particleindex, xforce, yforce);
 }
 
@@ -602,109 +605,16 @@ void ParticleSystem2D::render() {
 
   // apply 2D blur to rendered frame
   if (smearBlur) {
-    blur2D(framebuffer, maxXpixel + 1, maxYpixel + 1, smearBlur, smearBlur);
+    SEGMENT.blur2D(smearBlur, smearBlur, true);
   }
 }
 
-// render particle as ellipse/circle with linear brightness falloff and sub-pixel precision
-void WLED_O2_ATTR ParticleSystem2D::renderParticleEllipse(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
-  uint32_t size = particlesize;
-  if (perParticleSize && advPartProps != nullptr) // individual particle size
-    size = advPartProps[particleindex].size;
-
-  // particle position with sub-pixel precision
-  int32_t x_subcenter = particles[particleindex].x;
-  int32_t y_subcenter = particles[particleindex].y;
-
-  // example: for x = 128, a paticle is exacly between pixel 1 and 2, with a radius of 2 pixels, we draw pixels 0-3
-  // integer center jumps when x = 127 -> pixel 1 goes to x = 128 -> pixel 2
-  // when calculating the dx, we need to take this into account: at x = 128 the x offset is 1, the pixel center is at pixel 2:
-  // for pixel 1, dx = 1 * PS_P_RADIUS - 128 = -64 but the center of the pixel is actually only -32 from the particle center so need to add half a radius:
-  // dx = pixel_x * PS_P_RADIUS - x_subcenter + PS_P_HALFRADIUS
-
-  // sub-pixel offset (0-63)
-  int32_t x_offset = x_subcenter & (PS_P_RADIUS - 1); // same as modulo PS_P_RADIUS but faster
-  int32_t y_offset = y_subcenter & (PS_P_RADIUS - 1);
-  // integer pixel position, this is rounded down
-  int32_t x_center = (x_subcenter) >> PS_P_RADIUS_SHIFT;
-  int32_t y_center = (y_subcenter) >> PS_P_RADIUS_SHIFT;
-
-  // ellipse radii in pixels
-  uint32_t xsize = size;
-  uint32_t ysize = size;
-  if (advPartSize != nullptr && advPartSize[particleindex].asymmetry > 0) {
-    getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
-  }
-
-  int32_t rx_subpixel = xsize+65; // size = 1 means radius of just over 1 pixel
-  int32_t ry_subpixel = ysize+65; // size = 255 is radius of 5, so add 65 -> 65+255=320, 320>>6=5 pixels
-
-  // rendering bounding box in pixels
-  int32_t rx_pixels = (rx_subpixel >> PS_P_RADIUS_SHIFT);
-  int32_t ry_pixels = (ry_subpixel >> PS_P_RADIUS_SHIFT);
-
-  int32_t x_min = x_center - rx_pixels;
-  int32_t x_max = x_center + rx_pixels;
-  int32_t y_min = y_center - ry_pixels;
-  int32_t y_max = y_center + ry_pixels;
-
-  // cache for speed
-  uint32_t matrixX = maxXpixel + 1;
-  uint32_t matrixY = maxYpixel + 1;
-  uint32_t rx_sq = rx_subpixel * rx_subpixel;
-  uint32_t ry_sq = ry_subpixel * ry_subpixel;
-
-  // iterate over bounding box and render each pixel
-  for (int32_t py = y_min; py <= y_max; py++) {
-    for (int32_t px = x_min; px <= x_max; px++) {
-      // distance from particle center, explanation see above
-      int32_t dx_subpixel = (px << PS_P_RADIUS_SHIFT) - x_subcenter + PS_P_HALFRADIUS;
-      int32_t dy_subpixel = (py << PS_P_RADIUS_SHIFT) - y_subcenter + PS_P_HALFRADIUS;
-
-      // calculate brightness based on squared distance to ellipse center
-      uint8_t pixel_brightness = calculateEllipseBrightness(dx_subpixel, dy_subpixel, rx_sq, ry_sq, brightness);
-
-      if (pixel_brightness == 0) continue; // Skip fully transparent pixels
-
-      // apply inverse gamma correction if needed, if this is skipped, particles flicker due to changing total brightness
-      if (gammaCorrectCol) {
-        pixel_brightness = gamma8inv(pixel_brightness); // invert brigthess so brightness distribution is linear after gamma correction
-      }
-
-      // Handle wrapping and bounds
-      int32_t render_x = px;
-      int32_t render_y = py;
-
-      // Check bounds and apply wrapping
-      if (render_x < 0) {
-        if (!wrapX) continue;
-        render_x += matrixX;
-      } else if (render_x > maxXpixel) {
-        if (!wrapX) continue;
-        render_x -= matrixX;
-      }
-
-      if (render_y < 0) {
-        if (!wrapY) continue;
-        render_y += matrixY;
-      } else if (render_y > maxYpixel) {
-        if (!wrapY) continue;
-        render_y -= matrixY;
-      }
-      // Render pixel
-      uint32_t idx = render_x + (maxYpixel - render_y) * matrixX; // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pixel_brightness);
-    }
-  }
-}
-
-
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
   uint32_t size = particlesize;
 
   if (perParticleSize && advPartProps != nullptr) // use advanced size properties
-    size = advPartProps[particleindex].size;
+    size = 1 + advPartProps[particleindex].size; // add 1 to avoid single pixel size particles (collisions do not support it)
 
   if (size == 0) { // single pixel rendering
     uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT;
@@ -717,7 +627,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
   }
 
   if (size > 1) { // size > 1: render as ellipse
-    renderParticleEllipse(particleindex, brightness, color, wrapX, wrapY); // larger size rendering
+    renderLargeParticle(size, particleindex, brightness, color, wrapX, wrapY); // larger size rendering
     return;
   }
 
@@ -760,19 +670,19 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
   // - apply inverse gamma correction to brightness values
   // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
   if (gammaCorrectCol) {
-    pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
-    pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
-    pxlbrightness[2] = gamma8inv(pxlbrightness[2]);
-    pxlbrightness[3] = gamma8inv(pxlbrightness[3]);
+    for (uint32_t i = 0; i < 4; i++) {
+      pxlbrightness[i] = gamma8inv(pxlbrightness[i]); // use look-up-table for inverse gamma
+    }
   }
 
   // standard rendering (2x2 pixels)
   // check for out of frame pixels and wrap them if required: x,y is bottom left pixel coordinate of the particle
-  if (x < 0) { // left pixels out of frame
+  if (pixco[0].x < 0) { // left pixels out of frame
     if (wrapX) { // wrap x to the other side if required
       pixco[0].x = pixco[3].x = maxXpixel;
     } else {
       pixelvalid[0] = pixelvalid[3] = false; // out of bounds
+      if (pixco[0].x < -1) return; // both left pixels out of bounds, no need to continue (safety check)
     }
   }
   else if (pixco[1].x > (int32_t)maxXpixel) { // right pixels, only has to be checked if left pixel is in frame
@@ -780,14 +690,16 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
       pixco[1].x = pixco[2].x = 0;
     } else {
       pixelvalid[1] = pixelvalid[2] = false; // out of bounds
+      if (pixco[0].x > (int32_t)maxXpixel) return; // both pixels out of bounds, no need to continue (safety check)
     }
   }
 
-  if (y < 0) { // bottom pixels out of frame
+  if (pixco[0].y < 0) { // bottom pixels out of frame
     if (wrapY) { // wrap y to the other side if required
       pixco[0].y = pixco[1].y = maxYpixel;
     } else {
       pixelvalid[0] = pixelvalid[1] = false; // out of bounds
+      if (pixco[0].y < -1) return; // both bottom pixels out of bounds, no need to continue (safety check)
     }
   }
   else if (pixco[2].y > maxYpixel) { // top pixels
@@ -795,6 +707,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
       pixco[2].y = pixco[3].y = 0;
     } else {
       pixelvalid[2] = pixelvalid[3] = false; // out of bounds
+      if (pixco[2].y > (int32_t)maxYpixel + 1) return; // both top pixels out of bounds, no need to continue (safety check)
     }
   }
   for (uint32_t i = 0; i < 4; i++) {
@@ -805,32 +718,123 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
   }
 }
 
+// render particle as ellipse/circle with linear brightness falloff and sub-pixel precision
+// size: particle size in sub-pixels, the ellipse radius used for rendering is size + PS_P_RADIUS + 1
+// particleindex: index into particles[] (also used for advPartProps[] / advPartSize[] if asymmetry is set)
+// brightness, color: passed on to calculateEllipseBrightness() and fast_color_scaleAdd() for each rendered pixel
+// wrapX, wrapY: if set, pixels outside of the frame are drawn on the opposite side, otherwise they are skipped
+void WLED_O2_ATTR ParticleSystem2D::renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
+  // particle position with sub-pixel precision
+  int32_t x_subcenter = particles[particleindex].x;
+  int32_t y_subcenter = particles[particleindex].y;
+
+  // example: for x = 128, a particle is exactly between pixel 1 and 2, with a radius of 2 pixels, we draw pixels 0-3
+  // integer center jumps when x = 127 -> pixel 1 goes to x = 128 -> pixel 2
+  // when calculating the dx, we need to take this into account: at x = 128 the x offset is 1, the pixel center is at pixel 2:
+  // for pixel 1, dx = 1 * PS_P_RADIUS - 128 = -64 but the center of the pixel is actually only -32 from the particle center so need to add half a radius:
+  // dx = pixel_x * PS_P_RADIUS - x_subcenter + PS_P_HALFRADIUS
+
+  // integer pixel position, this is rounded down
+  int32_t x_center = (x_subcenter) >> PS_P_RADIUS_SHIFT;
+  int32_t y_center = (y_subcenter) >> PS_P_RADIUS_SHIFT;
+
+  // size in x and y direction (getParticleXYsize() is called on them if asymmetry is set)
+  uint32_t xsize = size;
+  uint32_t ysize = size;
+  if (advPartSize != nullptr && advPartSize[particleindex].asymmetry > 0) {
+    getParticleXYsize(&advPartProps[particleindex], &advPartSize[particleindex], xsize, ysize);
+  }
+
+  int32_t rx_subpixel = xsize + PS_P_RADIUS + 1; // size = 1 means radius of just over 1 pixel, + PS_P_RADIUS (+1 to account for bit-shift loss)
+  int32_t ry_subpixel = ysize + PS_P_RADIUS + 1; // size = 255 is radius of 5, so add 65 -> 65+255=320, 320>>6=5 pixels
+
+  // rendering bounding box in pixels
+  int32_t rx_pixels = (rx_subpixel >> PS_P_RADIUS_SHIFT);
+  int32_t ry_pixels = (ry_subpixel >> PS_P_RADIUS_SHIFT);
+
+  int32_t x_min = x_center - rx_pixels; // note: the "+1" extension needed for 1D is not required for 2D, it is smooth as-is
+  int32_t x_max = x_center + rx_pixels;
+  int32_t y_min = y_center - ry_pixels;
+  int32_t y_max = y_center + ry_pixels;
+
+  // cache for speed
+  uint32_t matrixX = maxXpixel + 1;
+  uint32_t matrixY = maxYpixel + 1;
+  uint32_t rx_sq = rx_subpixel * rx_subpixel;
+  uint32_t ry_sq = ry_subpixel * ry_subpixel;
+
+  // iterate over bounding box and render each pixel
+  for (int32_t py = y_min; py <= y_max; py++) {
+    for (int32_t px = x_min; px <= x_max; px++) {
+      // check bounds and apply wrapping
+      int32_t render_x = px;
+      int32_t render_y = py;
+      if (render_x < 0 || render_x > maxXpixel) {
+        if (!wrapX) continue;
+        // large particles can be wider than the matrix: a single +/- matrixX step could leave the coordinate
+        // out of range (-> out of bounds framebuffer access), so wrap using the remainder instead
+        render_x %= (int32_t)matrixX; // result has the sign of render_x
+        if (render_x < 0) render_x += matrixX;
+      }
+
+      if (render_y < 0 || render_y > maxYpixel) {
+        if (!wrapY) continue;
+        // same as for x: the particle can be taller than the matrix
+        render_y %= (int32_t)matrixY;
+        if (render_y < 0) render_y += matrixY;
+      }
+
+      // distance from particle center, explanation see above
+      int32_t dx_subpixel = (px << PS_P_RADIUS_SHIFT) - x_subcenter + PS_P_HALFRADIUS;
+      int32_t dy_subpixel = (py << PS_P_RADIUS_SHIFT) - y_subcenter + PS_P_HALFRADIUS;
+
+      // calculate brightness based on squared distance to ellipse center
+      uint8_t pixel_brightness = calculateEllipseBrightness(dx_subpixel, dy_subpixel, rx_sq, ry_sq, brightness);
+
+      if (pixel_brightness == 0) continue; // skip black pixels
+
+      // apply inverse gamma correction if needed, if this is skipped, particles flicker due to changing total brightness
+      if (gammaCorrectCol) {
+        pixel_brightness = gamma8inv(pixel_brightness); // invert brightness so brightness distribution is linear after gamma correction
+      }
+
+      // Render pixel
+      uint32_t idx = render_x + (maxYpixel - render_y) * matrixX; // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
+      framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pixel_brightness);
+    }
+  }
+}
+
 // detect collisions in an array of particles and handle them
 // uses binning by dividing the frame into slices in x direction which is efficient if using gravity in y direction (but less efficient for FX that use forces in x direction)
 // for code simplicity, no y slicing is done, making very tall matrix configurations less efficient
 // note: also tested adding y slicing, it gives diminishing returns, some FX even get slower. FX not using gravity would benefit with a 10% FPS improvement
 void ParticleSystem2D::handleCollisions() {
-  if (perParticleSize && advPartProps != nullptr)
-    particleHardRadius = 255; // max radius for collision detection if using per-particle size TODO: could optimize by fetching max size from advPartProps
-
   uint32_t collDistSq = particleHardRadius << 1; // distance is double the radius note: particleHardRadius is updated when setting global particle size
   collDistSq = collDistSq * collDistSq; // square it for faster comparison (square is one operation)
   // note: partices are binned in x-axis, assumption is that no more than half of the particles are in the same bin
   // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
-  constexpr int BIN_WIDTH = 6 * PS_P_RADIUS; // width of a bin in sub-pixels
+  int binWidth = 6 * PS_P_RADIUS; // width of a bin in sub-pixels
   int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
+  if (perParticleSize && advPartProps != nullptr)
+    overlap = 512; // max overlap for collision detection if using per-particle size, enough to catch all particles even at max speed
+
   uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 2); // assume no more than half of the particles are in the same bin, do not bin small amounts of particles
-  uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // number of bins in x direction
+  uint32_t numBins = (maxX + (binWidth - 1)) / binWidth; // number of bins in x direction
+  if (usedParticles < maxBinParticles) {
+    numBins = 1; // use single bin for small number of particles
+    binWidth = maxX + 1;
+  }
   uint16_t binIndices[maxBinParticles]; // creat array on stack for indices, 2kB max for 1024 particles (ESP32_MAXPARTICLES/2)
   uint32_t binParticleCount; // number of particles in the current bin
-  uint16_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
+  uint32_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
   uint32_t pidx = collisionStartIdx; //start index in case a bin is full, process remaining particles next frame
 
   // fill the binIndices array for this bin
   for (uint32_t bin = 0; bin < numBins; bin++) {
     binParticleCount = 0; // reset for this bin
-    int32_t binStart = bin * BIN_WIDTH - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
-    int32_t binEnd = binStart + BIN_WIDTH + overlap; // note: last bin can be out of bounds, see above;
+    int32_t binStart = bin * binWidth - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
+    int32_t binEnd = binStart + binWidth + overlap; // note: last bin can be out of bounds, see above;
 
     // fill the binIndices array for this bin
     for (uint32_t i = 0; i < usedParticles; i++) {
@@ -849,8 +853,8 @@ void ParticleSystem2D::handleCollisions() {
       if (pidx >= usedParticles) pidx = 0; // wrap around
     }
 
-    uint32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
-    uint32_t massratio2 = 0;
+    int32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
+    int32_t massratio2 = 0; // TODO: if implementing "fixed" particles, set to 1 (fixed) and 255 (movable)
     for (uint32_t i = 0; i < binParticleCount; i++) { // go though all 'higher number' particles in this bin and see if any of those are in close proximity and if they are, make them collide
       uint32_t idx_i = binIndices[i];
       for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
@@ -859,12 +863,15 @@ void ParticleSystem2D::handleCollisions() {
           collDistSq = (PS_P_MINHARDRADIUS << 1) + ((((uint32_t)advPartProps[idx_i].size + (uint32_t)advPartProps[idx_j].size) * 52) >> 6); // collision distance, use 80% of size for tighter stacking (slight overlap)
           collDistSq = collDistSq * collDistSq; // square it for faster comparison
           // calculate mass ratio for collision response
-          uint32_t mass1 = 1 + ((uint32_t)advPartProps[idx_i].size * advPartProps[idx_i].size); // +1 to avoid division by zero
-          uint32_t mass2 = ((uint32_t)advPartProps[idx_j].size * advPartProps[idx_j].size);
+          uint32_t mass1 = PS_P_RADIUS + advPartProps[idx_i].size;
+          uint32_t mass2 = PS_P_RADIUS + advPartProps[idx_j].size;
+          mass1 = mass1 * mass1; // mass proportional to area
+          mass2 = mass2 * mass2;
           uint32_t totalmass = mass1 + mass2;
           massratio1 = (mass2 << 8) / totalmass; // massratio 1 depends on mass of particle 2, i.e. if 2 is heavier -> higher velocity impact on 1
           massratio2 = (mass1 << 8) / totalmass;
         }
+        // note: using the same logic as in 1D is much slower though it would be more accurate but it is not really needed in 2D
         int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance with lookahead
         if (dx * dx < collDistSq) { // check x direction, if close, check y direction (squaring is faster than abs() or dual compare)
           int32_t dy = (particles[idx_j].y + particles[idx_j].vy)  - (particles[idx_i].y + particles[idx_i].vy); // distance with lookahead
@@ -879,7 +886,7 @@ void ParticleSystem2D::handleCollisions() {
 
 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, uint32_t massratio1, uint32_t massratio2) {
+void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, int32_t massratio1, int32_t massratio2) {
   int32_t distanceSquared = dx * dx + dy * dy;
   // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
   int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
@@ -909,24 +916,29 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
   if (dotProduct < 0) {// particles are moving towards each other
     // integer math is much faster than using floats (float divisions are slow on all ESPs)
     // overflow check: dx/dy are 7bit, relativV are 8bit -> dotproduct is 15bit, dotproduct/distsquared ist 8b, multiplied by collisionhardness of 8bit. so a 16bit shift is ok, make it 15 to be sure no overflows happen
-    // note: cannot use right shifts as bit shifting in right direction is asymmetrical for positive and negative numbers and this needs to be accurate! the trick is: only shift positive numers
+    // note: cannot use right shifts as bit shifting in right direction is asymmetrical (1>>1=0 / -1>>1=-1) and this needs to be accurate! the trick is: only shift positive numbers
     // Calculate new velocities after collision
     int32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS >> 1); // if particles are soft, the impulse must stay above a limit or collisions slip through at higher speeds, 170 seems to be a good value
     int32_t impulse = (((((-dotProduct) << 15) / distanceSquared) * surfacehardness) >> 8); // note: inverting before bitshift corrects for asymmetry in right-shifts (is slightly faster)
 
     #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-    int32_t ximpulse = (impulse * dx + ((dx >> 31) & 32767)) >> 15; // note: extracting sign bit and adding rounding value to correct for asymmetry in right shifts
-    int32_t yimpulse = (impulse * dy + ((dy >> 31) & 32767)) >> 15;
+    int32_t ximpulse = (impulse * dx + ((dx >> 31) & 0x7FFF)) >> 15; // note: extracting sign bit and adding rounding value to correct for asymmetry in right shifts
+    int32_t yimpulse = (impulse * dy + ((dy >> 31) & 0x7FFF)) >> 15;
     #else
     int32_t ximpulse = (impulse * dx) / 32767;
     int32_t yimpulse = (impulse * dy) / 32767;
     #endif
     // if particles are not the same size, use a mass ratio. mass ratio is set to 0 if particles are the same size
     if (massratio1) {
-      particle1.vx -= (ximpulse * massratio1) >> 7; // mass ratio is in fixed point 8bit, multiply by two to account for the fact that we distribute the impulse to both particles
-      particle1.vy -= (yimpulse * massratio1) >> 7;
-      particle2.vx += (ximpulse * massratio2) >> 7;
-      particle2.vy += (yimpulse * massratio2) >> 7;
+      int32_t vx1 = (int32_t)particle1.vx - ((ximpulse * massratio1) >> 7); // mass ratio is in fixed point 8bit, multiply by two to account for the fact that we distribute the impulse to both particles
+      int32_t vy1 = (int32_t)particle1.vy - ((yimpulse * massratio1) >> 7);
+      int32_t vx2 = (int32_t)particle2.vx + ((ximpulse * massratio2) >> 7);
+      int32_t vy2 = (int32_t)particle2.vy + ((yimpulse * massratio2) >> 7);
+      // limit speeds to max speed (required if a lot of impulse is transferred from a large to a small particle)
+      particle1.vx = limitSpeed(vx1);
+      particle1.vy = limitSpeed(vy1);
+      particle2.vx = limitSpeed(vx2);
+      particle2.vy = limitSpeed(vy2);
     }
     else {
       particle1.vx -= ximpulse; // note: impulse is inverted, so subtracting it
@@ -951,11 +963,11 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
     }
 
     // particles have volume, push particles apart if they are too close
-    // tried lots of configurations, it works best if not moved but given a little velocity, it tends to oscillate less this way
+    // tried lots of configurations, it works best if given a little velocity, it tends to oscillate less this way
     // when hard pushing by offsetting position, they sink into each other under gravity
     // a problem with giving velocity is, that on harder collisions, this adds up as it is not dampened enough, so add friction in the FX if required
     if (distanceSquared < collDistSq && dotProduct > -250) { // too close and also slow, push them apart
-      int32_t notsorandom = dotProduct & 0x01; //dotprouct LSB should be somewhat random, so no need to calculate a random number
+      bool fairlyrandom = dotProduct & 0x01; //dotProduct LSB should be somewhat random, so no need to calculate a random number
       int32_t pushamount = 1 + ((250 + dotProduct) >> 6); // the closer dotproduct is to zero, the closer the particles are
       int32_t push = 0;
       if (dx < 0)  // particle 1 is on the right
@@ -963,7 +975,7 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
       else if (dx > 0)
         push = -pushamount;
       else { // on the same x coordinate, shift it a little so they do not stack
-        if (notsorandom)
+        if (fairlyrandom)
           particle1.x++; // move it so pile collapses
         else
           particle1.x--;
@@ -975,7 +987,7 @@ void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSpa
       else if (dy > 0)
         push = -pushamount;
       else { // dy==0
-        if (notsorandom)
+        if (fairlyrandom)
           particle1.y++; // move it so pile collapses
         else
           particle1.y--;
@@ -1037,56 +1049,6 @@ void ParticleSystem2D::updatePSpointers(bool isadvanced, bool sizecontrol) {
 
 }
 
-// blur a matrix in x and y direction, blur can be asymmetric in x and y
-// for speed, 1D array and 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
-// to blur a subset of the buffer, change the xsize/ysize and set xstart/ystart to the desired starting coordinates (default start is 0/0)
-// subset blurring only works on 10x10 buffer (single particle rendering), if other sizes are needed, buffer width must be passed as parameter
-void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur, uint32_t yblur, uint32_t xstart, uint32_t ystart, bool isparticle) {
-  CRGBW seeppart, carryover;
-  uint32_t seep = xblur >> 1;
-  uint32_t width = xsize; // width of the buffer, used to calculate the index of the pixel
-
-  if (isparticle) { //first and last row are always black in first pass of particle rendering
-    ystart++;
-    ysize--;
-    width = 10; // buffer size is 10x10
-  }
-
-  for (uint32_t y = ystart; y < ystart + ysize; y++) {
-    carryover =  BLACK;
-    uint32_t indexXY = xstart + y * width;
-    for (uint32_t x = xstart; x < xstart + xsize; x++) {
-      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
-      if (x > 0) {
-        colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], seeppart);
-        colorbuffer[indexXY]     = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
-      }
-      carryover = seeppart;
-      indexXY++; // next pixel in x direction
-    }
-  }
-
-  if (isparticle) { // first and last row are now smeared
-    ystart--;
-    ysize++;
-  }
-
-  seep = yblur >> 1;
-  for (uint32_t x = xstart; x < xstart + xsize; x++) {
-    carryover = BLACK;
-    uint32_t indexXY = x + ystart * width;
-    for (uint32_t y = ystart; y < ystart + ysize; y++) {
-      seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
-      if (y > 0) {
-        colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart);
-        colorbuffer[indexXY]         = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
-      }
-      carryover = seeppart;
-      indexXY += width; // next pixel in y direction
-    }
-  }
-}
-
 //non class functions to use for initialization
 uint32_t calculateNumberOfParticles2D(uint32_t const pixels, const bool isadvanced, const bool sizecontrol) {
   uint32_t numberofParticles = pixels;  // 1 particle per pixel (for example 512 particles on 32x16)
@@ -1142,7 +1104,7 @@ bool initParticleSystem2D(ParticleSystem2D *&PartSys, uint32_t requestedsources,
   PSPRINTLN(" request numparticles:" + String(numparticles));
   uint32_t numsources = calculateNumberOfSources2D(pixels, requestedsources);
   bool allocsuccess = false;
-  while(numparticles >= 4) { // make sure we have at least 4 particles or quit
+  while(numparticles >= 5) { // make sure we have at least 5 particles or quit
     if (allocateParticleSystemMemory2D(numparticles, numsources, advanced, sizecontrol, additionalbytes)) {
       PSPRINTLN(F("PS 2D alloc succeeded"));
       allocsuccess = true;
@@ -1205,8 +1167,11 @@ void ParticleSystem1D::update(void) {
     applyGravity();
 
   // handle collisions (can push particles, must be done before updating particles or they can render out of bounds, causing a crash if using local buffer for speed)
-  if (particlesettings.useCollisions)
+  if (particlesettings.useCollisions) {
     handleCollisions();
+    if (perParticleSize)
+      handleCollisions(); // second pass for per particle size (as impulse transfer can recoil at high speed, this improves "slip through" issues for small particles but is expensive)
+  }
 
   //move all particles
   for (uint32_t i = 0; i < usedParticles; i++) {
@@ -1214,7 +1179,7 @@ void ParticleSystem1D::update(void) {
   }
 
   if (particlesettings.colorByPosition) {
-    uint32_t scale = (255 << 16) / maxX;  // speed improvement: multiplication is faster than division
+    uint32_t scale = (255 << 16) / maxX;
     for (uint32_t i = 0; i < usedParticles; i++) {
       particles[i].hue = (scale * particles[i].x) >> 16; // note: x is > 0 if not out of bounds
     }
@@ -1225,7 +1190,7 @@ void ParticleSystem1D::update(void) {
 
 // set percentage of used particles as uint8_t i.e 127 means 50% for example
 void ParticleSystem1D::setUsedParticles(const uint8_t percentage) {
-  usedParticles = (numParticles * ((int)percentage+1)) >> 8; // number of particles to use (percentage is 0-255, 255 = 100%)
+  usedParticles =  max((uint32_t)1, (numParticles * ((int)percentage+1)) >> 8); // number of particles to use (percentage is 0-255, 255 = 100%)
   PSPRINT(" SetUsedpaticles: allocated particles: ");
   PSPRINT(numParticles);
   PSPRINT(" ,used particles: ");
@@ -1269,10 +1234,16 @@ void ParticleSystem1D::setSmearBlur(const uint8_t bluramount) {
   smearBlur = bluramount;
 }
 
-// render size, 0 = 1 pixel, 1 = 2 pixel (interpolated), bigger sizes require adanced properties
+// render size, 0 = 1 pixel, 1 = 2 pixel (interpolated), 255 = 18 pixel diameter
 void ParticleSystem1D::setParticleSize(const uint8_t size) {
-  particlesize = size > 0 ? 1 : 0; // TODO: add support for global sizes? see note above (motion blur)
-  particleHardRadius = PS_P_MINHARDRADIUS_1D >> (!particlesize); // 2 pixel sized particles or single pixel sized particles
+  particlesize = size;
+  particleHardRadius = PS_P_MINHARDRADIUS_1D; // ~1 pixel
+  perParticleSize = false; // disable per particle size control if global size is set
+  if (particlesize > 1) {
+    particleHardRadius = PS_P_MINHARDRADIUS_1D + ((particlesize * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with borders so they do not "float" and nicer stacking)
+  }
+  else if (particlesize == 0)
+    particleHardRadius = particleHardRadius >> 1; // single pixel particles have half the radius (i.e. 1/2 pixel)
 }
 
 // enable/disable gravity, optionally, set the force (force=8 is default) can be -127 to +127, 0 is disable
@@ -1328,16 +1299,16 @@ void ParticleSystem1D::particleMoveUpdate(PSparticle1D &part, PSparticleFlags1D
     if (options->colorByAge)
       part.hue = min(part.ttl, (uint16_t)255); // set color to ttl
 
-    int32_t renderradius = PS_P_HALFRADIUS_1D; // used to check out of bounds, default for 2 pixel rendering
+    int32_t renderradius = PS_P_HALFRADIUS_1D - 1 + particlesize; // used to check out of bounds, default for 2 pixel rendering
     int32_t newX = part.x + (int32_t)part.vx;
     partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view)
 
-    if (advancedproperties) { // using individual particle size?
+    if (perParticleSize && advancedproperties != nullptr) { // using individual particle size?
+      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size; // note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
       if (advancedproperties->size > 1)
-        particleHardRadius = PS_P_MINHARDRADIUS_1D + (advancedproperties->size >> 1);
+        particleHardRadius = PS_P_MINHARDRADIUS_1D + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with borders so they do not "float" and nicer stacking)
       else // single pixel particles use half the collision distance for walls
         particleHardRadius = PS_P_MINHARDRADIUS_1D >> 1;
-      renderradius = particleHardRadius; // note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
     }
 
     // if wall collisions are enabled, bounce them before they reach the edge, it looks much nicer if the particle is not half out of view
@@ -1493,7 +1464,7 @@ void ParticleSystem1D::render() {
   }
   // apply smear-blur to rendered frame
   if (smearBlur) {
-    blur1D(framebuffer, maxXpixel + 1, smearBlur, 0);
+    SEGMENT.blur(smearBlur, true);
   }
 
   // add background color
@@ -1517,8 +1488,8 @@ void ParticleSystem1D::render() {
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
 void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
   uint32_t size = particlesize;
-  if (advPartProps != nullptr) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
-    size = advPartProps[particleindex].size;
+  if (perParticleSize && advPartProps != nullptr) // use advanced size properties
+    size = 1 + advPartProps[particleindex].size; // add 1 to avoid single pixel size particles (collisions do not support it)
 
   if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
     uint32_t x =  particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
@@ -1528,6 +1499,12 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
     return;
   }
   //render larger particles
+  if (size > 1) { // size > 1: render as gradient line
+    renderLargeParticle(size, particleindex, brightness, color, wrap); // larger size rendering
+    return;
+  }
+
+  // standard rendering (2 pixels per particle)
   bool pxlisinframe[2] = {true, true};
   int32_t pxlbrightness[2];
   int32_t pixco[2]; // physical pixel coordinates of the two pixels representing a particle
@@ -1548,99 +1525,110 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
   // adjust brightness such that distribution is linear after gamma correction:
   // - scale brigthness with gamma correction (done in render())
   // - apply inverse gamma correction to brightness values
-  // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total
+  // - gamma is applied again in show() -> the resulting brightness distribution is linear but gamma corrected in total -> fixes brightness fluctuations
   if (gammaCorrectCol) {
     pxlbrightness[0] = gamma8inv(pxlbrightness[0]); // use look-up-table for invers gamma
     pxlbrightness[1] = gamma8inv(pxlbrightness[1]);
   }
-  // check if particle has advanced size properties and buffer is available
-  if (advPartProps != nullptr && advPartProps[particleindex].size > 1) {
-    uint32_t renderbuffer[10]; // 10 pixel buffer
-    memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
-    //render particle to a bigger size
-    //particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
-    //first, render the pixel to the center of the renderbuffer, then apply 1D blurring
-    renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]);
-    renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]);
-    uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4
-    uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
-    uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
-    uint32_t bitshift = 0;
-    for (uint32_t i = 0; i < blurpasses; i++) {
-      if (i == 2) //for the last two passes, use higher amount of blur (results in a nicer brightness gradient with soft edges)
-        bitshift = 1;
-      rendersize += 2;
-      offset--;
-      blur1D(renderbuffer, rendersize, size << bitshift, offset);
-      size = size > 64 ? size - 64 : 0;
-    }
 
-    // calculate origin coordinates to render the particle to in the framebuffer
-    uint32_t xfb_orig = x - (rendersize>>1) + 1 - offset; //note: using uint is fine
-    uint32_t xfb; // coordinates in frame buffer to write to note: by making this uint, only overflow has to be checked
-
-    // transfer particle renderbuffer to framebuffer
-    for (uint32_t xrb = offset; xrb < rendersize+offset; xrb++) {
-      xfb = xfb_orig + xrb;
-      if (xfb > (uint32_t)maxXpixel) {
-        if (wrap) { // wrap x to the other side if required
-          if (xfb > (uint32_t)maxXpixel << 1) // xfb is "negative"
-            xfb = (maxXpixel + 1) + (int32_t)xfb; // this always overflows to within bounds
-          else
-            xfb = xfb % (maxXpixel + 1); // note: without the above "negative" check, this works only for powers of 2
-        }
-        else
-          continue;
-      }
-      #ifdef ESP8266 // no local buffer on ESP8266
-      SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
-      #else
-      framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]);
-      #endif
+  // check if any pixels are out of frame
+  if (pixco[0] < 0) { // left pixels out of frame
+    if (wrap) // wrap x to the other side if required
+      pixco[0] = maxXpixel;
+    else {
+      pxlisinframe[0] = false; // pixel is out of matrix boundaries, do not render
+      if (pixco[0] < -1)
+        return; // both pixels out of frame (safety check)
     }
   }
-  else { // standard rendering (2 pixels per particle)
-    // check if any pixels are out of frame
-    if (x < 0) { // left pixels out of frame
-      if (wrap) // wrap x to the other side if required
-        pixco[0] = maxXpixel;
-      else
-        pxlisinframe[0] = false; // pixel is out of matrix boundaries, do not render
-    }
-    else if (pixco[1] > (int32_t)maxXpixel) { // right pixel, only has to be checkt if left pixel did not overflow
-      if (wrap) // wrap y to the other side if required
-        pixco[1] = 0;
-      else
-        pxlisinframe[1] = false;
+  else if (pixco[1] > (int32_t)maxXpixel) { // right pixel, only has to be checked if left pixel did not overflow
+    if (wrap) // wrap x to the other side if required
+      pixco[1] = 0;
+    else {
+      pxlisinframe[1] = false;
+      if (pixco[0] > (int32_t)maxXpixel)
+        return; // both pixels out of frame (safety check)
     }
-    for (uint32_t i = 0; i < 2; i++) {
-      if (pxlisinframe[i]) {
-        framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
-      }
+  }
+  for (uint32_t i = 0; i < 2; i++) {
+    if (pxlisinframe[i]) {
+      framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
     }
   }
+}
+
+// render particle as a line whose brightness falls off with the squared distance from its center (sub-pixel precision), size is in 0-255 (1-9 pixel radius)
+void WLED_O2_ATTR ParticleSystem1D::renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrap) {
+  int32_t x_subcenter = particles[particleindex].x; // particle position in sub-pixel space
+
+  // sub-pixel offset (0-31)
+  int32_t x_offset = x_subcenter & (PS_P_RADIUS_1D - 1); // same as modulo PS_P_RADIUS but faster
+  int32_t x_center = x_subcenter >> PS_P_RADIUS_SHIFT_1D; // integer pixel position, this is rounded down
+
+  // particle radius in pixels, size = 1 means radius of just over 1 pixel
+  int32_t r_subpixel = size + PS_P_RADIUS_1D + 1; // size = 255 is radius of 9, so add 33 -> 33+255=288, 288>>5=9 pixels (i.e. the +1 is needed to correct for bitshift losses)
+  // rendering bounding box in pixels
+  int32_t r_pixels = r_subpixel >> PS_P_RADIUS_SHIFT_1D;
 
+  int32_t x_min = x_center - r_pixels - 1; // extend by one for much smoother movement
+  int32_t x_max = x_center + r_pixels + 1;
+
+  // cache for speed
+  uint32_t matrixX = maxXpixel + 1;
+
+  // iterate over bounding box and render each pixel
+  for (int32_t px = x_min; px <= x_max; px++) {
+    // Check bounds and apply wrapping
+    int32_t render_x = px;
+    if (render_x < 0) {
+      if (!wrap) continue; // skip out of frame pixels
+      render_x += matrixX;
+    } else if (render_x > maxXpixel) {
+      if (!wrap) continue;
+      render_x -= matrixX;
+    }
+    // squared distance from particle center
+    int32_t dx_sq = ((px << PS_P_RADIUS_SHIFT_1D) - x_subcenter + PS_P_HALFRADIUS_1D); // explanation see 2D version
+    dx_sq = dx_sq * dx_sq;
+    int32_t rx_sq = r_subpixel * r_subpixel;
+    uint32_t dist_sq = (dx_sq << 8) / rx_sq; // normalized squared distance in fixed point (0-256)
+
+    // calculate brightness from the normalized squared distance to the particle center: full brightness at the center, fading to zero at the radius
+    uint8_t pixel_brightness = dist_sq >= 256 ? 0 : ((256 - dist_sq) * brightness) >> 8;
+    //if (pixel_brightness == 0) continue; // skip black pixels note: very few pixels will be black, skipping this is usually faster
+
+    // Render pixel
+    framebuffer[render_x] = fast_color_scaleAdd(framebuffer[render_x], color, pixel_brightness);
+  }
 }
 
 // detect collisions in an array of particles and handle them
 void ParticleSystem1D::handleCollisions() {
-  uint32_t collisiondistance = particleHardRadius << 1;
+  uint32_t collisiondistance = particleHardRadius << 1; // twice the radius is min distance between colliding particles
+  uint32_t checkDistSq = max(2 * PS_P_MAXSPEED, (int)collisiondistance);
+  if (perParticleSize && advPartProps != nullptr) // using individual particle size
+    checkDistSq = max(2 * PS_P_MAXSPEED, (512 * 52) >> 6); // max possible collision distance that catches all collisions
+  checkDistSq = checkDistSq * checkDistSq; // square it for distance comparison (faster than abs() )
   // note: partices are binned by position, assumption is that no more than half of the particles are in the same bin
   // if they are, collisionStartIdx is increased so each particle collides at least every second frame (which still gives decent collisions)
-  constexpr int BIN_WIDTH = 32 * PS_P_RADIUS_1D; // width of each bin, a compromise between speed and accuracy (larger bins are faster but collapse more)
-  int32_t overlap = particleHardRadius << 1; // overlap bins to include edge particles to neighbouring bins
-  if (advPartProps != nullptr) //may be using individual particle size
-    overlap += 256; // add 2 * max radius (approximately)
+  int binWidth = 64 * PS_P_RADIUS_1D; // width of each bin, a compromise between speed and accuracy
+  int32_t overlap = collisiondistance + (2 * PS_P_MAXSPEED); // overlap bins to include edge particles to neighbouring bins (+ look-ahead of speed)
+  if (perParticleSize && advPartProps != nullptr) //may be using individual particle size
+    overlap = 512; // 2 * max radius, enough to catch all collisions even at full speed
   uint32_t maxBinParticles = max((uint32_t)50, (usedParticles + 1) / 4); // do not bin small amounts, limit max to 1/4 of particles
-  uint32_t numBins = (maxX + (BIN_WIDTH - 1)) / BIN_WIDTH; // calculate number of bins
+  uint32_t numBins = (maxX + (binWidth - 1)) / binWidth; // calculate number of bins
+  if (usedParticles < maxBinParticles) {
+    numBins = 1; // use single bin for small number of particles
+    binWidth = maxX + 1;
+  }
   uint16_t binIndices[maxBinParticles]; // array to store indices of particles in a bin
   uint32_t binParticleCount; // number of particles in the current bin
-  uint16_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
+  uint32_t nextFrameStartIdx = hw_random16(usedParticles); // index of the first particle in the next frame (set to fixed value if bin overflow)
   uint32_t pidx = collisionStartIdx; //start index in case a bin is full, process remaining particles next frame
   for (uint32_t bin = 0; bin < numBins; bin++) {
     binParticleCount = 0; // reset for this bin
-    int32_t binStart = bin * BIN_WIDTH - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
-    int32_t binEnd = binStart + BIN_WIDTH + overlap; // note: last bin can be out of bounds, see above
+    int32_t binStart = bin * binWidth - overlap; // note: first bin will extend to negative, but that is ok as out of bounds particles are ignored
+    int32_t binEnd = binStart + binWidth + overlap; // note: last bin can be out of bounds, see above
 
     // fill the binIndices array for this bin
     for (uint32_t i = 0; i < usedParticles; i++) {
@@ -1663,87 +1651,104 @@ void ParticleSystem1D::handleCollisions() {
       uint32_t idx_i = binIndices[i];
       for (uint32_t j = i + 1; j < binParticleCount; j++) { // check against higher number particles
         uint32_t idx_j = binIndices[j];
-        if (advPartProps != nullptr) { // use advanced size properties
-          collisiondistance = (PS_P_MINHARDRADIUS_1D << particlesize) + ((advPartProps[idx_i].size + advPartProps[idx_j].size) >> 1);
-        }
-        int32_t dx = (particles[idx_j].x + particles[idx_j].vx) - (particles[idx_i].x + particles[idx_i].vx); // distance between particles with lookahead
-        uint32_t dx_abs = abs(dx);
-        if (dx_abs <= collisiondistance) { // collide if close
-          collideParticles(particles[idx_i], particleFlags[idx_i], particles[idx_j], particleFlags[idx_j], dx, dx_abs, collisiondistance);
+        int32_t dx = particles[idx_j].x - particles[idx_i].x; // distance between particles
+        uint32_t dx_sq = dx * dx; // square distance (faster than abs() and works the same)
+        if (dx_sq <= checkDistSq) { // possible collision imminent, check properly note: this is slower than using direct speed look-ahead (like in 2D) but more accurate and fast enough for 1D
+          collideParticles(idx_i, idx_j, dx, collisiondistance); // handle the collision
         }
       }
     }
   }
   collisionStartIdx = nextFrameStartIdx; // set the start index for the next frame
 }
-// handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
-// takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
-  int32_t dv = particle2.vx - particle1.vx;
+// handle a collision if close proximity is detected, i.e. dx smaller than 2*radius + speed look-ahead
+void WLED_O2_ATTR ParticleSystem1D::collideParticles(uint32_t partIdx1, uint32_t partIdx2, int32_t dx, uint32_t collisiondistance) {
+  int32_t massratio1 = 0; // 0 means dont use mass ratio (equal mass)
+  int32_t massratio2 = 0;
+  if (perParticleSize && advPartProps != nullptr) { // use advanced size properties, calculate collision distance and mass ratio
+    collisiondistance = (PS_P_MINHARDRADIUS_1D * 2) + ((((uint32_t)advPartProps[partIdx1].size + (uint32_t)advPartProps[partIdx2].size) * 52) >> 6); // collision distance, use 80% of size for tighter stacking (slight overlap)
+    // calculate mass ratio for collision response
+    uint32_t mass1 = PS_P_RADIUS_1D + advPartProps[partIdx1].size;
+    uint32_t mass2 = PS_P_RADIUS_1D + advPartProps[partIdx2].size;
+    uint32_t totalmass = mass1 + mass2 - 2; // -2 to account for rounding
+    massratio1 = (mass2 << 8) / totalmass; // massratio 1 depends on mass of particle 2, i.e. if 2 is heavier -> higher velocity impact on 1
+    massratio2 = (mass1 << 8) / totalmass;
+  }
+  int32_t dv = (int)particles[partIdx2].vx - (int)particles[partIdx1].vx;
+  int32_t absdv = abs(dv);
   int32_t dotProduct = (dx * dv); // is always negative if moving towards each other
+  uint32_t dx_abs = abs(dx);
 
   if (dotProduct < 0) { // particles are moving towards each other
-    uint32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS_1D); // if particles are soft, the impulse must stay above a limit or collisions slip through
-    // Calculate new velocities after collision  note: not using dot product like in 2D as impulse is purely speed depnedent
-    #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-    int32_t impulse = ((dv * surfacehardness) + ((dv >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
-    #else // division is faster on ESP32, S2 and S3
-    int32_t impulse = (dv * surfacehardness) / 255;
-    #endif
-    particle1.vx += impulse;
-    particle2.vx -= impulse;
-
-    // if one of the particles is fixed, transfer the impulse back so it bounces
-    if (particle1flags.fixed)
-      particle2.vx = -particle1.vx;
-    else if (particle2flags.fixed)
-      particle1.vx = -particle2.vx;
-
-    if (collisionHardness < PS_P_MINSURFACEHARDNESS_1D && (SEGMENT.call & 0x07) == 0) { // if particles are soft, they become 'sticky' i.e. apply some friction
-      const uint32_t coeff = collisionHardness + (250 - PS_P_MINSURFACEHARDNESS_1D);
+    uint32_t lookaheadDistance = collisiondistance + absdv; // add look-ahead: if reaching collisiondistance in this frame, collide
+    if (dx_abs <= lookaheadDistance) {
+      // if one of the particles is fixed, invert the other particle's velocity and multiply by hardness, also set its position to the edge of the fixed particle
+      if (particleFlags[partIdx1].fixed) {
+        particles[partIdx2].vx = -(particles[partIdx2].vx * collisionHardness) / 255;
+        particles[partIdx2].x = particles[partIdx1].x + (dx < 0 ? -collisiondistance : collisiondistance); // dx < 0 means particle2.x < particle1.x
+        return;
+      }
+      else if (particleFlags[partIdx2].fixed) {
+        particles[partIdx1].vx = -(particles[partIdx1].vx * collisionHardness) / 255;
+        particles[partIdx1].x = particles[partIdx2].x + (dx < 0 ? collisiondistance : -collisiondistance);
+        return;
+      }
+      int32_t surfacehardness = max(collisionHardness, (int32_t)PS_P_MINSURFACEHARDNESS_1D); // if particles are soft, the impulse must stay above a limit or collisions slip through
+      // Calculate new velocities after collision  note: not using dot product like in 2D as impulse is purely speed dependent
       #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
-      particle1.vx = ((int32_t)particle1.vx * coeff + (((int32_t)particle1.vx >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
-      particle2.vx = ((int32_t)particle2.vx * coeff + (((int32_t)particle2.vx >> 31) & 0xFF)) >> 8;
+      int32_t impulse = (dv * surfacehardness + ((dv >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
       #else // division is faster on ESP32, S2 and S3
-      particle1.vx = ((int32_t)particle1.vx * coeff) / 255;
-      particle2.vx = ((int32_t)particle2.vx * coeff) / 255;
+      int32_t impulse = (dv * surfacehardness) / 255;
       #endif
+
+      // if particles are not the same size, use a mass ratio. mass ratio is set to 0 if particles are the same size
+      if (massratio1) {
+        int vx1 = (int)particles[partIdx1].vx + ((impulse * massratio1) >> 7); // mass ratio is in fixed point 8bit
+        int vx2 = (int)particles[partIdx2].vx - ((impulse * massratio2) >> 7);
+        // limit speeds to max speed (required as a lot of impulse can be transferred from a large to a small particle)
+        particles[partIdx1].vx = limitSpeed(vx1);
+        particles[partIdx2].vx = limitSpeed(vx2);
+      }
+      else {
+        particles[partIdx1].vx += impulse;
+        particles[partIdx2].vx -= impulse;
+      }
+
+      if (collisionHardness < PS_P_MINSURFACEHARDNESS_1D && (SEGMENT.call & 0x07) == 0) { // if particles are soft, they become 'sticky' i.e. apply some friction
+        const uint32_t coeff = collisionHardness + (250 - PS_P_MINSURFACEHARDNESS_1D);
+        #if defined(CONFIG_IDF_TARGET_ESP32C3) || defined(ESP8266) // use bitshifts with rounding instead of division (2x faster)
+        particles[partIdx1].vx = ((int32_t)particles[partIdx1].vx * coeff + (((int32_t)particles[partIdx1].vx >> 31) & 0xFF)) >> 8; // note: (v>>31) & 0xFF)) extracts the sign and adds 255 if negative for correct rounding using shifts
+        particles[partIdx2].vx = ((int32_t)particles[partIdx2].vx * coeff + (((int32_t)particles[partIdx2].vx >> 31) & 0xFF)) >> 8;
+        #else // division is faster on ESP32, S2 and S3
+        particles[partIdx1].vx = ((int32_t)particles[partIdx1].vx * coeff) / 255;
+        particles[partIdx2].vx = ((int32_t)particles[partIdx2].vx * coeff) / 255;
+        #endif
+      }
+    } else {
+      return; // not close enough yet
     }
   }
+  // particles have volume, push particles apart if they are too close
+  // note: like in 2D, pushing by a distance makes softer piles collapse, giving particles speed prevents that and looks nicer
 
-  if (dx_abs < (collisiondistance - 8) && abs(dv) < 5) { // overlapping and moving slowly
-    // particles have volume, push particles apart if they are too close
-    // behaviour is different than in 2D, we need pixel accurate stacking here, push the top particle
-    // note: like in 2D, pushing by a distance makes softer piles collapse, giving particles speed prevents that and looks nicer
-    int32_t pushamount = 1;
-    if (dx < 0)  // particle2.x < particle1.x
+  if (dx_abs < collisiondistance) { // too close, force push particles so they dont collapse
+    int32_t pushamount = 1 + ((collisiondistance - dx_abs) >> 3); // push by eighth of deviation (plus 1 to push at least a little), note: pushing too much leads to pass-throughs and more flickering
+    int32_t addspeed = 1;
+    if (dx < 0) {  // particle2.x < particle1.x
       pushamount = -pushamount;
-    particle1.vx -= pushamount;
-    particle2.vx += pushamount;
-
-    if (dx_abs < collisiondistance >> 1) { // too close, force push particles so they dont collapse
-      pushamount = 1 + ((collisiondistance - dx_abs) >> 3); // note: push amount found by experimentation
-
-      if (particle1.x < (maxX >> 1)) { // lower half, push particle with larger x in positive direction
-        if (dx < 0 && !particle1flags.fixed) {  // particle2.x < particle1.x  -> push particle 1
-          particle1.vx++;// += pushamount;
-          particle1.x += pushamount;
-        }
-        else if (!particle2flags.fixed) { // particle1.x < particle2.x  -> push particle 2
-          particle2.vx++;// += pushamount;
-          particle2.x += pushamount;
-        }
-      }
-      else { // upper half, push particle with smaller x
-        if (dx < 0 && !particle2flags.fixed) {  // particle2.x < particle1.x  -> push particle 2
-          particle2.vx--;// -= pushamount;
-          particle2.x -= pushamount;
-        }
-        else if (!particle1flags.fixed) { // particle1.x < particle2.x  -> push particle 1
-          particle1.vx--;// -= pushamount;
-          particle1.x -= pushamount;
-        }
-      }
+      addspeed = -addspeed;
+    }
+    if (absdv < 4) { // low relative speed, add speed to help with the pushing (less collapsing piles)
+      particles[partIdx1].vx -= addspeed;
+      particles[partIdx2].vx += addspeed;
+    }
+    // push only one particle to avoid oscillations
+    bool fairlyrandom = dotProduct & 0x01;
+    if (fairlyrandom) {
+      particles[partIdx1].x -= pushamount;
+    }
+    else {
+      particles[partIdx2].x += pushamount;
     }
   }
 }
@@ -1855,24 +1860,6 @@ bool initParticleSystem1D(ParticleSystem1D *&PartSys, const uint32_t requestedso
   PartSys = new (SEGENV.data) ParticleSystem1D(SEGMENT.virtualLength(), numparticles, numsources, advanced); // particle system constructor
   return true;
 }
-
-// blur a 1D buffer, sub-size blurring can be done using start and size
-// for speed, 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
-// to blur a subset of the buffer, change the size and set start to the desired starting coordinates
-void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
-{
-  CRGBW seeppart, carryover;
-  uint32_t seep = blur >> 1;
-  carryover =  BLACK;
-  for (uint32_t x = start; x < start + size; x++) {
-    seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
-    if (x > 0) {
-      colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart);
-      colorbuffer[x]   = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass
-    }
-    carryover = seeppart;
-  }
-}
 #endif // WLED_DISABLE_PARTICLESYSTEM1D
 
 #if !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D)) // not both disabled
diff --git a/wled00/FXparticleSystem.h b/wled00/FXparticleSystem.h
index e4f203ee30..6a22109a93 100644
--- a/wled00/FXparticleSystem.h
+++ b/wled00/FXparticleSystem.h
@@ -17,7 +17,7 @@
 #include <stdint.h>
 #include "wled.h"
 
-#define PS_P_MAXSPEED 120 // maximum speed a particle can have (vx/vy is int8)
+#define PS_P_MAXSPEED 120 // maximum speed a particle can have (vx/vy is int8), limiting below 127 to avoid overflows in collisions due to rounding errors
 #define MAX_MEMIDLE 10 // max idle time (in frames) before memory is deallocated (if deallocated during an effect, it will crash!)
 
 //#define WLED_DEBUG_PS // note: enabling debug uses ~3k of flash
@@ -196,12 +196,12 @@ class ParticleSystem2D {
 private:
   //rendering functions
   void render();
-  void renderParticleEllipse(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY);
   [[gnu::hot]] void renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY);
+  void renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY);
   //paricle physics applied by system if flags are set
   void applyGravity(); // applies gravity to all particles
   void handleCollisions();
-  void collideParticles(PSparticle &particle1, PSparticle &particle2, const int32_t dx, const int32_t dy, const uint32_t collDistSq, uint32_t massratio1, uint32_t massratio2);
+  void collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq, int32_t massratio1, int32_t massratio2);
   void fireParticleupdate();
   //utility functions
   void updatePSpointers(const bool isadvanced, const bool sizecontrol); // update the data pointers to current segment data space
@@ -228,7 +228,6 @@ class ParticleSystem2D {
   uint8_t smearBlur; // 2D smeared blurring of full frame
 };
 
-void blur2D(uint32_t *colorbuffer, const uint32_t xsize, uint32_t ysize, const uint32_t xblur, const uint32_t yblur, const uint32_t xstart = 0, uint32_t ystart = 0, const bool isparticle = false);
 // initialization functions (not part of class)
 bool initParticleSystem2D(ParticleSystem2D *&PartSys, const uint32_t requestedsources, const uint32_t additionalbytes = 0, const bool advanced = false, const bool sizecontrol = false);
 uint32_t calculateNumberOfParticles2D(const uint32_t pixels, const bool advanced, const bool sizecontrol);
@@ -318,9 +317,9 @@ typedef union {
 
 // struct for additional particle settings (optional)
 typedef struct {
-  uint8_t sat; //color saturation
+  uint8_t sat;  // color saturation
   uint8_t size; // particle size, 255 means 10 pixels in diameter, this overrides global size setting
-  uint8_t forcecounter;
+  uint8_t forcecounter; // counter for applying forces to individual particles
 } PSadvancedParticle1D;
 
 //struct for a particle source (20 bytes)
@@ -367,6 +366,7 @@ class ParticleSystem1D
   void setGravity(int8_t force = 8);
   void enableParticleCollisions(bool enable, const uint8_t hardness = 255);
 
+
   PSparticle1D *particles; // pointer to particle array
   PSparticleFlags1D *particleFlags; // pointer to particle flags array
   PSsource1D *sources; // pointer to sources
@@ -377,16 +377,18 @@ class ParticleSystem1D
   int32_t maxXpixel; // last physical pixel that can be drawn to (FX can read this to read segment size if required), equal to width-1
   uint32_t numSources; // number of sources
   uint32_t usedParticles; // number of particles used in animation, is relative to 'numParticles'
+  bool perParticleSize; // if true, uses individual particle sizes from advPartProps if available (disabled when calling setParticleSize())
 
 private:
   //rendering functions
   void render(void);
-  [[gnu::hot]] void renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap);
+  void renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap);
+  void renderLargeParticle(const uint32_t size, const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrap);
 
   //paricle physics applied by system if flags are set
   void applyGravity(); // applies gravity to all particles
   void handleCollisions();
-  [[gnu::hot]] void collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance);
+  void collideParticles(uint32_t partIdx1, uint32_t partIdx2, int32_t dx, uint32_t collisiondistance);
 
   //utility functions
   void updatePSpointers(const bool isadvanced); // update the data pointers to current segment data space
@@ -414,5 +416,5 @@ bool initParticleSystem1D(ParticleSystem1D *&PartSys, const uint32_t requestedso
 uint32_t calculateNumberOfParticles1D(const uint32_t fraction, const bool isadvanced);
 uint32_t calculateNumberOfSources1D(const uint32_t requestedsources);
 bool allocateParticleSystemMemory1D(const uint32_t numparticles, const uint32_t numsources, const bool isadvanced, const uint32_t additionalbytes);
-void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start);
+
 #endif // WLED_DISABLE_PARTICLESYSTEM1D

From 6a8c6c1f58d6018178ec7f18b83af62960f9cd31 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sat, 13 Dec 2025 19:54:01 +0100
Subject: [PATCH 4/4] bugfix

---
 wled00/FXparticleSystem.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index 9aff84b83b..4075f91456 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -1304,7 +1304,7 @@ void ParticleSystem1D::particleMoveUpdate(PSparticle1D &part, PSparticleFlags1D
     partFlags.outofbounds = false; // reset out of bounds (in case particle was created outside the matrix and is now moving into view)
 
     if (perParticleSize && advancedproperties != nullptr) { // using individual particle size?
-      renderradius = PS_P_HALFRADIUS - 1 + advancedproperties->size; // note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
+      renderradius = PS_P_HALFRADIUS_1D - 1 + advancedproperties->size; // note: for single pixel particles, it should be zero, but it does not matter as out of bounds checking is done in rendering function
       if (advancedproperties->size > 1)
         particleHardRadius = PS_P_MINHARDRADIUS_1D + ((advancedproperties->size * 52) >> 6); // use 1 pixel + 80% of size for hard radius (slight overlap with boarders so they do not "float" and nicer stacking)
       else // single pixel particles use half the collision distance for walls