Skip to content

Conversation

@anthonycanino
Copy link
Owner

Consider the following C# snippet

public static Half ProduceHalf(float val, float val2)
{
	Half h1 = (Half)val;
	Half h2 = (Half)val2;
	return ConsumeHalf(h1, h2);
}

public static Half ConsumeHalf(Half h1, Half h2)
{
	Half h3 = h1 + h2;
	return h3;
}

My changes produce the following Tier0 code...

; Assembly listing for method Program:ProduceHalf(float,float):System.Half (Tier0)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; Tier0 code
; rbp based frame
; partially interruptible
; compiling with minopt
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00    ] (  1,  1   )   float  ->  [rbp+0x10]  do-not-enreg[]
;  V01 arg1         [V01    ] (  1,  1   )   float  ->  [rbp+0x18]  do-not-enreg[]
;  V02 loc0         [V02    ] (  1,  1   )    half  ->  [rbp-0x08]  do-not-enreg[S] must-init <System.Half>
;  V03 OutArgs      [V03    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
;  V04 tmp1         [V04    ] (  1,  1   )    half  ->  [rbp-0x10]  do-not-enreg[S] "non-inline candidate call"
;
; Lcl frame size = 48

G_M43922_IG01:  ;; offset=0x0000
       55                   push     rbp
       4883EC30             sub      rsp, 48
       488D6C2430           lea      rbp, [rsp+0x30]
       33C0                 xor      eax, eax
       488945F8             mov      qword ptr [rbp-0x08], rax
       C5FA114510           vmovss   dword ptr [rbp+0x10], xmm0
       C5FA114D18           vmovss   dword ptr [rbp+0x18], xmm1
						;; size=26 bbWeight=1 PerfScore 5.00
G_M43922_IG02:  ;; offset=0x001A
       C5FA104510           vmovss   xmm0, dword ptr [rbp+0x10]
       FF1593BB7E00         call     [System.Half:op_Explicit(float):System.Half]
       668945F0             mov      word  ptr [rbp-0x10], ax
       C5FA104518           vmovss   xmm0, dword ptr [rbp+0x18]
       FF1584BB7E00         call     [System.Half:op_Explicit(float):System.Half]
       668945F8             mov      word  ptr [rbp-0x08], ax
       0FB74DF0             movzx    rcx, word  ptr [rbp-0x10]
       0FB755F8             movzx    rdx, word  ptr [rbp-0x08]
       FF158ABB7E00         call     [Program:ConsumeHalf(System.Half,System.Half):System.Half]
       90                   nop      
						;; size=45 bbWeight=1 PerfScore 19.25
G_M43922_IG03:  ;; offset=0x0047
       4883C430             add      rsp, 48
       5D                   pop      rbp
       C3                   ret      
						;; size=6 bbWeight=1 PerfScore 1.75

; Total bytes of code 77, prolog size 16, PerfScore 26.00, instruction count 20, allocated bytes for code 77 (MethodHash=f2a4546d) for method Program:ProduceHalf(float,float):System.Half (Tier0)
; ============================================================

; Assembly listing for method Program:ConsumeHalf(System.Half,System.Half):System.Half (Tier0)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; Tier0 code
; rbp based frame
; partially interruptible
; compiling with minopt
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00    ] (  1,  1   )    half  ->  [rbp+0x10]  do-not-enreg[S] <System.Half>
;  V01 arg1         [V01    ] (  1,  1   )    half  ->  [rbp+0x18]  do-not-enreg[S] <System.Half>
;  V02 OutArgs      [V02    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
;
; Lcl frame size = 32

G_M44932_IG01:  ;; offset=0x0000
       55                   push     rbp
       4883EC20             sub      rsp, 32
       488D6C2420           lea      rbp, [rsp+0x20]
       894D10               mov      dword ptr [rbp+0x10], ecx
       895518               mov      dword ptr [rbp+0x18], edx
						;; size=16 bbWeight=1 PerfScore 3.75
G_M44932_IG02:  ;; offset=0x0010
       0FB74D10             movzx    rcx, word  ptr [rbp+0x10]
       0FB75518             movzx    rdx, word  ptr [rbp+0x18]
       FF15EABA7E00         call     [System.Half:op_Addition(System.Half,System.Half):System.Half]
       90                   nop      
						;; size=15 bbWeight=1 PerfScore 5.25
G_M44932_IG03:  ;; offset=0x001F
       4883C420             add      rsp, 32
       5D                   pop      rbp
       C3                   ret      
						;; size=6 bbWeight=1 PerfScore 1.75

; Total bytes of code 37, prolog size 10, PerfScore 10.75, instruction count 12, allocated bytes for code 37 (MethodHash=5608507b) for method Program:ConsumeHalf(System.Half,System.Half):System.Half (Tier0)
; ============================================================

and the following Tier1 code

; Assembly listing for method Program:ProduceHalf(float,float):System.Half (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )   float  ->  mm0         single-def
;  V01 arg1         [V01,T01] (  3,  3   )   float  ->  mm1         single-def
;  V02 loc0         [V02    ] (  2,  2   )    half  ->  [rsp+0x20]  single-def <System.Half>
;  V03 OutArgs      [V03    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
;
; Lcl frame size = 40

G_M43922_IG01:  ;; offset=0x0000
       4883EC28             sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25
G_M43922_IG02:  ;; offset=0x0004
       62F574081DC9         vcvtss2sh xmm1, xmm1
       62F57E08114C2410     vmovsh   word  ptr [rsp+0x20], xmm1
       62F57C081DC0         vcvtss2sh xmm0, xmm0
       C5F97EC1             vmovd    ecx, xmm0
       0FB7542420           movzx    rdx, word  ptr [rsp+0x20]
       FF15D1BB7E00         call     [Program:ConsumeHalf(System.Half,System.Half):System.Half]
       90                   nop      
						;; size=36 bbWeight=1 PerfScore 9.25
G_M43922_IG03:  ;; offset=0x0028
       4883C428             add      rsp, 40
       C3                   ret      
						;; size=5 bbWeight=1 PerfScore 1.25

; Total bytes of code 45, prolog size 4, PerfScore 10.75, instruction count 10, allocated bytes for code 45 (MethodHash=f2a4546d) for method Program:ProduceHalf(float,float):System.Half (FullOpts)
; ============================================================

; Assembly listing for method Program:ConsumeHalf(System.Half,System.Half):System.Half (FullOpts)
; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00    ] (  3,  3   )    half  ->  [rsp+0x08]  single-def <System.Half>
;  V01 arg1         [V01    ] (  3,  3   )    half  ->  [rsp+0x10]  single-def <System.Half>
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
;
; Lcl frame size = 0

G_M44932_IG01:  ;; offset=0x0000
       66894C2408           mov      word  ptr [rsp+0x08], cx
       6689542410           mov      word  ptr [rsp+0x10], dx
						;; size=10 bbWeight=1 PerfScore 2.00
G_M44932_IG02:  ;; offset=0x000A
       62F57E0810442404     vmovsh   xmm0, word  ptr [rsp+0x08]
       62F57E0858442408     vaddsh   xmm0, xmm0, word  ptr [rsp+0x10]
       C5F97EC0             vmovd    eax, xmm0
						;; size=20 bbWeight=1 PerfScore 6.00
G_M44932_IG03:  ;; offset=0x001E
       C3                   ret      
						;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 31, prolog size 0, PerfScore 9.00, instruction count 6, allocated bytes for code 31 (MethodHash=5608507b) for method Program:ConsumeHalf(System.Half,System.Half):System.Half (FullOpts)
; ============================================================

@anthonycanino
Copy link
Owner Author

anthonycanino commented Nov 12, 2025

For the unoptimized ConsumeHalf, prior to lowering my tree looks like this...

STMT00000 ( 0x000[E-] ... 0x007 )
N004 (???,???) [000003] --CXG+-----                         *  RETURN    half  
N003 (???,???) [000002] --CXG+-----                         \--*  CALL      half   System.Half:op_Addition(System.Half,System.Half):System.Half
N001 (???,???) [000000] -----+----- arg0 rcx                   +--*  LCL_VAR   half  <System.Half, 2> V00 arg0         
N002 (???,???) [000001] -----+----- arg1 rdx                   \--*  LCL_VAR   half  <System.Half, 2> V01 arg1  

After lowering, it looks like this...

              [000004] -----------                            IL_OFFSET void   INLRT @ 0x000[E-]
N001 (???,???) [000000] -c---+-----                    t0 =    LCL_VAR   half  <System.Half, 2> V00 arg0         
                                                            /--*  t0     half   
               [000005] -----------                    t5 = *  BITCAST   ushort
                                                            /--*  t5     ushort 
               [000006] -----------                    t6 = *  PUTARG_REG int    REG rcx
N002 (???,???) [000001] -c---+-----                    t1 =    LCL_VAR   half  <System.Half, 2> V01 arg1         
                                                            /--*  t1     half   
               [000007] -----------                    t7 = *  BITCAST   ushort
                                                            /--*  t7     ushort 
               [000008] -----------                    t8 = *  PUTARG_REG int    REG rdx
N001 (???,???) [000009] Hc---------                    t9 =    CNS_INT(h) long   0x7ffd85f27b58 ftn
                                                            /--*  t9     long   
N002 (???,???) [000010] nc--G------                   t10 = *  IND       long   REG NA
                                                            /--*  t6     int    arg0 rcx
                                                            +--*  t8     int    arg1 rdx
                                                            +--*  t10    long   control expr
N003 (???,???) [000002] --CXG+-----                    t2 = *  CALL      int    System.Half:op_Addition(System.Half,System.Half):System.Half
                                                            /--*  t2     int    
N004 (???,???) [000003] --CXG+-----                         *  RETURN    int 

For the optimized ConsumeHalf, prior to lowering my tree looks like this...

               [000004] -----------                            IL_OFFSET void   INLRT @ 0x000[E-]
N001 (  1,  2) [000000] -----+-----                    t0 =    LCL_VAR   half  <System.Half, 2> V00 arg0          $c0
N002 (  1,  2) [000001] -----+-----                    t1 =    LCL_VAR   half  <System.Half, 2> V01 arg1          $c1
                                                            /--*  t0     half   
                                                            +--*  t1     half   
N003 (  3,  5) [000002] -----+-----                    t2 = *  HWINTRINSIC half   16 half HalfAdd $140
                                                            /--*  t2     half   
N004 (  4,  6) [000003] -----+-----                         *  RETURN    half   $VN.Void

After lowering, it looks like this...

               [000004] -----------                            IL_OFFSET void   INLRT @ 0x000[E-]
N001 (  1,  2) [000000] -----+-----                    t0 =    LCL_VAR   half  <System.Half, 2> V00 arg0          $c0
N002 (  1,  2) [000001] -----+-----                    t1 =    LCL_VAR   half  <System.Half, 2> V01 arg1          $c1
                                                            /--*  t0     half   
                                                            +--*  t1     half   
N003 (  3,  5) [000002] -----+-----                    t2 = *  HWINTRINSIC half   16 half HalfAdd $140
                                                            /--*  t2     half   
               [000005] -----------                    t5 = *  BITCAST   int   
                                                            /--*  t5     int    
N004 (  4,  6) [000003] -----+-----                         *  RETURN    int    $VN.Void

We can see the ABI is respected when the half type needs to be passed, but when it does not, it is treated as a float register.

I like how the trees look a bit more now, but I have disabled some asserts; in particular, we are allowing a BITCAST from half to ushort and allowing the ushort to be placed with a PUTARG_REG.

For reference, the full JIT dumps are attached.

half-unopt.txt
haf-opt.txt

Comment on lines +2990 to 3004
instruction store_ins = ins_Store(storeType);
if (storeType == TYP_HALF)
{
// We cannot use `vmovsh` with an integer register, which is what the ABI would pass the
// 16-bit float value in. Switch to `vmov` which supports integer registers.
#if defined(TARGET_XARCH)
store_ins = INS_mov;
#else
store_ins = INS_invalid;
assert(!"TYP_HALF parameter passing not supported on this platform");
#endif
}

GetEmitter()->emitIns_S_R(store_ins, emitActualTypeSize(storeType), segment.GetRegister(), lclNum,
offset);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get this, but it also seems like "extra work" that might pessimize the typical path.

I wonder if it'd be better to just keep TYP_STRUCT, much as it is for the struct S { float value; } case. In which case the classifier would state it lives in a GPR and we'd just get INS_mov already. I wouldn't expect us to track it as TYP_HALF unless we're

So we'd only retype/normalize to TYP_HALF in the case the ABI is being followed and it's passed in a SIMD register (at which point we can use vpextrw and vpinsrw/vmovd (F16C) or vmovw (AVX512-FP16)).

This might even push us in the direction that following the ABI is trivial and expected. We only produce TYP_HALF if it's accelerated, and the handling in that case is correct by convention of it being floating-point and following the other floating-point conventions.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I am not sure I completely follow.

Would we then perform a normalization/type conversion as a separate compiler pass? Meaning, we do not normalize on a struct import like it is currently done?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if the idea to preserve the existing behavior is actually making this more complex, rather than simpler. I'm thinking it's actually less work to just do this right in the first place, ABI wise.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I've seen, I think it is.

One thing I can try is prototyping the ABI where half is passed and returned in a float register. We can see if it is much simpler. That shouldn't take too long to try out.

Copy link
Owner Author

@anthonycanino anthonycanino Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding I put the changes on a separate branch #17

I have not addressed all nits below, but I wanted to get the major changes up and see what the code looked like. I think this is much easier, as I am only changing a few places. You'll see in the code that some of the signature checking might need to be updated. It's much easier to not treat TYP_HALF as a struct in any way, and instead treat it wholly as a primitive type (otherwise the jit picks it up as a struct in so many places where different handling is needed).

Copy link
Owner Author

@anthonycanino anthonycanino Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did it intentionally.

When TYP_HALF was treated as a struct type, it caused a lot of handling to apply to the type that didn't make sense for a primitive type.

One confusion that is going on, and perhaps we should move the conversation, is that we are talking about the PR at #17, where I redid the handling of the half type to more closely match that of a primitive float type in terms of the ABI.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a difference between treating it as TYP_STRUCT and having it classified as a struct such that varTypeIsStruct(...) reports true (being marked with VTF_S).

We don't expect the former, because we're retyping it to TYP_HALF (much as we retype to TYP_SIMD8/12/16/32/64). But we do still expect the latter.

Looking at the new type entry, it may be that the combination of VTF_S and VTF_FLT is breaking things here, as that may get into ordering or other nuance that hasn't been accounted for due to it being a "new combination". So I'd suggest removing VTF_FLT for now and just having it be specially handled via the intrinsic code paths. We can always look at also handling VTF_FLT in the future if that provides benefit

-- This basically means varTypeIsFloating(...) will report false, but that's probably okay if we're importing everything as GT_HWINTRINSIC nodes anyways. We'd really only need the other if we end up making scalar support more "first class" such that GT_ADD and such works.

Copy link
Owner Author

@anthonycanino anthonycanino Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had tried that route previously, and there are a lot of asserts that get hit. For example, in the gtNewSimdToScalarNode which we use to build up the Half operations, this will trigger...

image

Likewise, I believe there are a lot of places where, when generating the code or hwintrinsic, varTypeIsFloating is checked, for example...

image

I don't think these are isolated to the one Vector128_ToScalar case, but I can try adding an or case there to see.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I expect there will be several fixups regardless of including it or not.

I expect that its inclusion is what’s making the “invalid IR” right now.

You could (if keeping the flag) try to find and handle those cases, such as by ensuring varTypeIsStruct comes first, so it takes precedence over varTypeIsFloating, or (if you remove the flag) you could update the hardware intrinsic paths to also allow TYP_HALF in many of the paths that assert it must be floating-point.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It’s really just an area where we’ve not quite done “this” before and so we don’t have things in place to make it easy.

We’d have the same general issues with adding some TYP_INT128. Anything added after this pr should be much easier, however

void CodeGen::genCodeForBitCast(GenTreeOp* treeNode)
{
assert(treeNode->TypeGet() == genActualType(treeNode));
//assert(treeNode->TypeGet() == genActualType(treeNode));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this need to be commented out? I'd expect this to just be TYP_HALF == TYP_HALF since we aren't implicitly extending the type up to a larger type size.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can see this in the trees above, but what happens is Lowering::InsertBitCastIfNecessary will insert the bitcast to TYP_USHORT not TYP_HALF, as the ABI defines the TYP_HALF to be passed in an integer register. The code will grab the appropriate integer register for a 2 byte value, hence, TYP_USHORT.

https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lower.cpp#L1934-L1944

It's this line in particular that will produce the bitcast...

https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lower.cpp#L1923

TYP_HALF has to be defined to use float registers, but its ABI register segment info defines it to require an integer register for passing.

Comment on lines +3080 to +3081
assert((leadingBytes == 0x0F) || ((emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) ||
emitComp->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can just check 10v1, since 10v2 implies 10v1.

else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
if (size == EA_2BYTE && ins != INS_vmovsh)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking if it's a vex/evex/simd instruction might be more appropriate, since those all shouldn't be EA_2BYTE or should have this prefix byte as part of their better encoding.

else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
if (size == EA_2BYTE && (ins != INS_vmovsh && ins != INS_vaddsh))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment here and elsewhere

break;
}

case NI_AVX10v1_ConvertFloatToHalf:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: ConvertSingleToHalf


// Don't bother if the struct contains GC references of byrefs, it can't be a SIMD type.
if ((structFlags & (CORINFO_FLG_CONTAINS_GC_PTR | CORINFO_FLG_BYREF_LIKE)) == 0)
if ((structFlags & (CORINFO_FLG_CONTAINS_GC_PTR | CORINFO_FLG_BYREF_LIKE)) == 0 && (structFlags & CORINFO_FLG_INTRINSIC_TYPE) != 0)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be simplified to (structFlags & (CORINFO_FLG_CONTAINS_GC_PTR | CORINFO_FLG_BYREF_LIKE | CORINFO_FLG_INTRINSIC_TYPE)) == CORINFO_FLG_INTRINSIC_TYPE

}
}
}
else if (strcmp(namespaceName, "System") == 0)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure this is the "right" place for this, particularly given the current method name.

anthonycanino pushed a commit that referenced this pull request Dec 18, 2025
Enable X64's optimization where we clear LCLHEAP via STORE_BLK inserted
in Lower on arm64.

```cs
static void Test128() => Consume(stackalloc char[128]);
```
was:
```asm
            stp     xzr, xzr, [sp, #-0x10]!
            stp     xzr, xzr, [sp, #-0xF0]!
            stp     xzr, xzr, [sp, #0x10]
            stp     xzr, xzr, [sp, #0x20]
            stp     xzr, xzr, [sp, #0x30]
            stp     xzr, xzr, [sp, #0x40]
            stp     xzr, xzr, [sp, #0x50]
            stp     xzr, xzr, [sp, #0x60]
            stp     xzr, xzr, [sp, #0x70]
            stp     xzr, xzr, [sp, #0x80]
            stp     xzr, xzr, [sp, #0x90]
            stp     xzr, xzr, [sp, #0xA0]
            stp     xzr, xzr, [sp, #0xB0]
            stp     xzr, xzr, [sp, #0xC0]
            stp     xzr, xzr, [sp, #0xD0]
            stp     xzr, xzr, [sp, #0xE0]
```
now:
```asm
            movi    v16.16b, #0
            stp     q16, q16, [x0]
            stp     q16, q16, [x0, #0x20]
            stp     q16, q16, [x0, #0x40]
            stp     q16, q16, [x0, #0x60]
            stp     q16, q16, [x0, #0x80]
            stp     q16, q16, [x0, #0xA0]
            stp     q16, q16, [x0, #0xC0]
            stp     q16, q16, [x0, #0xE0]
```

Also, for larger sizes the previous logic used to emit a slow loop (e.g.
1024 bytes):
```asm
            mov     w0, #0x400
G_M30953_IG03:
            stp     xzr, xzr, [sp, #-0x10]!
            subs    x0, x0, #16
            bne     G_M30953_IG03
```
Now it will emit a call to `CORINFO_HELP_MEMZERO`


[Benchmarks.](EgorBot/runtime-utils#553)

```cs
using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;

public class Benchmarks
{
    [Benchmark] public void Stackalloc64() => Consume(stackalloc byte[64]);
    [Benchmark] public void Stackalloc128() => Consume(stackalloc byte[128]);
    [Benchmark] public void Stackalloc256() => Consume(stackalloc byte[256]);
    [Benchmark] public void Stackalloc512() => Consume(stackalloc byte[512]);
    [Benchmark] public void Stackalloc1024() => Consume(stackalloc byte[1024]);
    [Benchmark] public void Stackalloc16384() => Consume(stackalloc byte[16384]);

    [MethodImpl(MethodImplOptions.NoInlining)]
    static void Consume(Span<byte> x){}
}
```

| Method | Toolchain | Mean | Error | Ratio |
|---------------- |------------------------ |-----------:|----------:|------:|
| Stackalloc64    | Main |   3.425 ns | 0.0004 ns |  1.00 |
| Stackalloc64    | PR |   2.559 ns | 0.0008 ns |  0.75 |
| | | | | |
| Stackalloc128   | Main |   3.999 ns | 0.0002 ns |  1.00 |
| Stackalloc128   | PR |   2.404 ns | 0.0003 ns |  0.60 |
| | | | | |
| Stackalloc256   | Main |   5.431 ns | 0.0005 ns |  1.00 |
| Stackalloc256   | PR |   2.754 ns | 0.0003 ns |  0.51 |
| | | | | |
| Stackalloc512   | Main |  12.661 ns | 0.2744 ns |  1.00 |
| Stackalloc512   | PR |   7.423 ns | 0.0008 ns |  0.59 |
| | | | | |
| Stackalloc1024  | Main |  24.958 ns | 0.5326 ns |  1.00 |
| Stackalloc1024  | PR |  14.031 ns | 0.0040 ns |  0.56 |
| | | | | |
| Stackalloc16384 | Main | 374.899 ns | 0.0130 ns |  1.00 |
| Stackalloc16384 | PR | 111.029 ns | 1.2123 ns |  0.30 |

---------

Co-authored-by: Jakob Botsch Nielsen <Jakob.botsch.nielsen@gmail.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants