From 88afa3112310a39e7323e495e925afae4db40e4f Mon Sep 17 00:00:00 2001
From: ahxxm
Date: Fri, 27 Feb 2026 23:32:05 +0900
Subject: [PATCH 1/2] unroll scalar fill loop by 4

---
 runcontainer.go | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/runcontainer.go b/runcontainer.go
index 473ce34d..ee878810 100644
--- a/runcontainer.go
+++ b/runcontainer.go
@@ -1242,8 +1242,18 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int {
 
 			// allows BCE
 			buf2 := buf[n : n+moreVals]
-			for i := range buf2 {
-				buf2[i] = base + uint32(i)
+			i := 0
+			for ; i+3 < len(buf2); i += 4 {
+				_ = buf2[i+3]
+				buf2[i] = base
+				buf2[i+1] = base + 1
+				buf2[i+2] = base + 2
+				buf2[i+3] = base + 3
+				base += 4
+			}
+			for ; i < len(buf2); i++ {
+				buf2[i] = base
+				base++
 			}
 
 			// update values
@@ -1283,8 +1293,18 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int {
 
 			// allows BCE
 			buf2 := buf[n : n+moreVals]
-			for i := range buf2 {
-				buf2[i] = base + uint64(i)
+			i := 0
+			for ; i+3 < len(buf2); i += 4 {
+				_ = buf2[i+3]
+				buf2[i] = base
+				buf2[i+1] = base + 1
+				buf2[i+2] = base + 2
+				buf2[i+3] = base + 3
+				base += 4
+			}
+			for ; i < len(buf2); i++ {
+				buf2[i] = base
+				base++
 			}
 
 			// update values

From 4d1a03bd6cbc5db942edab0e530bc11df9349ed0 Mon Sep 17 00:00:00 2001
From: ahxxm
Date: Fri, 27 Feb 2026 23:42:31 +0900
Subject: [PATCH 2/2] hint only covers buf2[i+2] and buf2[i+3], and no speed gain

---
 runcontainer.go | 2 --
 1 file changed, 2 deletions(-)

diff --git a/runcontainer.go b/runcontainer.go
index ee878810..4731da73 100644
--- a/runcontainer.go
+++ b/runcontainer.go
@@ -1244,7 +1244,6 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int {
 			buf2 := buf[n : n+moreVals]
 			i := 0
 			for ; i+3 < len(buf2); i += 4 {
-				_ = buf2[i+3]
 				buf2[i] = base
 				buf2[i+1] = base + 1
 				buf2[i+2] = base + 2
@@ -1295,7 +1294,6 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int {
 			buf2 := buf[n : n+moreVals]
 			i := 0
 			for ; i+3 < len(buf2); i += 4 {
-				_ = buf2[i+3]
 				buf2[i] = base
 				buf2[i+1] = base + 1
 				buf2[i+2] = base + 2