Golang cmd/compile: a constant expression is moved into a loop

Go version

go version go1.24rc2 linux/amd64

Output of `go env` in your module/workspace:

AR='ar'
CC='gcc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='g++'
GCCGO='gccgo'
GO111MODULE=''
GOAMD64='v1'
GOARCH='amd64'
GOAUTH='netrc'
GOBIN=''
GOCACHE='/home/artem/.cache/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/home/artem/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build211132318=/tmp/go-build -gno-record-gcc-switches'
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMOD='/home/artem/dev/mono/go.mod'
GOMODCACHE='/home/artem/go/pkg/mod'
GOOS='linux'
GOPATH='/home/artem/go'
GOPROXY='http://gomodproxy:1111/'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTELEMETRY='off'
GOTELEMETRYDIR='/home/artem/.config/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.24rc2'
GOWORK=''
PKG_CONFIG='pkg-config'

What did you do?

Compile the following code:

package main

//go:noinline
func Sum0(xs []byte, mask uint64) (n int, found bool) {
        var sum uint64
        for i, x := range xs {
                sum = (sum << 1) + table[x]
                if (sum & mask) == 0 {
                        return i, true
                }
        }
        return 0, false
}

//go:noinline
func Sum1(xs []byte, mask uint64) (n int, found bool) {
        t := table[:]

        var sum uint64
        for i, x := range xs {
                sum = (sum << 1) + t[x]
                if (sum & mask) == 0 {
                        return i, true
                }
        }
        return 0, false
}

func main() {
        xs := []byte{0, 1, 2, 3, 4, 5, 6, 7}
        _, _ = Sum0(xs, ^uint64(0xff))
        _, _ = Sum1(xs, ^uint64(0xff))
}

var table = [256]uint64{
        0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
        0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
        0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
        0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
        0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
        0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
        0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
        0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
        0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
        0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
        0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
        0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
        0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
        0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
        0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
        0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
        0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
        0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
        0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
        0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
        0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
        0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
        0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
        0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
        0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
        0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
        0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
        0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
        0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
        0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
        0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
        0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
}

What did you see happen?

The second function loads the global var table into a local var before the loop, and never changes it. I'd expect that load to happen once in the generated code, yet Sum0() and Sum1() compile to the same code and load table in every loop iteration:

0000000000468f20 <main.Sum0>:
  468f20:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  468f25:       31 c9                   xor    %ecx,%ecx
  468f27:       31 d2                   xor    %edx,%edx
  468f29:       eb 03                   jmp    468f2e <main.Sum0+0xe>
  468f2b:       48 ff c1                inc    %rcx
  468f2e:       48 39 cb                cmp    %rcx,%rbx
  468f31:       7e 21                   jle    468f54 <main.Sum0+0x34>
  468f33:       0f b6 34 08             movzbl (%rax,%rcx,1),%esi
  468f37:       4c 8d 05 82 9c 08 00    lea    0x89c82(%rip),%r8        # 4f2bc0 <main.table>
  468f3e:       49 8b 34 f0             mov    (%r8,%rsi,8),%rsi
  468f42:       48 8d 14 56             lea    (%rsi,%rdx,2),%rdx
  468f46:       48 85 d7                test   %rdx,%rdi
  468f49:       75 e0                   jne    468f2b <main.Sum0+0xb>
  468f4b:       48 89 c8                mov    %rcx,%rax
  468f4e:       bb 01 00 00 00          mov    $0x1,%ebx
  468f53:       c3                      ret
  468f54:       31 c0                   xor    %eax,%eax
  468f56:       31 db                   xor    %ebx,%ebx
  468f58:       c3                      ret

0000000000468f60 <main.Sum1>:
  468f60:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  468f65:       31 c9                   xor    %ecx,%ecx
  468f67:       31 d2                   xor    %edx,%edx
  468f69:       eb 03                   jmp    468f6e <main.Sum1+0xe>
  468f6b:       48 ff c1                inc    %rcx
  468f6e:       48 39 cb                cmp    %rcx,%rbx
  468f71:       7e 21                   jle    468f94 <main.Sum1+0x34>
  468f73:       0f b6 34 08             movzbl (%rax,%rcx,1),%esi
  468f77:       4c 8d 05 42 9c 08 00    lea    0x89c42(%rip),%r8        # 4f2bc0 <main.table>   <-- !!!
  468f7e:       49 8b 34 f0             mov    (%r8,%rsi,8),%rsi
  468f82:       48 8d 14 56             lea    (%rsi,%rdx,2),%rdx
  468f86:       48 85 d7                test   %rdx,%rdi
  468f89:       75 e0                   jne    468f6b <main.Sum1+0xb>
  468f8b:       48 89 c8                mov    %rcx,%rax
  468f8e:       bb 01 00 00 00          mov    $0x1,%ebx
  468f93:       c3                      ret
  468f94:       31 c0                   xor    %eax,%eax
  468f96:       31 db                   xor    %ebx,%ebx
  468f98:       c3                      ret

What did you expect to see?

Do not move constant expressions into a loop, evaluate them only once.

Even in the case of Sum0() repeated reloads of table look dubious to me. Another goroutine may change table concurrently, but in the absence of locking in Sum0() I'd argue that hoisting the load out of the loop is as racy as reloading it every time. Moreover, table is private to the package so the compiler can know that no code in the package writes to it, nor takes its address, so table is a constant.

Comment From: gabyhelp

Related Issues

_{(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in this discussion.)}

Comment From: randall77

I'd expect that load to happen once in the generated code

That's not a load, it is just an address calculation.

The compiler doesn't try to keep those outside the loop currently. We could, I guess, but there's a tension there between computing the address just once, and using an additional register to hold it.

The worries about raciness don't apply here, as table[:] is just an address calculation, not a load. If table was defined as a slice instead of an array, then the compiler needs to be more careful (and it is).

Comment From: randall77

Do you have a benchmark that demonstrates that the LEA inside the loop makes things slower?

Comment From: artem-anisimov-0x7f

Here is the C + asm version of the same loop. sum3() has lea outside the loop, sum4() has lea inside the loop. The benchmark calculates sum3() and sum4() 16 times over an array of 128M bytes. On a Ryzen 3950x, I get the following results:

$ gcc -o test -O2 -fPIE test.c
$ ./test
sum3: 1075.885ms
sum4: 1187.926ms

This is a 10.4% more time because of a lea inside the loop.

The code for the benchmark:

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <err.h>

static uint64_t table[];

uint64_t sum0(uint8_t *xs, uint64_t len, uint64_t mask)
{
        uint64_t sum = 0;
        for (uint64_t i = 0; i < len; ++i) {
                sum = (sum << 1) + table[xs[i]];
                if ((sum & mask) == 0)
                        return i + 1;
        }
        return ~0;
}

uint64_t __attribute__((naked)) sum3(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "jmp sum3start                   \n\t"
                "nop                             \n\t"

                "sum3loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum3start:                      \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum3notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
                "mov (%%r9, %%r8, 8), %%r8       \n\t"
                "lea (%%r8, %%rcx, 2), %%rcx     \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum3loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum3notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

uint64_t __attribute__((naked)) sum4(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "jmp sum4start                   \n\t"
                ".byte 0x0f, 0x1f, 0x84, 0x00    \n\t"
                ".byte 0x00, 0x00, 0x00, 0x00    \n\t"

                "sum4loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum4start:                      \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum4notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "mov (%%r9, %%r8, 8), %%r8       \n\t"
                "lea (%%r8, %%rcx, 2), %%rcx     \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum4loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum4notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

static uint8_t* generate(uint64_t len)
{
        uint8_t *xs = malloc(len);
        xs[0] = 0;

        uint8_t *x = &xs[1];
        uint64_t ctr = 0;
        for (uint64_t i = 1; i < len; ++i) {
                ctr = ctr * 6364136223846793005 + 1442695040888963407;
                *x++ = ctr;
        }

        return xs;
}

static void compare(uint8_t *xs, uint64_t len, uint64_t mask)
{
        while (len > 0) {
                uint64_t a0 = sum0(xs, len, mask),
                        a3 = sum3(xs, len, mask),
                        a4 = sum4(xs, len, mask);

                if (a0 != a3)
                        errx(1, "sum0() and sum3() disagree");
                if (a0 != a4)
                        errx(1, "sum0() and sum4() disagree");

                if (a0 != -1ULL) {
                        xs += a0;
                        len -= a0;
                } else {
                        break;
                }
        }
}

static void measure(const char *name, uint64_t (*fn)(uint8_t *xs, uint64_t len, uint64_t mask), uint8_t *xs, uint64_t len)
{
        struct timespec begin, end;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &begin);
        for (int i = 0; i < 16; ++i)
                (void) fn(xs, len, -1ULL);
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);

        long long sec = end.tv_sec - begin.tv_sec;
        long long nsec = end.tv_nsec - begin.tv_nsec;
        long long delta = (sec * 1000 * 1000) + (nsec / 1000);

        printf("%s: %.3lfms\n", name, delta / 1000.0);
}

int main()
{
        const uint64_t len = 128 * 1024 * 1024;
        const uint64_t mask = 0x7fULL;

        uint8_t *xs = generate(len);
        compare(xs, len, mask);

        measure("sum3", &sum3, xs, len);
        measure("sum4", &sum4, xs, len);

        return 0;
}

static uint64_t table[] = {
        0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
        0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
        0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
        0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
        0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
        0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
        0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
        0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
        0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
        0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
        0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
        0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
        0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
        0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
        0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
        0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
        0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
        0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
        0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
        0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
        0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
        0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
        0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
        0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
        0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
        0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
        0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
        0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
        0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
        0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
        0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
        0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
};

There is one difference. movzbl is one byte longer in my version because it moves into %r8d instead of %esi, but I think that should not affect the result.

Comment From: artem-anisimov-0x7f

There is another interesting thing. Consider the following code. It has the same loop as Sum0() in my initial example, but it convinces the register allocator to insert a move that could be avoided (I have minimised it from a real example to keep the move):

type S struct {
        minSize  uint
        maxSize  uint
        normSize uint

        maskS uint64
        maskL uint64

        scanned uint
        sum     uint64
}

//go:noinline
func (s *S) Sum(xs []byte) (uint, bool) {
        l := uint(len(xs))
        sum := s.sum

        var i uint
        if s.scanned < s.minSize {
                i = s.minSize - s.scanned
        }

        n := min(l, s.normSize-s.scanned)
        mask := s.maskS
        for ; i < n; i++ {
                sum = (sum << 1) + table[xs[i]]
                if (sum & mask) == 0 {
                        return i + 1, true
                }
        }

        if i+s.scanned == s.maxSize {
                return s.maxSize, true
        }

        return l, false
}

The loop compiles to the following code:

  468fd3:       48 ff c6                inc    %rsi
  468fd6:       49 89 f8                mov    %rdi,%r8
  468fd9:       48 39 ce                cmp    %rcx,%rsi
  468fdc:       73 23                   jae    469001 <main.(*S).Sum+0x61>
  468fde:       0f b6 3c 33             movzbl (%rbx,%rsi,1),%edi
  468fe2:       4c 8d 1d d7 9b 08 00    lea    0x89bd7(%rip),%r11        # 4f2bc0 <main.table>
  468fe9:       49 8b 3c fb             mov    (%r11,%rdi,8),%rdi
  468fed:       4a 8d 3c 47             lea    (%rdi,%r8,2),%rdi
  468ff1:       49 85 f9                test   %rdi,%r9
  468ff4:       75 dd                   jne    468fd3 <main.(*S).Sum+0x33>

mov %rdi, %r8 is only needed because of movzbl at 468fde and mov at 468fe9. Those two instructions could use %r8 instead of %rdi.

The C + asm code for this version of the loop (sum1() has lea outside the loop and sum2() has lea inside the loop):

uint64_t __attribute__((naked)) sum1(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "jmp sum1start                   \n\t"
                "nop                             \n\t"

                "sum1loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum1start:                      \n\t"
                "mov %%rcx, %%r10                \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum1notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
                "mov (%%r9, %%rcx, 8), %%rcx     \n\t"
                "lea (%%rcx, %%r10, 2), %%rcx    \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum1loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum1notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

uint64_t __attribute__((naked)) sum2(uint8_t *xs, uint64_t len, uint64_t mask)
{
        __asm__ __volatile__ (
                "xor %%rax, %%rax                \n\t"
                "xor %%rcx, %%rcx                \n\t"
                "jmp sum2start                   \n\t"
                ".byte 0x0f, 0x1f, 0x84, 0x00    \n\t"
                ".byte 0x00, 0x00, 0x00, 0x00    \n\t"

                "sum2loop:                       \n\t"
                "inc %%rax                       \n\t"

                "sum2start:                      \n\t"
                "mov %%rcx, %%r10                \n\t"
                "cmp %%rax, %%rsi                \n\t"
                "je sum2notfound                 \n\t"

                "movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
                "lea table(%%rip), %%r9          \n\t"
                "mov (%%r9, %%rcx, 8), %%rcx     \n\t"
                "lea (%%rcx, %%r10, 2), %%rcx    \n\t"
                "test %%rdx, %%rcx               \n\t"
                "jne sum2loop                    \n\t"

                "inc %%rax                       \n\t"
                "ret                             \n\t"
                "sum2notfound:                   \n\t"
                "mov $0xffffffffffffffff, %%rax  \n\t"
                "ret"
                :
                :
                :
        );
}

The difference of the run time between the two is

$ gcc -o test -O2 -fPIE test.c
$ ./test
sum1: 1063.284ms
sum2: 1291.752ms

For this version of the loop, the difference between lea inside and outside the loop is no longer 10.4% as in the previous comment, but 21.5%.

Comment From: Jorropo

I couldn't believe that a random LEA would slow down this benchmark, it has no dependency and in real life you almost always have one free ALU or AGU unit to execute the LEA in parallel of other instructions ahead of when you need it. So I proceeded to create a benchmark proving this is a loop alignment issue as this almost always explain weird performance differences.

Source code: https://go.dev/play/p/dwi-EzN2oTo

You also need to patch the assembler to use <ABIInternal>:

diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go
index 638f4e2fc4..0bc8d1e3ce 100644
--- a/src/cmd/asm/internal/asm/parse.go
+++ b/src/cmd/asm/internal/asm/parse.go
@@ -68,7 +68,7 @@ func NewParser(ctxt *obj.Link, ar *arch.Arch, lexer lex.TokenReader) *Parser {
                labels:      make(map[string]*obj.Prog),
                dataAddr:    make(map[string]int64),
                errorWriter: os.Stderr,
-               allowABI:    ctxt != nil && objabi.LookupPkgSpecial(ctxt.Pkgpath).AllowAsmABI,
+               allowABI:    true,
                pkgPrefix:   pkgPrefix,
        }
 }

This test a 2×4 configuration space:

Alignment	LEA in body	LEA in header
0	`Sum0`	`Sum1`
64	`Sum2`	`Sum3`
32	`Sum4`	`Sum5`
16	`Sum6`	`Sum7`

Here are results for 10 interleaved runs:

goos: linux
goarch: amd64
pkg: aaa
cpu: AMD Ryzen 5 3600 6-Core Processor              
        │   result    │
        │   sec/op    │
Sum0-12   9.583n ± 1%
Sum1-12   8.495n ± 1%
Sum2-12   9.574n ± 1%
Sum3-12   8.753n ± 1%
Sum4-12   9.533n ± 1%
Sum5-12   8.718n ± 1%
Sum6-12   9.544n ± 1%
Sum7-12   8.630n ± 1%
geomean   9.092n

Alignment	LEA in body	LEA in header
0	9.5	8.4
64	9.5	8.7
32	9.5	8.7
16	9.5	8.6

If anything my conclusion is that for this exact function, having the LEA in the body is in fact significantly slower.

Comment From: bupjae

I think this can explain why "random LEA" could slow down that benchmark.

According to Intel's Optimization Reference Manual (brought from same source mentioned on #21735 )

For LEA instructions with three source operands and some specific situations, instruction latency has increased to 3 cycles, and must dispatch via port 1: - LEA that has all three source operands: base, index, and offset - LEA that uses base and index registers where the base is EBP, RBP,or R13 - LEA that uses RIP relative addressing mode - LEA that uses 16-bit addressing mode

That LEA is indeed third case of "slow LEA" that could impact performance in tight loop.