Go version
go version go1.24rc2 linux/amd64
Output of go env
in your module/workspace:
AR='ar'
CC='gcc'
CGO_CFLAGS='-O2 -g'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_ENABLED='1'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-O2 -g'
CXX='g++'
GCCGO='gccgo'
GO111MODULE=''
GOAMD64='v1'
GOARCH='amd64'
GOAUTH='netrc'
GOBIN=''
GOCACHE='/home/artem/.cache/go-build'
GOCACHEPROG=''
GODEBUG=''
GOENV='/home/artem/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFIPS140='off'
GOFLAGS=''
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build211132318=/tmp/go-build -gno-record-gcc-switches'
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMOD='/home/artem/dev/mono/go.mod'
GOMODCACHE='/home/artem/go/pkg/mod'
GOOS='linux'
GOPATH='/home/artem/go'
GOPROXY='http://gomodproxy:1111/'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTELEMETRY='off'
GOTELEMETRYDIR='/home/artem/.config/go/telemetry'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.24rc2'
GOWORK=''
PKG_CONFIG='pkg-config'
What did you do?
Compile the following code:
package main
//go:noinline
func Sum0(xs []byte, mask uint64) (n int, found bool) {
var sum uint64
for i, x := range xs {
sum = (sum << 1) + table[x]
if (sum & mask) == 0 {
return i, true
}
}
return 0, false
}
//go:noinline
func Sum1(xs []byte, mask uint64) (n int, found bool) {
t := table[:]
var sum uint64
for i, x := range xs {
sum = (sum << 1) + t[x]
if (sum & mask) == 0 {
return i, true
}
}
return 0, false
}
func main() {
xs := []byte{0, 1, 2, 3, 4, 5, 6, 7}
_, _ = Sum0(xs, ^uint64(0xff))
_, _ = Sum1(xs, ^uint64(0xff))
}
var table = [256]uint64{
0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
}
What did you see happen?
The second function loads the global var table
into a local var before the loop, and never changes it. I'd expect that load to happen once in the generated code, yet Sum0()
and Sum1()
compile to the same code and load table
in every loop iteration:
0000000000468f20 <main.Sum0>:
468f20: 48 89 44 24 08 mov %rax,0x8(%rsp)
468f25: 31 c9 xor %ecx,%ecx
468f27: 31 d2 xor %edx,%edx
468f29: eb 03 jmp 468f2e <main.Sum0+0xe>
468f2b: 48 ff c1 inc %rcx
468f2e: 48 39 cb cmp %rcx,%rbx
468f31: 7e 21 jle 468f54 <main.Sum0+0x34>
468f33: 0f b6 34 08 movzbl (%rax,%rcx,1),%esi
468f37: 4c 8d 05 82 9c 08 00 lea 0x89c82(%rip),%r8 # 4f2bc0 <main.table>
468f3e: 49 8b 34 f0 mov (%r8,%rsi,8),%rsi
468f42: 48 8d 14 56 lea (%rsi,%rdx,2),%rdx
468f46: 48 85 d7 test %rdx,%rdi
468f49: 75 e0 jne 468f2b <main.Sum0+0xb>
468f4b: 48 89 c8 mov %rcx,%rax
468f4e: bb 01 00 00 00 mov $0x1,%ebx
468f53: c3 ret
468f54: 31 c0 xor %eax,%eax
468f56: 31 db xor %ebx,%ebx
468f58: c3 ret
0000000000468f60 <main.Sum1>:
468f60: 48 89 44 24 08 mov %rax,0x8(%rsp)
468f65: 31 c9 xor %ecx,%ecx
468f67: 31 d2 xor %edx,%edx
468f69: eb 03 jmp 468f6e <main.Sum1+0xe>
468f6b: 48 ff c1 inc %rcx
468f6e: 48 39 cb cmp %rcx,%rbx
468f71: 7e 21 jle 468f94 <main.Sum1+0x34>
468f73: 0f b6 34 08 movzbl (%rax,%rcx,1),%esi
468f77: 4c 8d 05 42 9c 08 00 lea 0x89c42(%rip),%r8 # 4f2bc0 <main.table> <-- !!!
468f7e: 49 8b 34 f0 mov (%r8,%rsi,8),%rsi
468f82: 48 8d 14 56 lea (%rsi,%rdx,2),%rdx
468f86: 48 85 d7 test %rdx,%rdi
468f89: 75 e0 jne 468f6b <main.Sum1+0xb>
468f8b: 48 89 c8 mov %rcx,%rax
468f8e: bb 01 00 00 00 mov $0x1,%ebx
468f93: c3 ret
468f94: 31 c0 xor %eax,%eax
468f96: 31 db xor %ebx,%ebx
468f98: c3 ret
What did you expect to see?
Do not move constant expressions into a loop, evaluate them only once.
Even in the case of Sum0()
repeated reloads of table
look dubious to me. Another goroutine may change table
concurrently, but in the absence of locking in Sum0()
I'd argue that hoisting the load out of the loop is as racy as reloading it every time. Moreover, table
is private to the package so the compiler can know that no code in the package writes to it, nor takes its address, so table
is a constant.
Comment From: gabyhelp
Related Issues
- cmd/compile: unneccesary 0 check in division #45928 (closed)
- cmd/compile: double zeroing and unnecessary copying/stack use #67957 (closed)
- cmd/compile: loads combining regression #18946 (closed)
- cmd/compile: Use wide integer load/store instructions if possible #11819 (closed)
- cmd/compile: missing BCE on slice in loop #15203 (closed)
- Signed non-negative divide by power of 2. #44530 (closed)
- cmd/compile: range over rune literal causes internal compiler error #64471 (closed)
- cmd/compile: incorrect optimizations. #30339 (closed)
- Why slice not painc #42069 (closed)
- cmd/compile: stringtoslicebytetmp optimization on unescaped slice #38501 (closed)
(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in this discussion.)
Comment From: randall77
I'd expect that load to happen once in the generated code
That's not a load, it is just an address calculation.
The compiler doesn't try to keep those outside the loop currently. We could, I guess, but there's a tension there between computing the address just once, and using an additional register to hold it.
The worries about raciness don't apply here, as table[:]
is just an address calculation, not a load. If table
was defined as a slice instead of an array, then the compiler needs to be more careful (and it is).
Comment From: randall77
Do you have a benchmark that demonstrates that the LEA
inside the loop makes things slower?
Comment From: artem-anisimov-0x7f
Here is the C + asm version of the same loop. sum3()
has lea
outside the loop, sum4()
has lea
inside the loop. The benchmark calculates sum3()
and sum4()
16 times over an array of 128M bytes. On a Ryzen 3950x, I get the following results:
$ gcc -o test -O2 -fPIE test.c
$ ./test
sum3: 1075.885ms
sum4: 1187.926ms
This is a 10.4% more time because of a lea
inside the loop.
The code for the benchmark:
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <err.h>
static uint64_t table[];
uint64_t sum0(uint8_t *xs, uint64_t len, uint64_t mask)
{
uint64_t sum = 0;
for (uint64_t i = 0; i < len; ++i) {
sum = (sum << 1) + table[xs[i]];
if ((sum & mask) == 0)
return i + 1;
}
return ~0;
}
uint64_t __attribute__((naked)) sum3(uint8_t *xs, uint64_t len, uint64_t mask)
{
__asm__ __volatile__ (
"xor %%rax, %%rax \n\t"
"xor %%rcx, %%rcx \n\t"
"lea table(%%rip), %%r9 \n\t"
"jmp sum3start \n\t"
"nop \n\t"
"sum3loop: \n\t"
"inc %%rax \n\t"
"sum3start: \n\t"
"cmp %%rax, %%rsi \n\t"
"je sum3notfound \n\t"
"movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
"mov (%%r9, %%r8, 8), %%r8 \n\t"
"lea (%%r8, %%rcx, 2), %%rcx \n\t"
"test %%rdx, %%rcx \n\t"
"jne sum3loop \n\t"
"inc %%rax \n\t"
"ret \n\t"
"sum3notfound: \n\t"
"mov $0xffffffffffffffff, %%rax \n\t"
"ret"
:
:
:
);
}
uint64_t __attribute__((naked)) sum4(uint8_t *xs, uint64_t len, uint64_t mask)
{
__asm__ __volatile__ (
"xor %%rax, %%rax \n\t"
"xor %%rcx, %%rcx \n\t"
"jmp sum4start \n\t"
".byte 0x0f, 0x1f, 0x84, 0x00 \n\t"
".byte 0x00, 0x00, 0x00, 0x00 \n\t"
"sum4loop: \n\t"
"inc %%rax \n\t"
"sum4start: \n\t"
"cmp %%rax, %%rsi \n\t"
"je sum4notfound \n\t"
"movzbl (%%rdi, %%rax, 1), %%r8d \n\t"
"lea table(%%rip), %%r9 \n\t"
"mov (%%r9, %%r8, 8), %%r8 \n\t"
"lea (%%r8, %%rcx, 2), %%rcx \n\t"
"test %%rdx, %%rcx \n\t"
"jne sum4loop \n\t"
"inc %%rax \n\t"
"ret \n\t"
"sum4notfound: \n\t"
"mov $0xffffffffffffffff, %%rax \n\t"
"ret"
:
:
:
);
}
static uint8_t* generate(uint64_t len)
{
uint8_t *xs = malloc(len);
xs[0] = 0;
uint8_t *x = &xs[1];
uint64_t ctr = 0;
for (uint64_t i = 1; i < len; ++i) {
ctr = ctr * 6364136223846793005 + 1442695040888963407;
*x++ = ctr;
}
return xs;
}
static void compare(uint8_t *xs, uint64_t len, uint64_t mask)
{
while (len > 0) {
uint64_t a0 = sum0(xs, len, mask),
a3 = sum3(xs, len, mask),
a4 = sum4(xs, len, mask);
if (a0 != a3)
errx(1, "sum0() and sum3() disagree");
if (a0 != a4)
errx(1, "sum0() and sum4() disagree");
if (a0 != -1ULL) {
xs += a0;
len -= a0;
} else {
break;
}
}
}
static void measure(const char *name, uint64_t (*fn)(uint8_t *xs, uint64_t len, uint64_t mask), uint8_t *xs, uint64_t len)
{
struct timespec begin, end;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &begin);
for (int i = 0; i < 16; ++i)
(void) fn(xs, len, -1ULL);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
long long sec = end.tv_sec - begin.tv_sec;
long long nsec = end.tv_nsec - begin.tv_nsec;
long long delta = (sec * 1000 * 1000) + (nsec / 1000);
printf("%s: %.3lfms\n", name, delta / 1000.0);
}
int main()
{
const uint64_t len = 128 * 1024 * 1024;
const uint64_t mask = 0x7fULL;
uint8_t *xs = generate(len);
compare(xs, len, mask);
measure("sum3", &sum3, xs, len);
measure("sum4", &sum4, xs, len);
return 0;
}
static uint64_t table[] = {
0xe80e8d55032474b3, 0x11b25b61f5924e15, 0x03aa5bd82a9eb669, 0xc45a153ef107a38c, 0xeac874b86f0f57b9, 0xa5ccedec95ec79c7, 0xe15a3320ad42ac0a, 0x5ed3583fa63cec15,
0xcd497bf624a4451d, 0xf9ade5b059683605, 0x773940c03fb11ca1, 0xa36b16e4a6ae15b2, 0x67afd1adb5a89eac, 0xc44c75ee32f0038e, 0x2101790f365c0967, 0x76415c64a222fc4a,
0x579929249a1e577a, 0xe4762fc41fdbf750, 0xea52198e57dfcdcc, 0xe2535aafe30b4281, 0xcb1a1bd6c77c9056, 0x5a1aa9bfc4612a62, 0x15a728aef8943eb5, 0x2f8f09738a8ec8d9,
0x200f3dec9fac8074, 0x0fa9a7b1e0d318df, 0x06c0804ffd0d8e3a, 0x630cbc412669dd25, 0x10e34f85f4b10285, 0x2a6fe8164b9b6410, 0xcacb57d857d55810, 0x77f8a3a36ff11b46,
0x66af517e0dc3003e, 0x76c073c789b4009a, 0x853230dbb529f22a, 0x1e9e9c09a1f77e56, 0x1e871223802ee65d, 0x37fe4588718ff813, 0x10088539f30db464, 0x366f7470b80b72d1,
0x33f2634d9a6b31db, 0xd43917751d69ea18, 0xa0f492bc1aa7b8de, 0x3f94e5a8054edd20, 0xedfd6e25eb8b1dbf, 0x759517a54f196a56, 0xe81d5006ec7b6b17, 0x8dd8385fa894a6b7,
0x45f4d5467b0d6f91, 0xa1f894699de22bc8, 0x33829d09ef93e0fe, 0x3e29e250caed603c, 0xf7382cba7f63a45e, 0x970f95412bb569d1, 0xc7fcea456d356b4b, 0x723042513f3e7a57,
0x17ae7688de3596f1, 0x27ac1fcd7cd23c1a, 0xf429beeb78b3f71f, 0xd0780692fb93a3f9, 0x9f507e28a7c9842f, 0x56001ad536e433ae, 0x7e1dd1ecf58be306, 0x15fee353aa233fc6,
0xb033a0730b7638e8, 0xeb593ad6bd2406d1, 0x7c86502574d0f133, 0xce3b008d4ccb4be7, 0xf8566e3d383594c8, 0xb2c261e9b7af4429, 0xf685e7e253799dbb, 0x05d33ed60a494cbc,
0xeaf88d55a4cb0d1a, 0x3ee9368a902415a1, 0x8980fe6a8493a9a4, 0x358ed008cb448631, 0xd0cb7e37b46824b8, 0xe9bc375c0bc94f84, 0xea0bf1d8e6b55bb3, 0xb66a60d0f9f6f297,
0x66db2cc4807b3758, 0x7e4e014afbca8b4d, 0xa5686a4938b0c730, 0xa5f0d7353d623316, 0x26e38c349242d5e8, 0xeeefa80a29858e30, 0x8915cb912aa67386, 0x4b957a47bfc420d4,
0xbb53d051a895f7e1, 0x09f5e3235f6911ce, 0x416b98e695cfb7ce, 0x97a08183344c5c86, 0xbf68e0791839a861, 0xea05dde59ed3ed56, 0x0ca732280beda160, 0xac748ed62fe7f4e2,
0xc686da075cf6e151, 0xe1ba5658f4af05c8, 0xe9ff09fbeb67cc35, 0xafaea9470323b28d, 0x0291e8db5bb0ac2a, 0x342072a9bbee77ae, 0x03147eed6b3d0a9c, 0x21379d4de31dbadb,
0x2388d965226fb986, 0x52c96988bfebabfa, 0xa6fc29896595bc2d, 0x38fa4af70aa46b8b, 0xa688dd13939421ee, 0x99d5275d9b1415da, 0x453d31bb4fe73631, 0xde51debc1fbe3356,
0x75a3c847a06c622f, 0xe80e32755d272579, 0x5444052250d8ec0d, 0x8f17dfda19580a3b, 0xf6b3e9363a185e42, 0x7a42adec6868732f, 0x32cb6a07629203a2, 0x1eca8957defe56d9,
0x9fa85e4bc78ff9ed, 0x20ff07224a499ca7, 0x3fa6295ff9682c70, 0xe3d5b1e3ce993eff, 0xa341209362e0b79a, 0x64bd9eae5712ffe8, 0xceebb537babbd12a, 0x5586ef404315954f,
0x46c3085c938ab51a, 0xa82ccb9199907cee, 0x8c51b6690a3523c8, 0xc4dbd4c9ae518332, 0x979898dbb23db7b2, 0x1b5b585e6f672a9d, 0xce284da7c4903810, 0x841166e8bb5f1c4f,
0xb7d884a3fceca7d0, 0xa76468f5a4572374, 0xc10c45f49ee9513d, 0x68f9a5663c1908c9, 0x0095a13476a6339d, 0xd1d7516ffbe9c679, 0xfd94ab0c9726f938, 0x627468bbdb27c959,
0xedc3f8988e4a8c9a, 0x58efd33f0dfaa499, 0x21e37d7e2ef4ac8b, 0x297f9ab5586259c6, 0xda3ba4dc6cb9617d, 0xae11d8d9de2284d2, 0xcfeed88cb3729865, 0xefc2f9e4f03e2633,
0x8226393e8f0855a4, 0xd6e25fd7acf3a767, 0x435784c3bfd6d14a, 0xf97142e6343fe757, 0xd73b9fe826352f85, 0x6c3ac444b5b2bd76, 0xd8e88f3e9fd4a3fd, 0x31e50875c36f3460,
0xa824f1bf88cf4d44, 0x54a4d2c8f5f25899, 0xbff254637ce3b1e6, 0xa02cfe92561b3caa, 0x7bedb4edee9f0af7, 0x879c0620ac49a102, 0xa12c4ccd23b332e7, 0x09a5ff47bf94ed1e,
0x7b62f43cd3046fa0, 0xaa3af0476b9c2fb9, 0x22e55301abebba8e, 0x3a6035c42747bd58, 0x1705373106c8ec07, 0xb1f660de828d0628, 0x065fe82d89ca563d, 0xf555c2d8074d516d,
0x6bb6c186b423ee99, 0x54a807be6f3120a8, 0x8a3c7fe2f88860b8, 0xbeffc344f5118e81, 0xd686e80b7d1bd268, 0x661aef4ef5e5e88b, 0x5bf256c654cd1dda, 0x9adb1ab85d7640f4,
0x68449238920833a2, 0x843279f4cebcb044, 0xc8710cdefa93f7bb, 0x236943294538f3e6, 0x80d7d136c486d0b4, 0x61653956b28851d3, 0x3f843be9a9a956b5, 0xf73cfbbf137987e5,
0xcf0cb6dee8ceac2c, 0x50c401f52f185cae, 0xbdbe89ce735c4c1c, 0xeef3ade9c0570bc7, 0xbe8b066f8f64cbf6, 0x5238d6131705dcb9, 0x20219086c950e9f6, 0x634468d9ed74de02,
0x0aba4b3d705c7fa5, 0x3374416f725a6672, 0xe7378bdf7beb3bc6, 0x0f7b6a1b1cee565b, 0x234e4c41b0c33e64, 0x4efa9a0c3f21fe28, 0x1167fc551643e514, 0x9f81a69d3eb01fa4,
0xdb75c22b12306ed0, 0xe25055d738fc9686, 0x9f9f167a3f8507bb, 0x195f8336d3fbe4d3, 0x8442b6feffdcb6f6, 0x1e07ed24746ffde9, 0x140e31462d555266, 0x8bd0ce515ae1406e,
0x2c0be0042b5584b3, 0x35a23d0e15d45a60, 0xc14f1ba147d9bc83, 0xbbf168691264b23f, 0xad2cc7b57e589ade, 0x9501963154c7815c, 0x9664afa6b8d67d47, 0x7f9e5101fea0a81c,
0x45ecffb610d25bfd, 0x3157f7aecf9b6ab3, 0xc43ca6f88d87501d, 0x9576ff838dee38dc, 0x93f21afe0ce1c7d7, 0xceac699df343d8f9, 0x2fec49e29f03398d, 0x8805ccd5730281ed,
0xf9fc16fc750a8e59, 0x35308cc771adf736, 0x4a57b7c9ee2b7def, 0x03a4c6cdc937a02a, 0x6c9a8a269fc8c4fc, 0x4681decec7a03f43, 0x342eecded1353ef9, 0x8be0552d8413a867,
0xc7b4ac51beda8be8, 0xebcc64fb719842c0, 0xde8e4c7fb6d40c1c, 0xcc8263b62f9738b1, 0xd3cfc0f86511929a, 0x466024ce8bb226ea, 0x459ff690253a3c18, 0x98b27e9d91284c9c,
0x75c3ae8aa3af373d, 0xfbf8f8e79a866ffc, 0x32327f59d0662799, 0x8228b57e729e9830, 0x065ceb7a18381b58, 0xd2177671a31dc5ff, 0x90cd801f2f8701f9, 0x9d714428471c65fe,
};
There is one difference. movzbl
is one byte longer in my version because it moves into %r8d
instead of %esi
, but I think that should not affect the result.
Comment From: artem-anisimov-0x7f
There is another interesting thing. Consider the following code. It has the same loop as Sum0()
in my initial example, but it convinces the register allocator to insert a move that could be avoided (I have minimised it from a real example to keep the move):
type S struct {
minSize uint
maxSize uint
normSize uint
maskS uint64
maskL uint64
scanned uint
sum uint64
}
//go:noinline
func (s *S) Sum(xs []byte) (uint, bool) {
l := uint(len(xs))
sum := s.sum
var i uint
if s.scanned < s.minSize {
i = s.minSize - s.scanned
}
n := min(l, s.normSize-s.scanned)
mask := s.maskS
for ; i < n; i++ {
sum = (sum << 1) + table[xs[i]]
if (sum & mask) == 0 {
return i + 1, true
}
}
if i+s.scanned == s.maxSize {
return s.maxSize, true
}
return l, false
}
The loop compiles to the following code:
468fd3: 48 ff c6 inc %rsi
468fd6: 49 89 f8 mov %rdi,%r8
468fd9: 48 39 ce cmp %rcx,%rsi
468fdc: 73 23 jae 469001 <main.(*S).Sum+0x61>
468fde: 0f b6 3c 33 movzbl (%rbx,%rsi,1),%edi
468fe2: 4c 8d 1d d7 9b 08 00 lea 0x89bd7(%rip),%r11 # 4f2bc0 <main.table>
468fe9: 49 8b 3c fb mov (%r11,%rdi,8),%rdi
468fed: 4a 8d 3c 47 lea (%rdi,%r8,2),%rdi
468ff1: 49 85 f9 test %rdi,%r9
468ff4: 75 dd jne 468fd3 <main.(*S).Sum+0x33>
mov %rdi, %r8
is only needed because of movzbl
at 468fde
and mov
at 468fe9
. Those two instructions could use %r8
instead of %rdi
.
The C + asm code for this version of the loop (sum1()
has lea
outside the loop and sum2()
has lea
inside the loop):
uint64_t __attribute__((naked)) sum1(uint8_t *xs, uint64_t len, uint64_t mask)
{
__asm__ __volatile__ (
"xor %%rax, %%rax \n\t"
"xor %%rcx, %%rcx \n\t"
"lea table(%%rip), %%r9 \n\t"
"jmp sum1start \n\t"
"nop \n\t"
"sum1loop: \n\t"
"inc %%rax \n\t"
"sum1start: \n\t"
"mov %%rcx, %%r10 \n\t"
"cmp %%rax, %%rsi \n\t"
"je sum1notfound \n\t"
"movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
"mov (%%r9, %%rcx, 8), %%rcx \n\t"
"lea (%%rcx, %%r10, 2), %%rcx \n\t"
"test %%rdx, %%rcx \n\t"
"jne sum1loop \n\t"
"inc %%rax \n\t"
"ret \n\t"
"sum1notfound: \n\t"
"mov $0xffffffffffffffff, %%rax \n\t"
"ret"
:
:
:
);
}
uint64_t __attribute__((naked)) sum2(uint8_t *xs, uint64_t len, uint64_t mask)
{
__asm__ __volatile__ (
"xor %%rax, %%rax \n\t"
"xor %%rcx, %%rcx \n\t"
"jmp sum2start \n\t"
".byte 0x0f, 0x1f, 0x84, 0x00 \n\t"
".byte 0x00, 0x00, 0x00, 0x00 \n\t"
"sum2loop: \n\t"
"inc %%rax \n\t"
"sum2start: \n\t"
"mov %%rcx, %%r10 \n\t"
"cmp %%rax, %%rsi \n\t"
"je sum2notfound \n\t"
"movzbl (%%rdi, %%rax, 1), %%ecx \n\t"
"lea table(%%rip), %%r9 \n\t"
"mov (%%r9, %%rcx, 8), %%rcx \n\t"
"lea (%%rcx, %%r10, 2), %%rcx \n\t"
"test %%rdx, %%rcx \n\t"
"jne sum2loop \n\t"
"inc %%rax \n\t"
"ret \n\t"
"sum2notfound: \n\t"
"mov $0xffffffffffffffff, %%rax \n\t"
"ret"
:
:
:
);
}
The difference of the run time between the two is
$ gcc -o test -O2 -fPIE test.c
$ ./test
sum1: 1063.284ms
sum2: 1291.752ms
For this version of the loop, the difference between lea
inside and outside the loop is no longer 10.4% as in the previous comment, but 21.5%.
Comment From: Jorropo
I couldn't believe that a random LEA
would slow down this benchmark, it has no dependency and in real life you almost always have one free ALU or AGU unit to execute the LEA in parallel of other instructions ahead of when you need it.
So I proceeded to create a benchmark proving this is a loop alignment issue as this almost always explain weird performance differences.
Source code: https://go.dev/play/p/dwi-EzN2oTo
You also need to patch the assembler to use <ABIInternal>
:
diff --git a/src/cmd/asm/internal/asm/parse.go b/src/cmd/asm/internal/asm/parse.go
index 638f4e2fc4..0bc8d1e3ce 100644
--- a/src/cmd/asm/internal/asm/parse.go
+++ b/src/cmd/asm/internal/asm/parse.go
@@ -68,7 +68,7 @@ func NewParser(ctxt *obj.Link, ar *arch.Arch, lexer lex.TokenReader) *Parser {
labels: make(map[string]*obj.Prog),
dataAddr: make(map[string]int64),
errorWriter: os.Stderr,
- allowABI: ctxt != nil && objabi.LookupPkgSpecial(ctxt.Pkgpath).AllowAsmABI,
+ allowABI: true,
pkgPrefix: pkgPrefix,
}
}
This test a 2×4 configuration space:
Alignment | LEA in body | LEA in header |
---|---|---|
0 | Sum0 |
Sum1 |
64 | Sum2 |
Sum3 |
32 | Sum4 |
Sum5 |
16 | Sum6 |
Sum7 |
Here are results for 10 interleaved runs:
goos: linux
goarch: amd64
pkg: aaa
cpu: AMD Ryzen 5 3600 6-Core Processor
│ result │
│ sec/op │
Sum0-12 9.583n ± 1%
Sum1-12 8.495n ± 1%
Sum2-12 9.574n ± 1%
Sum3-12 8.753n ± 1%
Sum4-12 9.533n ± 1%
Sum5-12 8.718n ± 1%
Sum6-12 9.544n ± 1%
Sum7-12 8.630n ± 1%
geomean 9.092n
Alignment | LEA in body | LEA in header |
---|---|---|
0 | 9.5 | 8.4 |
64 | 9.5 | 8.7 |
32 | 9.5 | 8.7 |
16 | 9.5 | 8.6 |
If anything my conclusion is that for this exact function, having the LEA
in the body is in fact significantly slower.
Comment From: bupjae
I think this can explain why "random LEA
" could slow down that benchmark.
According to Intel's Optimization Reference Manual (brought from same source mentioned on #21735 )
For LEA instructions with three source operands and some specific situations, instruction latency has increased to 3 cycles, and must dispatch via port 1: - LEA that has all three source operands: base, index, and offset - LEA that uses base and index registers where the base is EBP, RBP,or R13 - LEA that uses RIP relative addressing mode - LEA that uses 16-bit addressing mode
That LEA
is indeed third case of "slow LEA
" that could impact performance in tight loop.