Skip to content

Commit 8081862

Browse files
benshi001cherrymui
authored and committed
cmd/internal/obj/arm64: simplify ADD and SUB
Currently "ADD $0x123456, Rs, Rd" loads the pre-stored constant 0x123456 from the constant pool and uses it for the addition, which costs 12 bytes in total. The same applies to SUB. This CL splits it into "ADD $0x123000, Rs, Rd" + "ADD $0x000456, Rd, Rd". Both 0x123000 and 0x000456 can be encoded directly in the instruction, so 4 bytes are saved. 1. The total size of pkg/android_arm64 decreases by about 0.3KB. 2. The go1 benchmarks show little regression (excluding noise). name old time/op new time/op delta BinaryTree17-4 15.9s ± 0% 15.9s ± 1% +0.10% (p=0.044 n=29+29) Fannkuch11-4 8.72s ± 0% 8.75s ± 0% +0.34% (p=0.000 n=30+24) FmtFprintfEmpty-4 173ns ± 0% 173ns ± 0% ~ (all equal) FmtFprintfString-4 368ns ± 0% 368ns ± 0% ~ (p=0.593 n=30+30) FmtFprintfInt-4 417ns ± 0% 417ns ± 0% ~ (all equal) FmtFprintfIntInt-4 673ns ± 0% 661ns ± 1% -1.70% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 805ns ± 0% 805ns ± 0% +0.10% (p=0.011 n=30+30) FmtFprintfFloat-4 1.09µs ± 0% 1.09µs ± 0% ~ (p=0.125 n=30+29) FmtManyArgs-4 2.68µs ± 0% 2.68µs ± 0% +0.07% (p=0.004 n=30+30) GobDecode-4 32.9ms ± 0% 33.2ms ± 1% +1.07% (p=0.000 n=29+29) GobEncode-4 29.5ms ± 0% 29.6ms ± 0% +0.26% (p=0.000 n=28+28) Gzip-4 1.38s ± 1% 1.35s ± 3% -1.94% (p=0.000 n=28+30) Gunzip-4 139ms ± 0% 139ms ± 0% +0.10% (p=0.000 n=28+29) HTTPClientServer-4 745µs ± 5% 742µs ± 3% ~ (p=0.405 n=28+29) JSONEncode-4 49.5ms ± 1% 49.9ms ± 0% +0.89% (p=0.000 n=30+30) JSONDecode-4 264ms ± 1% 264ms ± 0% +0.25% (p=0.001 n=30+30) Mandelbrot200-4 16.6ms ± 0% 16.6ms ± 0% ~ (p=0.507 n=29+29) GoParse-4 15.9ms ± 0% 16.0ms ± 1% +0.91% (p=0.002 n=23+30) RegexpMatchEasy0_32-4 379ns ± 0% 379ns ± 0% ~ (all equal) RegexpMatchEasy0_1K-4 1.31µs ± 0% 1.31µs ± 0% +0.09% (p=0.008 n=27+30) RegexpMatchEasy1_32-4 357ns ± 0% 358ns ± 0% +0.28% (p=0.000 n=28+29) RegexpMatchEasy1_1K-4 2.04µs ± 0% 2.04µs ± 0% ~ (p=0.850 n=30+30) RegexpMatchMedium_32-4 587ns ± 0% 589ns ± 0% +0.33% (p=0.000 n=30+30) RegexpMatchMedium_1K-4 162µs ± 0% 163µs ± 0% ~ (p=0.351 n=30+29) 
RegexpMatchHard_32-4 9.54µs ± 0% 9.60µs ± 0% +0.59% (p=0.000 n=28+30) RegexpMatchHard_1K-4 287µs ± 0% 287µs ± 0% +0.11% (p=0.000 n=26+29) Revcomp-4 2.50s ± 0% 2.50s ± 0% -0.13% (p=0.012 n=28+27) Template-4 312ms ± 1% 312ms ± 1% +0.20% (p=0.015 n=27+30) TimeParse-4 1.68µs ± 0% 1.68µs ± 0% -0.35% (p=0.000 n=30+30) TimeFormat-4 1.66µs ± 0% 1.64µs ± 0% -1.20% (p=0.000 n=25+29) [Geo mean] 246µs 246µs -0.00% name old speed new speed delta GobDecode-4 23.3MB/s ± 0% 23.1MB/s ± 1% -1.05% (p=0.000 n=29+29) GobEncode-4 26.0MB/s ± 0% 25.9MB/s ± 0% -0.25% (p=0.000 n=29+28) Gzip-4 14.1MB/s ± 1% 14.4MB/s ± 3% +1.94% (p=0.000 n=27+30) Gunzip-4 139MB/s ± 0% 139MB/s ± 0% -0.10% (p=0.000 n=28+29) JSONEncode-4 39.2MB/s ± 1% 38.9MB/s ± 0% -0.88% (p=0.000 n=30+30) JSONDecode-4 7.37MB/s ± 0% 7.35MB/s ± 0% -0.26% (p=0.001 n=30+30) GoParse-4 3.65MB/s ± 0% 3.62MB/s ± 1% -0.86% (p=0.001 n=23+30) RegexpMatchEasy0_32-4 84.3MB/s ± 0% 84.3MB/s ± 0% ~ (p=0.126 n=27+26) RegexpMatchEasy0_1K-4 784MB/s ± 0% 783MB/s ± 0% -0.10% (p=0.003 n=27+30) RegexpMatchEasy1_32-4 89.5MB/s ± 0% 89.3MB/s ± 0% -0.20% (p=0.000 n=27+29) RegexpMatchEasy1_1K-4 502MB/s ± 0% 502MB/s ± 0% ~ (p=0.858 n=30+28) RegexpMatchMedium_32-4 1.70MB/s ± 0% 1.70MB/s ± 0% -0.25% (p=0.000 n=30+30) RegexpMatchMedium_1K-4 6.30MB/s ± 0% 6.30MB/s ± 0% ~ (all equal) RegexpMatchHard_32-4 3.35MB/s ± 0% 3.33MB/s ± 0% -0.47% (p=0.000 n=30+30) RegexpMatchHard_1K-4 3.57MB/s ± 0% 3.56MB/s ± 0% -0.20% (p=0.000 n=27+30) Revcomp-4 102MB/s ± 0% 102MB/s ± 0% +0.14% (p=0.008 n=28+28) Template-4 6.23MB/s ± 0% 6.21MB/s ± 1% -0.21% (p=0.009 n=21+30) [Geo mean] 24.1MB/s 24.0MB/s -0.16% Change-Id: Ifcef3edb667540e2d86e586c23afcfbc2cf1340b Reviewed-on: https://go-review.googlesource.com/c/134536 Run-TryBot: Ben Shi <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Cherry Zhang <[email protected]>
1 parent d217004 commit 8081862

File tree

4 files changed

+46
-2
lines changed

4 files changed

+46
-2
lines changed

src/cmd/asm/internal/asm/testdata/arm64.s

+12
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,18 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
2525
ADD R1, R2, R3
2626
ADD R1, ZR, R3
2727
ADD $1, R2, R3
28+
ADD $0x000aaa, R2, R3 // ADD $2730, R2, R3 // 43a82a91
29+
ADD $0x000aaa, R2 // ADD $2730, R2 // 42a82a91
30+
ADD $0xaaa000, R2, R3 // ADD $11182080, R2, R3 // 43a86a91
31+
ADD $0xaaa000, R2 // ADD $11182080, R2 // 42a86a91
32+
ADD $0xaaaaaa, R2, R3 // ADD $11184810, R2, R3 // 43a82a9163a86a91
33+
ADD $0xaaaaaa, R2 // ADD $11184810, R2 // 42a82a9142a86a91
34+
SUB $0x000aaa, R2, R3 // SUB $2730, R2, R3 // 43a82ad1
35+
SUB $0x000aaa, R2 // SUB $2730, R2 // 42a82ad1
36+
SUB $0xaaa000, R2, R3 // SUB $11182080, R2, R3 // 43a86ad1
37+
SUB $0xaaa000, R2 // SUB $11182080, R2 // 42a86ad1
38+
SUB $0xaaaaaa, R2, R3 // SUB $11184810, R2, R3 // 43a82ad163a86ad1
39+
SUB $0xaaaaaa, R2 // SUB $11184810, R2 // 42a82ad142a86ad1
2840
ADD R1>>11, R2, R3
2941
ADD R1<<22, R2, R3
3042
ADD R1->33, R2, R3

src/cmd/internal/obj/arm64/a.out.go

+1
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ const (
411411
C_MBCON // could be C_MOVCON or C_BITCON
412412
C_MOVCON // generated by a 16-bit constant, optionally inverted and/or shifted by multiple of 16
413413
C_BITCON // bitfield and logical immediate masks
414+
C_ADDCON2 // 24-bit constant
414415
C_LCON // 32-bit constant
415416
C_VCON // 64-bit constant
416417
C_FCON // floating-point constant

src/cmd/internal/obj/arm64/anames7.go

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ var cnames7 = []string{
2727
"MBCON",
2828
"MOVCON",
2929
"BITCON",
30+
"ADDCON2",
3031
"LCON",
3132
"VCON",
3233
"FCON",

src/cmd/internal/obj/arm64/asm7.go

+32-2
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ var optab = []Optab{
192192
{AADD, C_BITCON, C_RSP, C_NONE, C_RSP, 62, 8, 0, 0, 0},
193193
{AADD, C_BITCON, C_NONE, C_NONE, C_RSP, 62, 8, 0, 0, 0},
194194
{ACMP, C_BITCON, C_RSP, C_NONE, C_NONE, 62, 8, 0, 0, 0},
195+
{AADD, C_ADDCON2, C_RSP, C_NONE, C_RSP, 48, 8, 0, 0, 0},
196+
{AADD, C_ADDCON2, C_NONE, C_NONE, C_RSP, 48, 8, 0, 0, 0},
195197
{AADD, C_VCON, C_RSP, C_NONE, C_RSP, 13, 8, 0, LFROM, 0},
196198
{AADD, C_VCON, C_NONE, C_NONE, C_RSP, 13, 8, 0, LFROM, 0},
197199
{ACMP, C_VCON, C_REG, C_NONE, C_NONE, 13, 8, 0, LFROM, 0},
@@ -1046,6 +1048,7 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) {
10461048
C_NOREG4K,
10471049
C_LOREG,
10481050
C_LACON,
1051+
C_ADDCON2,
10491052
C_LCON,
10501053
C_VCON:
10511054
if a.Name == obj.NAME_EXTERN {
@@ -1537,6 +1540,10 @@ func (c *ctxt7) aclass(a *obj.Addr) int {
15371540
return C_BITCON
15381541
}
15391542

1543+
if 0 <= v && v <= 0xffffff {
1544+
return C_ADDCON2
1545+
}
1546+
15401547
if uint64(v) == uint64(uint32(v)) || v == int64(int32(v)) {
15411548
return C_LCON
15421549
}
@@ -1595,7 +1602,12 @@ func (c *ctxt7) oplook(p *obj.Prog) *Optab {
15951602
}
15961603
a1 = int(p.From.Class)
15971604
if a1 == 0 {
1598-
a1 = c.aclass(&p.From) + 1
1605+
a0 := c.aclass(&p.From)
1606+
// do not break C_ADDCON2 when S bit is set
1607+
if (p.As == AADDS || p.As == AADDSW || p.As == ASUBS || p.As == ASUBSW) && a0 == C_ADDCON2 {
1608+
a0 = C_LCON
1609+
}
1610+
a1 = a0 + 1
15991611
p.From.Class = int8(a1)
16001612
}
16011613

@@ -1681,8 +1693,13 @@ func cmp(a int, b int) bool {
16811693
return true
16821694
}
16831695

1696+
case C_ADDCON2:
1697+
if b == C_ZCON || b == C_ADDCON || b == C_ADDCON0 {
1698+
return true
1699+
}
1700+
16841701
case C_LCON:
1685-
if b == C_ZCON || b == C_BITCON || b == C_ADDCON || b == C_ADDCON0 || b == C_ABCON || b == C_ABCON0 || b == C_MBCON || b == C_MOVCON {
1702+
if b == C_ZCON || b == C_BITCON || b == C_ADDCON || b == C_ADDCON0 || b == C_ABCON || b == C_ABCON0 || b == C_MBCON || b == C_MOVCON || b == C_ADDCON2 {
16861703
return true
16871704
}
16881705

@@ -3474,6 +3491,19 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
34743491
}
34753492
o1 |= 0x1c1<<21 | uint32(rs&31)<<16 | uint32(rb&31)<<5 | uint32(rt&31)
34763493

3494+
case 48: /* ADD $C_ADDCON2, Rm, Rd */
3495+
op := c.opirr(p, p.As)
3496+
if op&Sbit != 0 {
3497+
c.ctxt.Diag("can not break addition/subtraction when S bit is set", p)
3498+
}
3499+
rt := int(p.To.Reg)
3500+
r := int(p.Reg)
3501+
if r == 0 {
3502+
r = rt
3503+
}
3504+
o1 = c.oaddi(p, int32(op), int32(c.regoff(&p.From)) & 0x000fff, r, rt)
3505+
o2 = c.oaddi(p, int32(op), int32(c.regoff(&p.From)) & 0xfff000, rt, rt)
3506+
34773507
case 50: /* sys/sysl */
34783508
o1 = c.opirr(p, p.As)
34793509

0 commit comments

Comments
 (0)