Skip to content

Commit 50255fe

Browse files
Larhzu, JiaT75
authored and committed
liblzma: RISC-V filter: Use byte-by-byte access.
Not all RISC-V processors support fast unaligned access so it's better to read only one byte in the main loop. This can be faster even on x86-64 when compared to reading 32 bits at a time as half the time the address is only 16-bit aligned. The downside is larger code size on archs that do support fast unaligned access.
1 parent db5eb5f commit 50255fe

File tree

1 file changed

+84
-30
lines changed

1 file changed

+84
-30
lines changed

src/liblzma/simple/riscv.c

+84 -30
Original file line number | Diff line number | Diff line change
@@ -370,28 +370,59 @@ riscv_encode(void *simple lzma_attribute((__unused__)),
370370
// The loop is advanced by 2 bytes every iteration since the
371371
// instruction stream may include 16-bit instructions (C extension).
372372
for (i = 0; i <= size; i += 2) {
373-
uint32_t inst = read32le(buffer + i);
373+
uint32_t inst = buffer[i];
374+
375+
if (inst == 0xEF) {
376+
// JAL
377+
const uint32_t b1 = buffer[i + 1];
378+
379+
// Only filter rd=x1(ra) and rd=x5(t0).
380+
if ((b1 & 0x0D) != 0)
381+
continue;
374382

375-
if ((inst & 0xDFF) == 0x0EF) {
376-
// JAL with rd=x1(ra) or rd=x5(t0)
377-
//
378383
// The 20-bit immediate is in four pieces.
379384
// The encoder stores it in big endian form
380385
// since it improves compression slightly.
381-
uint32_t addr
382-
= ((inst & 0x80000000) >> 11)
383-
| ((inst & 0x7FE00000) >> 20)
384-
| ((inst & 0x00100000) >> 9)
385-
| (inst & 0x000FF000);
386+
const uint32_t b2 = buffer[i + 2];
387+
const uint32_t b3 = buffer[i + 3];
388+
const uint32_t pc = now_pos + (uint32_t)i;
389+
390+
// The following chart shows the highest three bytes of JAL, focusing on
391+
// the 20-bit immediate field [31:12]. The first row of numbers is the
392+
// bit position in a 32-bit little endian instruction. The second row of
393+
// numbers shows the order of the immediate field in a J-type instruction.
394+
// The last row is the bit number in each byte.
395+
//
396+
// To determine the amount to shift each bit, subtract the value in
397+
// the last row from the value in the second last row. If the number
398+
// is positive, shift left. If negative, shift right.
399+
//
400+
// For example, at the rightmost side of the chart, the bit 4 in b1 is
401+
// the bit 12 of the address. Thus that bit needs to be shifted left
402+
// by 12 - 4 = 8 bits to put it in the right place in the addr variable.
403+
//
404+
// NOTE: The immediate of a J-type instruction holds bits [20:1] of
405+
// the address. The bit [0] is always 0 and not part of the immediate.
406+
//
407+
// | b3 | b2 | b1 |
408+
// | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
409+
// | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x |
410+
// | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x |
386411

387-
addr += now_pos + (uint32_t)i;
412+
uint32_t addr = ((b1 & 0xF0) << 8)
413+
| ((b2 & 0x0F) << 16)
414+
| ((b2 & 0x10) << 7)
415+
| ((b2 & 0xE0) >> 4)
416+
| ((b3 & 0x7F) << 4)
417+
| ((b3 & 0x80) << 13);
388418

389-
inst = (inst & 0xFFF)
390-
| ((addr & 0x1E0000) >> 5)
391-
| ((addr & 0x01FE00) << 7)
392-
| ((addr & 0x0001FE) << 23);
419+
addr += pc;
393420

394-
write32le(buffer + i, inst);
421+
buffer[i + 1] = (uint8_t)((b1 & 0x0F)
422+
| ((addr >> 13) & 0xF0));
423+
424+
buffer[i + 2] = (uint8_t)(addr >> 9);
425+
buffer[i + 3] = (uint8_t)(addr >> 1);
395426

396427
// The "-2" is included because the for-loop will
397428
// always increment by 2. In this case, we want to
@@ -401,7 +432,10 @@ riscv_encode(void *simple lzma_attribute((__unused__)),
401432

402433
} else if ((inst & 0x7F) == 0x17) {
403434
// AUIPC
404-
//
435+
inst |= (uint32_t)buffer[i + 1] << 8;
436+
inst |= (uint32_t)buffer[i + 2] << 16;
437+
inst |= (uint32_t)buffer[i + 3] << 24;
438+
405439
// Branch based on AUIPC's rd. The bitmask test does
406440
// the same thing as this:
407441
//
@@ -587,30 +621,50 @@ riscv_decode(void *simple lzma_attribute((__unused__)),
587621

588622
size_t i;
589623
for (i = 0; i <= size; i += 2) {
590-
uint32_t inst = read32le(buffer + i);
624+
uint32_t inst = buffer[i];
591625

592-
if ((inst & 0xDFF) == 0x0EF) {
593-
// JAL with rd=x1(ra) or rd=x5(t0)
594-
uint32_t addr
595-
= ((inst << 5) & 0x1E0000)
596-
| ((inst >> 7) & 0x01FE00)
597-
| ((inst >> 23) & 0x0001FE);
626+
if (inst == 0xEF) {
627+
// JAL
628+
const uint32_t b1 = buffer[i + 1];
598629

599-
addr -= now_pos + (uint32_t)i;
630+
// Only filter rd=x1(ra) and rd=x5(t0).
631+
if ((b1 & 0x0D) != 0)
632+
continue;
600633

601-
inst = (inst & 0xFFF)
602-
| ((addr << 11) & 0x80000000)
603-
| ((addr << 20) & 0x7FE00000)
604-
| ((addr << 9) & 0x00100000)
605-
| ( addr & 0x000FF000);
634+
const uint32_t b2 = buffer[i + 2];
635+
const uint32_t b3 = buffer[i + 3];
636+
const uint32_t pc = now_pos + (uint32_t)i;
637+
638+
// | b3 | b2 | b1 |
639+
// | 31 30 29 28 27 26 25 24 | 23 22 21 20 19 18 17 16 | 15 14 13 12 x x x x |
640+
// | 20 10 9 8 7 6 5 4 | 3 2 1 11 19 18 17 16 | 15 14 13 12 x x x x |
641+
// | 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | 7 6 5 4 x x x x |
642+
643+
uint32_t addr = ((b1 & 0xF0) << 13)
644+
| (b2 << 9) | (b3 << 1);
645+
646+
addr -= pc;
647+
648+
buffer[i + 1] = (uint8_t)((b1 & 0x0F)
649+
| ((addr >> 8) & 0xF0));
650+
651+
buffer[i + 2] = (uint8_t)(((addr >> 16) & 0x0F)
652+
| ((addr >> 7) & 0x10)
653+
| ((addr << 4) & 0xE0));
654+
655+
buffer[i + 3] = (uint8_t)(((addr >> 4) & 0x7F)
656+
| ((addr >> 13) & 0x80));
606657

607-
write32le(buffer + i, inst);
608658
i += 4 - 2;
609659

610660
} else if ((inst & 0x7F) == 0x17) {
611661
// AUIPC
612662
uint32_t inst2;
613663

664+
inst |= (uint32_t)buffer[i + 1] << 8;
665+
inst |= (uint32_t)buffer[i + 2] << 16;
666+
inst |= (uint32_t)buffer[i + 3] << 24;
667+
614668
if (inst & 0xE80) {
615669
// AUIPC's rd doesn't equal x0 or x2.
616670

0 commit comments

Comments
 (0)