I tested it with 'gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-36)'.
My disassembly doesn't look similar to yours (I used `gcc -g -O2 -c lvm.s` and `objdump -j .text -S lvm.o`).
vmfetch();
vmdispatch (GET_OPCODE(i)) {
vmcase(OP_MOVE) {
setobjs2s(L, ra, RB(i));
1f50: 41 c1 ed 17 shr $0x17,%r13d
1f54: 49 c1 e5 04 shl $0x4,%r13
1f58: 4b 8b 04 2f mov (%r15,%r13,1),%rax
1f5c: 4b 8b 54 2f 08 mov 0x8(%r15,%r13,1),%rdx
1f61: 48 89 03 mov %rax,(%rbx)
1f64: 48 89 53 08 mov %rdx,0x8(%rbx)
1f68: 48 8b 75 28 mov 0x28(%rbp),%rsi
vmbreak;
1f6c: e9 6f f3 ff ff jmpq 12e0 <luaV_execute+0x40>
1f71: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
vmbreak;
}
the new one<---------------------------------------------------------------
vmfetch();
vmdispatch (GET_OPCODE(i)) {
vmcase(OP_MOVE) {
setobjs2s(L, ra, RB(i));
1f08: 41 c1 ed 17 shr $0x17,%r13d
1f0c: 49 c1 e5 04 shl $0x4,%r13
1f10: 4d 01 fd add %r15,%r13
vmbreak;
}
vmcase(OP_LOADK) {
TValue *rb = k + GETARG_Bx(i);
setobj2s(L, ra, rb);
1f13: 49 8b 45 00 mov 0x0(%r13),%rax
1f17: 48 89 03 mov %rax,(%rbx)
1f1a: 41 8b 45 08 mov 0x8(%r13),%eax
1f1e: 89 43 08 mov %eax,0x8(%rbx)
1f21: 48 8b 45 28 mov 0x28(%rbp),%rax
vmbreak;
1f25: e9 a6 f3 ff ff jmpq 12d0 <luaV_execute+0x40>
1f2a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
vmbreak;
}
----------------------------------------------------------------------------------
In the disassembly, the difference is:
1. `mov (%r15,%r13,1),%rax` vs `add %r15,%r13` and `mov 0x0(%r13),%rax`
2. The old setobj is load, load, store, store, and the new setobj is load, store, load, store.
3. The register size is different when assigning tt_: the old code copies it through the 64-bit `%rdx`, while the new code uses the 32-bit `%eax`.
But it seems that no matter how the compiler generates the code, the new setobj is always faster than the old setobj.
Maybe the compiler wants to hint at something?
By the way, can you tell me which `performance monitors` you used?