JIT/AArch64: [macos][ZTS] Support fast path for tlv_get_addr (#7042)

* JIT/AArch64: [macos][ZTS] Support fast path for tlv_get_addr

Access to TLV(thread local variable) in macOS is in "dynamic" form and
function tlv_get_addr() is invoked to resolve the address. See the
example in [1].

Note there is one struct TLVDescriptor [2] for each TLV. The first
member holds the address of function tlv_get_addr(), and the other two
members, "key" and "offset", would be used inside tlv_get_addr().

The disassembly code for function tlv_get_addr() is shown in [3]. With
the value from system register, i.e. tpidrro_el0, together with "key"
and "offset", the TLV address can be obtained.

Note that the value from tpidrro_el0 varies between threads, so a
unique TLV address is resolved for each thread.

It's worth noting that the slow path, i.e. function
tlv_allocate_and_initialize_for_key(), is executed on the first access
to a TLV.

In this patch:
1. "_tsrm_ls_cache" is guaranteed to be accessed before any VM/JIT code
during the request startup, e.g. in init_executor(), therefore, slow
path can be avoided.

2. As TLVDescriptor is immutable and zend_jit_setup() executes once, we
get this structure in tsrm_get_ls_cache_tcb_offset(). Note the 'ldr'
instruction would be patched to 'add' by the linker.

3. Only fast path for tlv_get_addr() is implemented in macro
LOAD_TSRM_CACHE.

With this patch, all ~4k test cases can pass for ZTS+CALL in macOS on
Apple silicon.

[1] https://gist.github.com/shqking/4aab67e0105f7c1f2c549d57d5799f94
[2]
https://opensource.apple.com/source/dyld/dyld-195.6/src/threadLocalVariables.c.auto.html
[3] https://gist.github.com/shqking/329d7712c26bad49786ab0a544a4af43

Change-Id: I613e9c37e3ff2ecc3fab0f53f1e48a0246e12ee3
This commit is contained in:
Hao Sun 2021-05-31 14:27:00 +08:00 committed by GitHub
parent 805b391d99
commit 7c2a3a9400
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 36 additions and 0 deletions

View File

@ -744,10 +744,17 @@ TSRM_API size_t tsrm_get_ls_cache_tcb_offset(void)
#elif defined(__aarch64__)
size_t ret;
# ifdef __APPLE__
// Points to struct TLVDescriptor for _tsrm_ls_cache in macOS.
asm("adrp %0, #__tsrm_ls_cache@TLVPPAGE\n\t"
"ldr %0, [%0, #__tsrm_ls_cache@TLVPPAGEOFF]"
: "=r" (ret));
# else
asm("mov %0, xzr\n\t"
"add %0, %0, #:tprel_hi12:_tsrm_ls_cache, lsl #12\n\t"
"add %0, %0, #:tprel_lo12_nc:_tsrm_ls_cache"
: "=r" (ret));
# endif
return ret;
#else
return 0;

View File

@ -4055,8 +4055,12 @@ ZEND_EXT_API void zend_jit_unprotect(void)
if (!(JIT_G(debug) & (ZEND_JIT_DEBUG_GDB|ZEND_JIT_DEBUG_PERF_DUMP))) {
int opts = PROT_READ | PROT_WRITE;
#ifdef ZTS
/* TODO: EXEC+WRITE is not supported in macOS. Removing EXEC is still buggy as
* other threads, which are executing the JITed code, would crash anyway. */
# ifndef __APPLE__
/* Another thread may be executing JITed code. */
opts |= PROT_EXEC;
# endif
#endif
if (mprotect(dasm_buf, dasm_size, opts) != 0) {
fprintf(stderr, "mprotect() failed [%d] %s\n", errno, strerror(errno));

View File

@ -184,6 +184,14 @@ const char* zend_reg_name[] = {
#if ZTS
static size_t tsrm_ls_cache_tcb_offset = 0;
# ifdef __APPLE__
/* Local mirror of the per-variable descriptor used for thread-local
 * variables (TLV) on macOS. The JIT reads `key` and `offset` from it in
 * order to open-code the address lookup that the thunk would perform. */
typedef struct TLVDescriptor TLVDescriptor;

struct TLVDescriptor {
	/* Resolver function stored in the first slot of the descriptor
	 * (tlv_get_addr() on macOS); it takes the descriptor itself. */
	void *(*thunk)(TLVDescriptor *);
	/* Key consumed by the resolver to locate the variable's storage. */
	uint64_t key;
	/* Offset consumed by the resolver to locate the variable's storage. */
	uint64_t offset;
};
# endif
#endif
/* By default avoid JITing inline handlers if it does not seem profitable due to lack of
@ -483,10 +491,27 @@ static int logical_immediate_p (uint64_t value, uint32_t reg_size)
|| }
|.endmacro
// Safe memory load/store with an unsigned 64-bit offset.
//
// Emits `ldr_str_ins op, [base_reg + offset]`, choosing the addressing form
// from the magnitude of `offset` (evaluated in C at code-generation time):
//   - offset >  LDR_STR_PIMM64: the offset is materialized in `tmp_reg` with
//     LOAD_64BIT_VAL and the register-offset form is used; `tmp_reg` is
//     clobbered in this case.
//   - offset <= LDR_STR_PIMM64: the offset is encoded as an immediate.
//
// Parameters:
//   ldr_str_ins - the load/store mnemonic to emit (e.g. ldr)
//   op          - register that is loaded into / stored from
//   base_reg    - register holding the base address
//   offset      - unsigned offset, a C expression
//   tmp_reg     - scratch register, used only for large offsets
//
// NOTE(review): LDR_STR_PIMM64 is defined outside this view; presumably it is
// the largest unsigned immediate offset encodable by a 64-bit ldr/str - confirm.
// NOTE(review): the immediate form presumably also requires `offset` to be
// suitably aligned for the access size; this macro does not check it - confirm.
|.macro SAFE_MEM_ACC_WITH_64_UOFFSET, ldr_str_ins, op, base_reg, offset, tmp_reg
|| if (((uintptr_t)(offset)) > LDR_STR_PIMM64) {
| LOAD_64BIT_VAL tmp_reg, offset // offset does not fit as an immediate
| ldr_str_ins op, [base_reg, tmp_reg]
|| } else {
| ldr_str_ins op, [base_reg, #(offset)]
|| }
|.endmacro
// Loads the current thread's value of the thread-local _tsrm_ls_cache into
// `reg`. TMP3 is always clobbered; on macOS TMP1 may be clobbered as well
// (it is the scratch register handed to SAFE_MEM_ACC_WITH_64_UOFFSET).
//
// macOS: tsrm_ls_cache_tcb_offset is interpreted as a pointer to the
// TLVDescriptor of _tsrm_ls_cache, and the fast path of tlv_get_addr() is
// open-coded from its `key` and `offset` fields, which are read at
// code-generation time. Only the fast path is emitted, so this relies on the
// variable having already been accessed on the thread before JIT code runs.
//
// Other platforms: tsrm_ls_cache_tcb_offset is used directly as an immediate
// offset from the thread pointer read from tpidr_el0.
//
// NOTE(review): both `mrs` instructions are emitted as raw encodings whose low
// five bits are 0x11 (x17), so TMP3 is assumed to be x17 - confirm against the
// TMP3 definition.
|.macro LOAD_TSRM_CACHE, reg
||#ifdef __APPLE__
| .long 0xd53bd071 // TODO: hard-coded: mrs TMP3, tpidrro_el0
| and TMP3, TMP3, #0xfffffffffffffff8 // clear the low 3 bits; presumably they do not belong to the thread base address - confirm
| SAFE_MEM_ACC_WITH_64_UOFFSET ldr, TMP3, TMP3, (((TLVDescriptor*)tsrm_ls_cache_tcb_offset)->key << 3), TMP1 // TMP3 = 8-byte slot number `key` relative to the masked base
| SAFE_MEM_ACC_WITH_64_UOFFSET ldr, reg, TMP3, (((TLVDescriptor*)tsrm_ls_cache_tcb_offset)->offset), TMP1 // reg = 64-bit value at TMP3 + offset
||#else
| .long 0xd53bd051 // TODO: hard-coded: mrs TMP3, tpidr_el0
|| ZEND_ASSERT(tsrm_ls_cache_tcb_offset <= LDR_STR_PIMM64);
| ldr reg, [TMP3, #tsrm_ls_cache_tcb_offset]
||#endif
|.endmacro
|.macro LOAD_ADDR_ZTS, reg, struct, field