1 // Copyright 2018 Sen Han <00hnes@gmail.com>
2 // Modifications copyright 2025 Bruce Hill <bruce@bruce-hill.com>
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
// Shorthand for the GCC/Clang `visibility("default")` attribute, which keeps a
// symbol exported even when the default symbol visibility is hidden.
23 #define public __attribute__((visibility("default")))
// Asserts that (a) + (b) is not smaller than (a), i.e. that the addition did not
// wrap around. This is only meaningful for unsigned operands such as size_t.
// `a` is expanded twice, so pass expressions without side effects.
26 #define aco_size_t_safe_add_assert(a, b) aco_assert((a) + (b) >= (a))
// Forward declaration; the definition appears further down in this file.
28 static void aco_default_protector_last_word(void *);
// Allocation and deallocation hooks used throughout this file. They default to
// malloc()/free() and can be replaced with aco_set_allocator().
30 void *(*aco_alloc_fn)(size_t) = malloc;
31 void (*aco_dealloc_fn)(void *) = free;
// aco_alloc(size): requests `size` bytes through the `aco_alloc_fn` hook. When the
// hook returns NULL, a diagnostic naming the call site (file, line, function) is
// written to stderr.
// NOTE(review): the remainder of the macro is not visible in this excerpt; the
// message text suggests the process is aborted afterwards — confirm.
33 #define aco_alloc(size) \
35 void *_ptr = aco_alloc_fn(size); \
36 if (aco_unlikely((_ptr) == NULL)) { \
37 fprintf(stderr, "Aborting: failed to allocate memory: %s:%d:%s\n", __FILE__, __LINE__, \
38 __PRETTY_FUNCTION__); \
44 // aco's Global Thread Local Storage variable `co`
46 __thread aco_t *aco_gtls_co;
47 static __thread aco_cofuncp_t aco_gtls_last_word_fp = aco_default_protector_last_word;
50 static __thread void *aco_gtls_fpucw_mxcsr[2];
52 static __thread void *aco_gtls_fpucw_mxcsr[1];
54 #error "platform not supporteded yet"
58 void aco_runtime_test(void) {
60 _Static_assert(sizeof(void *) == 4, "require 'sizeof(void*) == 4'");
62 _Static_assert(sizeof(void *) == 8, "require 'sizeof(void*) == 8'");
63 _Static_assert(sizeof(__uint128_t) == 16, "require 'sizeof(__uint128_t) == 16'");
65 #error "platform not supporteded yet"
67 _Static_assert(sizeof(int) >= 4, "require 'sizeof(int) >= 4'");
68 aco_assert(sizeof(int) >= 4);
69 _Static_assert(sizeof(int) <= sizeof(size_t), "require 'sizeof(int) <= sizeof(size_t)'");
70 aco_assert(sizeof(int) <= sizeof(size_t));
// Copies `sz` bytes from `src` to `dst` as a series of 16-byte loads/stores
// followed by one trailing 8-byte copy.
// The block copies below are written for inputs where both pointers are 16-byte
// aligned, `sz` is 8 modulo 16, and `sz >> 4` is at most 8.
// NOTE(review): the lines that choose between the groups below, and the handling
// of inputs that fail the check, are not visible in this excerpt. Presumably the
// groups are selected by `sz >> 4` and other inputs take a fallback copy — confirm.
74 static inline void aco_fast_memcpy(void *dst, const void *src, size_t sz) {
// true when the input does NOT meet the alignment/size requirements listed above
75 if (((uintptr_t)src & 0x0f) != 0 || ((uintptr_t)dst & 0x0f) != 0 || (sz & 0x0f) != 0x08 || (sz >> 4) > 8) {
// temporaries for up to eight 16-byte chunks; all loads of a group are issued
// before its stores
80 __uint128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
// one 16-byte chunk
84 xmm0 = *((__uint128_t *)src + 0);
85 *((__uint128_t *)dst + 0) = xmm0;
// two 16-byte chunks
88 xmm0 = *((__uint128_t *)src + 0);
89 xmm1 = *((__uint128_t *)src + 1);
90 *((__uint128_t *)dst + 0) = xmm0;
91 *((__uint128_t *)dst + 1) = xmm1;
// three 16-byte chunks
94 xmm0 = *((__uint128_t *)src + 0);
95 xmm1 = *((__uint128_t *)src + 1);
96 xmm2 = *((__uint128_t *)src + 2);
97 *((__uint128_t *)dst + 0) = xmm0;
98 *((__uint128_t *)dst + 1) = xmm1;
99 *((__uint128_t *)dst + 2) = xmm2;
// four 16-byte chunks
102 xmm0 = *((__uint128_t *)src + 0);
103 xmm1 = *((__uint128_t *)src + 1);
104 xmm2 = *((__uint128_t *)src + 2);
105 xmm3 = *((__uint128_t *)src + 3);
106 *((__uint128_t *)dst + 0) = xmm0;
107 *((__uint128_t *)dst + 1) = xmm1;
108 *((__uint128_t *)dst + 2) = xmm2;
109 *((__uint128_t *)dst + 3) = xmm3;
// five 16-byte chunks
112 xmm0 = *((__uint128_t *)src + 0);
113 xmm1 = *((__uint128_t *)src + 1);
114 xmm2 = *((__uint128_t *)src + 2);
115 xmm3 = *((__uint128_t *)src + 3);
116 xmm4 = *((__uint128_t *)src + 4);
117 *((__uint128_t *)dst + 0) = xmm0;
118 *((__uint128_t *)dst + 1) = xmm1;
119 *((__uint128_t *)dst + 2) = xmm2;
120 *((__uint128_t *)dst + 3) = xmm3;
121 *((__uint128_t *)dst + 4) = xmm4;
// six 16-byte chunks
124 xmm0 = *((__uint128_t *)src + 0);
125 xmm1 = *((__uint128_t *)src + 1);
126 xmm2 = *((__uint128_t *)src + 2);
127 xmm3 = *((__uint128_t *)src + 3);
128 xmm4 = *((__uint128_t *)src + 4);
129 xmm5 = *((__uint128_t *)src + 5);
130 *((__uint128_t *)dst + 0) = xmm0;
131 *((__uint128_t *)dst + 1) = xmm1;
132 *((__uint128_t *)dst + 2) = xmm2;
133 *((__uint128_t *)dst + 3) = xmm3;
134 *((__uint128_t *)dst + 4) = xmm4;
135 *((__uint128_t *)dst + 5) = xmm5;
// seven 16-byte chunks
138 xmm0 = *((__uint128_t *)src + 0);
139 xmm1 = *((__uint128_t *)src + 1);
140 xmm2 = *((__uint128_t *)src + 2);
141 xmm3 = *((__uint128_t *)src + 3);
142 xmm4 = *((__uint128_t *)src + 4);
143 xmm5 = *((__uint128_t *)src + 5);
144 xmm6 = *((__uint128_t *)src + 6);
145 *((__uint128_t *)dst + 0) = xmm0;
146 *((__uint128_t *)dst + 1) = xmm1;
147 *((__uint128_t *)dst + 2) = xmm2;
148 *((__uint128_t *)dst + 3) = xmm3;
149 *((__uint128_t *)dst + 4) = xmm4;
150 *((__uint128_t *)dst + 5) = xmm5;
151 *((__uint128_t *)dst + 6) = xmm6;
// eight 16-byte chunks
154 xmm0 = *((__uint128_t *)src + 0);
155 xmm1 = *((__uint128_t *)src + 1);
156 xmm2 = *((__uint128_t *)src + 2);
157 xmm3 = *((__uint128_t *)src + 3);
158 xmm4 = *((__uint128_t *)src + 4);
159 xmm5 = *((__uint128_t *)src + 5);
160 xmm6 = *((__uint128_t *)src + 6);
161 xmm7 = *((__uint128_t *)src + 7);
162 *((__uint128_t *)dst + 0) = xmm0;
163 *((__uint128_t *)dst + 1) = xmm1;
164 *((__uint128_t *)dst + 2) = xmm2;
165 *((__uint128_t *)dst + 3) = xmm3;
166 *((__uint128_t *)dst + 4) = xmm4;
167 *((__uint128_t *)dst + 5) = xmm5;
168 *((__uint128_t *)dst + 6) = xmm6;
169 *((__uint128_t *)dst + 7) = xmm7;
// copy the last 8 bytes, located at offset `sz - 8` in both buffers
172 *((uint64_t *)((uintptr_t)dst + sz - 8)) = *((uint64_t *)((uintptr_t)src + sz - 8));
// Default "last word" handler: reports on stderr that a coroutine ended with a
// plain `return` instead of calling `aco_exit()`.
// The parameter is not referenced in the visible lines.
176 void aco_default_protector_last_word(void *_) {
// fetch the coroutine to report on via aco_get_co()
177 aco_t *co = aco_get_co();
178 // do some log about the offending `co`
179 fprintf(stderr, "error: aco_default_protector_last_word triggered\n");
// NOTE(review): the call that these two string fragments belong to, and its
// arguments, are not visible in this excerpt.
181 "error: co:%p should call `aco_exit()` instead of direct "
182 "`return` in co_fp:%p to finish its execution\n",
// Replaces the allocation and deallocation hooks used by this file.
//   alloc   - stored in aco_alloc_fn; the aco_alloc macro calls it without a NULL
//             check, so it must be a valid function
//   dealloc - stored in aco_dealloc_fn; every visible call site checks it against
//             NULL first, so passing NULL makes those call sites skip freeing
// Neither argument is validated here.
188 void aco_set_allocator(void *(*alloc)(size_t), void (*dealloc)(void *)) {
189 aco_alloc_fn = alloc;
190 aco_dealloc_fn = dealloc;
// Per-thread initialisation.
//   last_word_co_fp - optional replacement for this thread's "last word"
//                     callback; NULL keeps the current one
194 void aco_thread_init(aco_cofuncp_t last_word_co_fp) {
// fill the thread-local buffer via aco_save_fpucw_mxcsr() (presumably a snapshot
// of the FPU control word and MXCSR; the helper is defined elsewhere)
195 aco_save_fpucw_mxcsr(aco_gtls_fpucw_mxcsr);
// only override the callback when the caller supplied one
197 if ((void *)last_word_co_fp != NULL) aco_gtls_last_word_fp = last_word_co_fp;
200 // This function `aco_funcp_protector` should never be
201 // called. If it is called, the offending `co` finished
202 // its execution with a plain `return` instead of calling
203 // `aco_exit()`.
205 void aco_funcp_protector(void) {
// run the per-thread handler when one is installed
206 if ((void *)(aco_gtls_last_word_fp) != NULL) {
207 aco_gtls_last_word_fp(NULL);
// NOTE(review): the branch structure around the next call is not visible here;
// presumably it is the fallback used when no per-thread handler is set — confirm.
209 aco_default_protector_last_word(NULL);
// Convenience wrapper: forwards `sz` to aco_shared_stack_new2() with
// `guard_page_enabled` set to 1 (true) and returns its result.
215 aco_shared_stack_t *aco_shared_stack_new(size_t sz) { return aco_shared_stack_new2(sz, 1); }
218 aco_shared_stack_t *aco_shared_stack_new2(size_t sz, bool guard_page_enabled) {
220 sz = 1024 * 1024 * 2;
228 if (guard_page_enabled) {
229 // although gcc's Built-in Functions to Perform Arithmetic with
230 // Overflow Checking is better, but it would require gcc >= 5.0
231 long pgsz = sysconf(_SC_PAGESIZE);
232 // pgsz must be > 0 && a power of two
233 aco_assert(pgsz > 0 && (((pgsz - 1) & pgsz) == 0));
234 u_pgsz = (size_t)((unsigned long)pgsz);
235 // it should be always true in real life
236 aco_assert(u_pgsz == (unsigned long)pgsz && ((u_pgsz << 1) >> 1) == u_pgsz);
241 if ((sz & (u_pgsz - 1)) != 0) {
242 new_sz = (sz & (~(u_pgsz - 1)));
243 aco_assert(new_sz >= u_pgsz);
244 aco_size_t_safe_add_assert(new_sz, (u_pgsz << 1));
245 new_sz = new_sz + (u_pgsz << 1);
246 aco_assert(sz / u_pgsz + 2 == new_sz / u_pgsz);
248 aco_size_t_safe_add_assert(sz, u_pgsz);
249 new_sz = sz + u_pgsz;
250 aco_assert(sz / u_pgsz + 1 == new_sz / u_pgsz);
253 aco_assert((sz / u_pgsz > 1) && ((sz & (u_pgsz - 1)) == 0));
257 aco_shared_stack_t *p = aco_alloc(sizeof(aco_shared_stack_t));
258 memset(p, 0, sizeof(aco_shared_stack_t));
260 if (guard_page_enabled) {
261 p->real_ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
262 if (aco_unlikely(p->real_ptr == MAP_FAILED)) {
263 fprintf(stderr, "Aborting: failed to allocate memory: %s:%d:%s\n", __FILE__, __LINE__, __PRETTY_FUNCTION__);
266 p->guard_page_enabled = true;
267 aco_assert(0 == mprotect(p->real_ptr, u_pgsz, PROT_READ));
269 p->ptr = (void *)(((uintptr_t)p->real_ptr) + u_pgsz);
271 aco_assert(sz >= (u_pgsz << 1));
274 // p->guard_page_enabled = 0;
276 p->ptr = aco_alloc(sz);
280 #ifdef ACO_USE_VALGRIND
281 p->valgrind_stk_id = VALGRIND_STACK_REGISTER(p->ptr, (void *)((uintptr_t)p->ptr + p->sz));
283 #if defined(__i386__) || defined(__x86_64__)
284 uintptr_t u_p = (uintptr_t)(p->sz - (sizeof(void *) << 1) + (uintptr_t)p->ptr);
285 u_p = (u_p >> 4) << 4;
286 p->align_highptr = (void *)u_p;
287 p->align_retptr = (void *)(u_p - sizeof(void *));
288 *((void **)(p->align_retptr)) = (void *)(aco_funcp_protector_asm);
289 aco_assert(p->sz > (16 + (sizeof(void *) << 1) + sizeof(void *)));
290 p->align_limit = p->sz - 16 - (sizeof(void *) << 1);
292 #error "platform not supporteded yet"
// Releases a shared stack and its descriptor.
//   sstk - must be non-NULL and must have a non-NULL `ptr`
298 void aco_shared_stack_destroy(aco_shared_stack_t *sstk) {
299 aco_assert(sstk != NULL && sstk->ptr != NULL);
300 #ifdef ACO_USE_VALGRIND
// undo the Valgrind stack registration
301 VALGRIND_STACK_DEREGISTER(sstk->valgrind_stk_id);
// guard-page stacks are backed by a mapping: unmap `real_sz` bytes at `real_ptr`
303 if (sstk->guard_page_enabled) {
304 aco_assert(0 == munmap(sstk->real_ptr, sstk->real_sz));
305 sstk->real_ptr = NULL;
// NOTE(review): presumably the non-guard-page branch (the `else` is not visible):
// hand the stack memory to the deallocation hook, if one is set
308 if (aco_dealloc_fn != NULL) aco_dealloc_fn(sstk->ptr);
// release the descriptor itself through the deallocation hook, if one is set
311 if (aco_dealloc_fn != NULL) aco_dealloc_fn(sstk);
315 aco_t *aco_create(aco_t *main_co, aco_shared_stack_t *shared_stack, size_t saved_stack_sz, aco_cofuncp_t fp,
317 aco_t *p = aco_alloc(sizeof(aco_t));
318 memset(p, 0, sizeof(aco_t));
320 if (main_co != NULL) { // non-main co
321 aco_assertptr(shared_stack);
322 p->shared_stack = shared_stack;
324 // POSIX.1-2008 (IEEE Std 1003.1-2008) - General Information - Data Types - Pointer Types
325 // http://pubs.opengroup.org/onlinepubs/9699919799.2008edition/functions/V2_chap02.html#tag_15_12_03
326 p->reg[ACO_REG_IDX_RETADDR] = (void *)fp;
328 p->reg[ACO_REG_IDX_SP] = p->shared_stack->align_retptr;
329 #ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
330 p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
331 p->reg[ACO_REG_IDX_FPU + 1] = aco_gtls_fpucw_mxcsr[1];
334 p->reg[ACO_REG_IDX_RETADDR] = (void *)fp;
335 p->reg[ACO_REG_IDX_SP] = p->shared_stack->align_retptr;
336 #ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
337 p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
340 #error "platform not supporteded yet"
342 p->main_co = main_co;
345 if (saved_stack_sz == 0) {
348 p->saved_stack.ptr = aco_alloc(saved_stack_sz);
349 p->saved_stack.sz = saved_stack_sz;
350 #if defined(__i386__) || defined(__x86_64__)
351 p->saved_stack.valid_sz = 0;
353 #error "platform not supporteded yet"
360 p->shared_stack = NULL;
361 p->saved_stack.ptr = NULL;
368 aco_attr_no_asan void aco_resume(aco_t *resume_co) {
369 aco_assert(resume_co != NULL && resume_co->main_co != NULL && !resume_co->is_finished);
370 if (resume_co->shared_stack->owner != resume_co) {
371 if (resume_co->shared_stack->owner != NULL) {
372 aco_t *owner_co = resume_co->shared_stack->owner;
373 aco_assert(owner_co->shared_stack == resume_co->shared_stack);
374 #if defined(__i386__) || defined(__x86_64__)
375 aco_assert(((uintptr_t)(owner_co->shared_stack->align_retptr) >= (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP]))
376 && ((uintptr_t)(owner_co->shared_stack->align_highptr)
377 - (uintptr_t)(owner_co->shared_stack->align_limit)
378 <= (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP])));
379 owner_co->saved_stack.valid_sz =
380 (uintptr_t)(owner_co->shared_stack->align_retptr) - (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP]);
381 if (owner_co->saved_stack.sz < owner_co->saved_stack.valid_sz) {
382 if (aco_dealloc_fn != NULL) aco_dealloc_fn(owner_co->saved_stack.ptr);
383 owner_co->saved_stack.ptr = NULL;
385 owner_co->saved_stack.sz = owner_co->saved_stack.sz << 1;
386 aco_assert(owner_co->saved_stack.sz > 0);
387 if (owner_co->saved_stack.sz >= owner_co->saved_stack.valid_sz) {
391 owner_co->saved_stack.ptr = aco_alloc(owner_co->saved_stack.sz);
393 // TODO: optimize the performance penalty of memcpy function call
394 // for very short memory span
395 if (owner_co->saved_stack.valid_sz > 0) {
397 aco_fast_memcpy(owner_co->saved_stack.ptr, owner_co->reg[ACO_REG_IDX_SP],
398 owner_co->saved_stack.valid_sz);
400 memcpy(owner_co->saved_stack.ptr, owner_co->reg[ACO_REG_IDX_SP], owner_co->saved_stack.valid_sz);
402 owner_co->saved_stack.ct_save++;
404 if (owner_co->saved_stack.valid_sz > owner_co->saved_stack.max_cpsz) {
405 owner_co->saved_stack.max_cpsz = owner_co->saved_stack.valid_sz;
407 owner_co->shared_stack->owner = NULL;
408 owner_co->shared_stack->align_validsz = 0;
410 #error "platform not supporteded yet"
413 aco_assert(resume_co->shared_stack->owner == NULL);
414 #if defined(__i386__) || defined(__x86_64__)
415 aco_assert(resume_co->saved_stack.valid_sz <= resume_co->shared_stack->align_limit - sizeof(void *));
416 // TODO: optimize the performance penalty of memcpy function call
417 // for very short memory span
418 if (resume_co->saved_stack.valid_sz > 0) {
419 void *dst = (void *)((uintptr_t)(resume_co->shared_stack->align_retptr) - resume_co->saved_stack.valid_sz);
421 aco_fast_memcpy(dst, resume_co->saved_stack.ptr, resume_co->saved_stack.valid_sz);
423 memcpy(dst, resume_co->saved_stack.ptr, resume_co->saved_stack.valid_sz);
425 resume_co->saved_stack.ct_restore++;
427 if (resume_co->saved_stack.valid_sz > resume_co->saved_stack.max_cpsz) {
428 resume_co->saved_stack.max_cpsz = resume_co->saved_stack.valid_sz;
430 resume_co->shared_stack->align_validsz = resume_co->saved_stack.valid_sz + sizeof(void *);
431 resume_co->shared_stack->owner = resume_co;
433 #error "platform not supporteded yet"
436 aco_gtls_co = resume_co;
437 aco_yield_asm(resume_co->main_co, resume_co);
438 aco_gtls_co = resume_co->main_co;
// Releases a coroutine object created by aco_create().
// All memory is released through the deallocation hook, and only when that hook
// is non-NULL.
442 void aco_destroy(aco_t *co) {
// a main coroutine only has its own object to free
444 if (aco_is_main_co(co)) {
445 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co);
// NOTE(review): presumably the non-main branch (the `else` is not visible).
// If this coroutine still owns the shared stack, give up ownership first.
447 if (co->shared_stack->owner == co) {
448 co->shared_stack->owner = NULL;
449 co->shared_stack->align_validsz = 0;
// free the private save buffer, then the coroutine object itself
451 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co->saved_stack.ptr);
452 co->saved_stack.ptr = NULL;
453 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co);
// Function form of aco_exit() taking a single, ignored `void *` argument, so that
// it can be used where a callback with that parameter list is expected.
458 void aco_exit_fn(void *_) { aco_exit(); }