code / tomo-coroutines

Lines715 C545 Assembly92 Tomo47 Markdown31
(458 lines)
// Copyright 2018 Sen Han <00hnes@gmail.com>
// Modifications copyright 2025 Bruce Hill <bruce@bruce-hill.com>
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define _GNU_SOURCE

#include "aco.h"

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
// Marker placed in front of the library's exported functions and variables.
// It expands to default symbol visibility unless `public` is already defined.
#ifndef public
#define public __attribute__((visibility("default")))
#endif
// Assert that the unsigned sum `a + b` does not wrap around.
// `a` is evaluated twice, so only pass side-effect-free expressions.
#define aco_size_t_safe_add_assert(a, b) aco_assert((a) + (b) >= (a))
// Built-in "last word" handler; defined further down in this file.
static void aco_default_protector_last_word(void *arg);

// Allocation hooks used throughout this file. They default to malloc/free
// and are replaced by aco_set_allocator().
void *(*aco_alloc_fn)(size_t) = malloc;
void (*aco_dealloc_fn)(void *) = free;
// Allocate `size` bytes through the `aco_alloc_fn` hook.
// If the hook returns NULL, print the call site to stderr and abort, so the
// expression never yields NULL. This is a GNU statement expression; the
// temporary has a macro-specific name so it cannot shadow an identifier used
// in the `size` argument.
#define aco_alloc(size) \
    ({ \
        void *aco_alloc_result_ = aco_alloc_fn(size); \
        if (aco_unlikely(aco_alloc_result_ == NULL)) { \
            fprintf(stderr, "Aborting: failed to allocate memory: %s:%d:%s\n", __FILE__, __LINE__, \
                    __PRETTY_FUNCTION__); \
            abort(); \
        } \
        aco_alloc_result_; \
    })
44 // aco's Global Thread Local Storage variable `co`
45 public
46 __thread aco_t *aco_gtls_co;
47 static __thread aco_cofuncp_t aco_gtls_last_word_fp = aco_default_protector_last_word;
49 #ifdef __i386__
50 static __thread void *aco_gtls_fpucw_mxcsr[2];
51 #elif __x86_64__
52 static __thread void *aco_gtls_fpucw_mxcsr[1];
53 #else
54 #error "platform not supporteded yet"
55 #endif
57 public
58 void aco_runtime_test(void) {
59 #ifdef __i386__
60 _Static_assert(sizeof(void *) == 4, "require 'sizeof(void*) == 4'");
61 #elif __x86_64__
62 _Static_assert(sizeof(void *) == 8, "require 'sizeof(void*) == 8'");
63 _Static_assert(sizeof(__uint128_t) == 16, "require 'sizeof(__uint128_t) == 16'");
64 #else
65 #error "platform not supporteded yet"
66 #endif
67 _Static_assert(sizeof(int) >= 4, "require 'sizeof(int) >= 4'");
68 aco_assert(sizeof(int) >= 4);
69 _Static_assert(sizeof(int) <= sizeof(size_t), "require 'sizeof(int) <= sizeof(size_t)'");
70 aco_assert(sizeof(int) <= sizeof(size_t));
#ifdef __x86_64__
// Helpers for aco_fast_memcpy(): move one 16-byte block through a named
// temporary. Each case below issues all of its loads before its first store.
#define ACO_LOAD16(i) const __uint128_t blk##i = from[i]
#define ACO_STORE16(i) to[i] = blk##i

// Copy `sz` bytes from `src` to `dst`.
//
// Fast path: taken when both pointers are 16-byte aligned and `sz` equals
// 16*k + 8 with 0 <= k <= 8. The k 16-byte blocks are all loaded, then all
// stored, and the trailing 8 bytes are copied as a single uint64_t.
// Every other combination of alignment and size falls back to memcpy().
static inline void aco_fast_memcpy(void *dst, const void *src, size_t sz) {
    const size_t blocks = sz >> 4;
    const uintptr_t misalign = ((uintptr_t)src | (uintptr_t)dst) & 0x0f;
    if (misalign != 0 || (sz & 0x0f) != 0x08 || blocks > 8) {
        memcpy(dst, src, sz);
        return;
    }

    const __uint128_t *from = (const __uint128_t *)src;
    __uint128_t *to = (__uint128_t *)dst;
    switch (blocks) {
    case 1: {
        ACO_LOAD16(0);
        ACO_STORE16(0);
        break;
    }
    case 2: {
        ACO_LOAD16(0); ACO_LOAD16(1);
        ACO_STORE16(0); ACO_STORE16(1);
        break;
    }
    case 3: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2);
        break;
    }
    case 4: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2); ACO_LOAD16(3);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2); ACO_STORE16(3);
        break;
    }
    case 5: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2); ACO_LOAD16(3); ACO_LOAD16(4);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2); ACO_STORE16(3); ACO_STORE16(4);
        break;
    }
    case 6: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2); ACO_LOAD16(3); ACO_LOAD16(4); ACO_LOAD16(5);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2); ACO_STORE16(3); ACO_STORE16(4); ACO_STORE16(5);
        break;
    }
    case 7: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2); ACO_LOAD16(3); ACO_LOAD16(4); ACO_LOAD16(5);
        ACO_LOAD16(6);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2); ACO_STORE16(3); ACO_STORE16(4); ACO_STORE16(5);
        ACO_STORE16(6);
        break;
    }
    case 8: {
        ACO_LOAD16(0); ACO_LOAD16(1); ACO_LOAD16(2); ACO_LOAD16(3); ACO_LOAD16(4); ACO_LOAD16(5);
        ACO_LOAD16(6); ACO_LOAD16(7);
        ACO_STORE16(0); ACO_STORE16(1); ACO_STORE16(2); ACO_STORE16(3); ACO_STORE16(4); ACO_STORE16(5);
        ACO_STORE16(6); ACO_STORE16(7);
        break;
    }
    default:
        // blocks == 0 (larger values were rejected above): only the tail remains
        break;
    }

    // The fast path guarantees sz % 16 == 8, so exactly 8 bytes remain.
    *(uint64_t *)((uintptr_t)dst + sz - 8) = *(const uint64_t *)((uintptr_t)src + sz - 8);
}

#undef ACO_LOAD16
#undef ACO_STORE16
#endif
176 void aco_default_protector_last_word(void *_) {
177 aco_t *co = aco_get_co();
178 // do some log about the offending `co`
179 fprintf(stderr, "error: aco_default_protector_last_word triggered\n");
180 fprintf(stderr,
181 "error: co:%p should call `aco_exit()` instead of direct "
182 "`return` in co_fp:%p to finish its execution\n",
183 co, (void *)co->fp);
184 aco_assert(0);
187 public
188 void aco_set_allocator(void *(*alloc)(size_t), void (*dealloc)(void *)) {
189 aco_alloc_fn = alloc;
190 aco_dealloc_fn = dealloc;
193 public
194 void aco_thread_init(aco_cofuncp_t last_word_co_fp) {
195 aco_save_fpucw_mxcsr(aco_gtls_fpucw_mxcsr);
197 if ((void *)last_word_co_fp != NULL) aco_gtls_last_word_fp = last_word_co_fp;
200 // This function `aco_funcp_protector` should never be
201 // called. If it's been called, that means the offending
202 // `co` didn't call aco_exit(co) instead of `return` to
203 // finish its execution.
204 public
205 void aco_funcp_protector(void) {
206 if ((void *)(aco_gtls_last_word_fp) != NULL) {
207 aco_gtls_last_word_fp(NULL);
208 } else {
209 aco_default_protector_last_word(NULL);
211 aco_assert(0);
214 public
215 aco_shared_stack_t *aco_shared_stack_new(size_t sz) { return aco_shared_stack_new2(sz, 1); }
217 public
218 aco_shared_stack_t *aco_shared_stack_new2(size_t sz, bool guard_page_enabled) {
219 if (sz == 0) {
220 sz = 1024 * 1024 * 2;
222 if (sz < 4096) {
223 sz = 4096;
225 aco_assert(sz > 0);
227 size_t u_pgsz = 0;
228 if (guard_page_enabled) {
229 // although gcc's Built-in Functions to Perform Arithmetic with
230 // Overflow Checking is better, but it would require gcc >= 5.0
231 long pgsz = sysconf(_SC_PAGESIZE);
232 // pgsz must be > 0 && a power of two
233 aco_assert(pgsz > 0 && (((pgsz - 1) & pgsz) == 0));
234 u_pgsz = (size_t)((unsigned long)pgsz);
235 // it should be always true in real life
236 aco_assert(u_pgsz == (unsigned long)pgsz && ((u_pgsz << 1) >> 1) == u_pgsz);
237 if (sz <= u_pgsz) {
238 sz = u_pgsz << 1;
239 } else {
240 size_t new_sz;
241 if ((sz & (u_pgsz - 1)) != 0) {
242 new_sz = (sz & (~(u_pgsz - 1)));
243 aco_assert(new_sz >= u_pgsz);
244 aco_size_t_safe_add_assert(new_sz, (u_pgsz << 1));
245 new_sz = new_sz + (u_pgsz << 1);
246 aco_assert(sz / u_pgsz + 2 == new_sz / u_pgsz);
247 } else {
248 aco_size_t_safe_add_assert(sz, u_pgsz);
249 new_sz = sz + u_pgsz;
250 aco_assert(sz / u_pgsz + 1 == new_sz / u_pgsz);
252 sz = new_sz;
253 aco_assert((sz / u_pgsz > 1) && ((sz & (u_pgsz - 1)) == 0));
257 aco_shared_stack_t *p = aco_alloc(sizeof(aco_shared_stack_t));
258 memset(p, 0, sizeof(aco_shared_stack_t));
260 if (guard_page_enabled) {
261 p->real_ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
262 if (aco_unlikely(p->real_ptr == MAP_FAILED)) {
263 fprintf(stderr, "Aborting: failed to allocate memory: %s:%d:%s\n", __FILE__, __LINE__, __PRETTY_FUNCTION__);
264 abort();
266 p->guard_page_enabled = true;
267 aco_assert(0 == mprotect(p->real_ptr, u_pgsz, PROT_READ));
269 p->ptr = (void *)(((uintptr_t)p->real_ptr) + u_pgsz);
270 p->real_sz = sz;
271 aco_assert(sz >= (u_pgsz << 1));
272 p->sz = sz - u_pgsz;
273 } else {
274 // p->guard_page_enabled = 0;
275 p->sz = sz;
276 p->ptr = aco_alloc(sz);
279 p->owner = NULL;
280 #ifdef ACO_USE_VALGRIND
281 p->valgrind_stk_id = VALGRIND_STACK_REGISTER(p->ptr, (void *)((uintptr_t)p->ptr + p->sz));
282 #endif
283 #if defined(__i386__) || defined(__x86_64__)
284 uintptr_t u_p = (uintptr_t)(p->sz - (sizeof(void *) << 1) + (uintptr_t)p->ptr);
285 u_p = (u_p >> 4) << 4;
286 p->align_highptr = (void *)u_p;
287 p->align_retptr = (void *)(u_p - sizeof(void *));
288 *((void **)(p->align_retptr)) = (void *)(aco_funcp_protector_asm);
289 aco_assert(p->sz > (16 + (sizeof(void *) << 1) + sizeof(void *)));
290 p->align_limit = p->sz - 16 - (sizeof(void *) << 1);
291 #else
292 #error "platform not supporteded yet"
293 #endif
294 return p;
297 public
298 void aco_shared_stack_destroy(aco_shared_stack_t *sstk) {
299 aco_assert(sstk != NULL && sstk->ptr != NULL);
300 #ifdef ACO_USE_VALGRIND
301 VALGRIND_STACK_DEREGISTER(sstk->valgrind_stk_id);
302 #endif
303 if (sstk->guard_page_enabled) {
304 aco_assert(0 == munmap(sstk->real_ptr, sstk->real_sz));
305 sstk->real_ptr = NULL;
306 sstk->ptr = NULL;
307 } else {
308 if (aco_dealloc_fn != NULL) aco_dealloc_fn(sstk->ptr);
309 sstk->ptr = NULL;
311 if (aco_dealloc_fn != NULL) aco_dealloc_fn(sstk);
314 public
315 aco_t *aco_create(aco_t *main_co, aco_shared_stack_t *shared_stack, size_t saved_stack_sz, aco_cofuncp_t fp,
316 void *arg) {
317 aco_t *p = aco_alloc(sizeof(aco_t));
318 memset(p, 0, sizeof(aco_t));
320 if (main_co != NULL) { // non-main co
321 aco_assertptr(shared_stack);
322 p->shared_stack = shared_stack;
323 #ifdef __i386__
324 // POSIX.1-2008 (IEEE Std 1003.1-2008) - General Information - Data Types - Pointer Types
325 // http://pubs.opengroup.org/onlinepubs/9699919799.2008edition/functions/V2_chap02.html#tag_15_12_03
326 p->reg[ACO_REG_IDX_RETADDR] = (void *)fp;
327 // push retaddr
328 p->reg[ACO_REG_IDX_SP] = p->shared_stack->align_retptr;
329 #ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
330 p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
331 p->reg[ACO_REG_IDX_FPU + 1] = aco_gtls_fpucw_mxcsr[1];
332 #endif
333 #elif __x86_64__
334 p->reg[ACO_REG_IDX_RETADDR] = (void *)fp;
335 p->reg[ACO_REG_IDX_SP] = p->shared_stack->align_retptr;
336 #ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
337 p->reg[ACO_REG_IDX_FPU] = aco_gtls_fpucw_mxcsr[0];
338 #endif
339 #else
340 #error "platform not supporteded yet"
341 #endif
342 p->main_co = main_co;
343 p->arg = arg;
344 p->fp = fp;
345 if (saved_stack_sz == 0) {
346 saved_stack_sz = 64;
348 p->saved_stack.ptr = aco_alloc(saved_stack_sz);
349 p->saved_stack.sz = saved_stack_sz;
350 #if defined(__i386__) || defined(__x86_64__)
351 p->saved_stack.valid_sz = 0;
352 #else
353 #error "platform not supporteded yet"
354 #endif
355 return p;
356 } else { // main co
357 p->main_co = NULL;
358 p->arg = arg;
359 p->fp = fp;
360 p->shared_stack = NULL;
361 p->saved_stack.ptr = NULL;
362 return p;
364 aco_assert(0);
367 public
368 aco_attr_no_asan void aco_resume(aco_t *resume_co) {
369 aco_assert(resume_co != NULL && resume_co->main_co != NULL && !resume_co->is_finished);
370 if (resume_co->shared_stack->owner != resume_co) {
371 if (resume_co->shared_stack->owner != NULL) {
372 aco_t *owner_co = resume_co->shared_stack->owner;
373 aco_assert(owner_co->shared_stack == resume_co->shared_stack);
374 #if defined(__i386__) || defined(__x86_64__)
375 aco_assert(((uintptr_t)(owner_co->shared_stack->align_retptr) >= (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP]))
376 && ((uintptr_t)(owner_co->shared_stack->align_highptr)
377 - (uintptr_t)(owner_co->shared_stack->align_limit)
378 <= (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP])));
379 owner_co->saved_stack.valid_sz =
380 (uintptr_t)(owner_co->shared_stack->align_retptr) - (uintptr_t)(owner_co->reg[ACO_REG_IDX_SP]);
381 if (owner_co->saved_stack.sz < owner_co->saved_stack.valid_sz) {
382 if (aco_dealloc_fn != NULL) aco_dealloc_fn(owner_co->saved_stack.ptr);
383 owner_co->saved_stack.ptr = NULL;
384 while (1) {
385 owner_co->saved_stack.sz = owner_co->saved_stack.sz << 1;
386 aco_assert(owner_co->saved_stack.sz > 0);
387 if (owner_co->saved_stack.sz >= owner_co->saved_stack.valid_sz) {
388 break;
391 owner_co->saved_stack.ptr = aco_alloc(owner_co->saved_stack.sz);
393 // TODO: optimize the performance penalty of memcpy function call
394 // for very short memory span
395 if (owner_co->saved_stack.valid_sz > 0) {
396 #ifdef __x86_64__
397 aco_fast_memcpy(owner_co->saved_stack.ptr, owner_co->reg[ACO_REG_IDX_SP],
398 owner_co->saved_stack.valid_sz);
399 #else
400 memcpy(owner_co->saved_stack.ptr, owner_co->reg[ACO_REG_IDX_SP], owner_co->saved_stack.valid_sz);
401 #endif
402 owner_co->saved_stack.ct_save++;
404 if (owner_co->saved_stack.valid_sz > owner_co->saved_stack.max_cpsz) {
405 owner_co->saved_stack.max_cpsz = owner_co->saved_stack.valid_sz;
407 owner_co->shared_stack->owner = NULL;
408 owner_co->shared_stack->align_validsz = 0;
409 #else
410 #error "platform not supporteded yet"
411 #endif
413 aco_assert(resume_co->shared_stack->owner == NULL);
414 #if defined(__i386__) || defined(__x86_64__)
415 aco_assert(resume_co->saved_stack.valid_sz <= resume_co->shared_stack->align_limit - sizeof(void *));
416 // TODO: optimize the performance penalty of memcpy function call
417 // for very short memory span
418 if (resume_co->saved_stack.valid_sz > 0) {
419 void *dst = (void *)((uintptr_t)(resume_co->shared_stack->align_retptr) - resume_co->saved_stack.valid_sz);
420 #ifdef __x86_64__
421 aco_fast_memcpy(dst, resume_co->saved_stack.ptr, resume_co->saved_stack.valid_sz);
422 #else
423 memcpy(dst, resume_co->saved_stack.ptr, resume_co->saved_stack.valid_sz);
424 #endif
425 resume_co->saved_stack.ct_restore++;
427 if (resume_co->saved_stack.valid_sz > resume_co->saved_stack.max_cpsz) {
428 resume_co->saved_stack.max_cpsz = resume_co->saved_stack.valid_sz;
430 resume_co->shared_stack->align_validsz = resume_co->saved_stack.valid_sz + sizeof(void *);
431 resume_co->shared_stack->owner = resume_co;
432 #else
433 #error "platform not supporteded yet"
434 #endif
436 aco_gtls_co = resume_co;
437 aco_yield_asm(resume_co->main_co, resume_co);
438 aco_gtls_co = resume_co->main_co;
441 public
442 void aco_destroy(aco_t *co) {
443 aco_assertptr(co);
444 if (aco_is_main_co(co)) {
445 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co);
446 } else {
447 if (co->shared_stack->owner == co) {
448 co->shared_stack->owner = NULL;
449 co->shared_stack->align_validsz = 0;
451 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co->saved_stack.ptr);
452 co->saved_stack.ptr = NULL;
453 if (aco_dealloc_fn != NULL) aco_dealloc_fn(co);
457 public
458 void aco_exit_fn(void *_) { aco_exit(); }