From df765200c41692eed41c0f84a8b9a32e39f7fa34 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sat, 6 Jul 2024 17:23:32 -0400 Subject: [PATCH] Switch to parallel compilation --- compilation.md | 164 +++++++++++++++++++++++++++++++++++++++++++++++++ tomo.c | 68 +++++++++++--------- 2 files changed, 203 insertions(+), 29 deletions(-) create mode 100644 compilation.md diff --git a/compilation.md b/compilation.md new file mode 100644 index 0000000..2716111 --- /dev/null +++ b/compilation.md @@ -0,0 +1,164 @@ +# Compilation Pipeline + +For a simple single-file program, the compilation process has a dependency +graph that looks like this: +``` + +----------+ transpile +------------+ generate arg parser + | foo.tm.h | <----------- | foo.tm | ------------- + +----------+ +------------+ | + | | | transpile | + | | v | + | | +------------+ | + | | | foo.tm.c | +--------------------+ + | | +------------+ | main() entry point | + | | | +--------------------+ + | | compile v | + | | +------------+ | + | +----------------->| foo.tm.o | | + | +------------+ | + | | link | + | v | + | compile +------------+ compile | + +---------------------> | foo | <------------+ + +------------+ +``` + +For a more complicated example, imagine `foo.tm` imports `baz.tm` and both are +being compiled into a shared library, `libfoo.so`: + +``` + +---------------------------------------+ + | | + +----------+ transpile +------------+ | + +- | baz.tm.h | <----------- | baz.tm | -+-------------------------+ + | +----------+ +------------+ | | + | | | | | + | | | transpile | | + | | v | | + | | +------------+ | | + | | | baz.tm.c | | | + | | +------------+ | | + | | | | | + | | | compile | | + | | v | | + | | compile +------------+ | | + | +---------------------> | baz.tm.o | | | + | +------------+ | | + | | | | + | | link | compile | + | v v | + | compile +--------------------------------------+ | + | +---------------------> | libfoo.so | | + | | +--------------------------------------+ | + | | ^ | + | | | link | + | | | | + | +----------+ transpile +------------+ | | + | | foo.tm.h | <----------- | foo.tm | | | + | +----------+ +------------+ | | + | | | | | + | | | transpile | | + | | v | | + | | +------------+ | type info | + | | | foo.tm.c | <+-------------------------+ + | | +------------+ | + | | | | + | | | compile | + | | v | + | | compile +------------+ | + +----+---------------------> | foo.tm.o | -+ + | +------------+ + | compile ^ + +-------------------------+ +``` + +These dependency graphs are relatively complicated-looking, but here are some +rough takeaways: + + 1) Header files are a dependency for many parts of the process, so it's + good to transpile them as early as possible. + 2) Once all the header files are available, + compiled into their object files in parallel. This is by far the + slowest part of compilation (invoking the C compiler), so it benefits + the most from parallelization. + 3) After all object files are compiled, the last step is to link them + all together (fast and simple). + +To sastisfy these requirements as efficiently as possible, the approach taken +below is to first transpile all header files sequentially (this could be +parallelized, but is probably faster than the overhead of forking new +processes), then fork a new process for each dependency to transpile and +compile it to an object file. Then, wait for all child processes to finish and +link the resulting object files together. + +## Phase 1 (sequential transpilation): + +``` + +--------+ +--------+ + | foo.tm | | baz.tm | + +--------+ +--------+ + | | | + +--------------+ | + | | + v v + +----------+ +----------+ + | foo.tm.h | | baz.tm.h | + +----------+ +----------+ +``` + +## Phase 2 (parallel transpilation/compilation): + +``` + ################################ ################################ + # Process 1 # # Process 2 # + # +--------+ +----------+ # # +----------+ +--------+ # + # | foo.tm | | foo.tm.h | # # | baz.tm.h | | baz.tm | # + # +--------+ | baz.tm.h | # # +----------+ +--------+ # + # | +----+-----+ # # | | # + # v | # # | v # + # +----------+ | # # | +----------+ # + # | foo.tm.c | | # # | | baz.tm.c | # + # +----------+ | # # | +----------+ # + # | | # # | | # + # +------+------+ # # +--------+ # + # | # # | # + # v # # v # + # +----------+ # # +----------+ # + # | foo.tm.o | # # | baz.tm.o | # + # +----------+ # # +----------+ # + ################################ ################################ +``` + +## Phase 3 (linking a shared object file library): + +``` + +----------+ +----------+ + | foo.tm.o | | baz.tm.o | + +----------+ +----------+ + | | + +--------+--------+ + | + v + +-----------+ + | libfoo.so | + +-----------+ +``` + +## Phase 3 (linking an executable): + +``` + +----------+ +----------+ +--------+ +--------+ + | foo.tm.o | | baz.tm.o | | foo.tm | | baz.tm | + +----------+ +----------+ +--------+ +--------+ + | | | | + +--------+--------+ +----+-----+ + | link | Figure out command line args + v v + +-----+ compile +-------------------------+ + | foo |<-------------| main() function for exe | + +-----+ +----------+--------------+ + | foo.tm.h | + +----------+ + | baz.tm.h | + +----------+ +``` diff --git a/tomo.c b/tomo.c index a0052ba..f603057 100644 --- a/tomo.c +++ b/tomo.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "ast.h" #include "builtins/array.h" @@ -124,53 +125,61 @@ int main(int argc, char *argv[]) env_t *env = new_compilation_unit(&compilation_library_name); table_t dependency_files = {}; table_t to_link = {}; + table_t argument_files = {}; for (int i = after_flags; i < argc; i++) { const char *resolved = resolve_path(argv[i], ".", "."); if (!resolved) errx(1, "Couldn't resolve path: %s", argv[i]); + Table$str_set(&argument_files, resolved, argv[i]); build_file_dependency_graph(resolved, &dependency_files, &to_link); if (mode == MODE_RUN) break; } int status; - // Non-lazily (re)compile header files for each source file passed to the compiler: - for (int i = after_flags; i < argc; i++) { - const char *filename = argv[i]; - status = transpile_header(env, filename, true); - if (status != 0) return status; - } - - // Lazily (re)compile all the header files: + // (Re)compile header files, eagerly for explicitly passed in files, lazily + // for downstream dependencies: for (int64_t i = 0; i < dependency_files.entries.length; i++) { const char *filename = *(char**)(dependency_files.entries.data + i*dependency_files.entries.stride); - status = transpile_header(env, filename, false); + bool is_argument_file = (Table$str_get(argument_files, filename) != NULL); + status = transpile_header(env, filename, is_argument_file); if (status != 0) return status; } env->imports = new(table_t); - // Non-lazily (re)compile object files for each source file passed to the compiler: - for (int i = after_flags; i < argc; i++) { - const char *filename = argv[i]; - status = transpile_code(env, filename, true); - if (status != 0) return status; - status = compile_object_file(filename, true); - if (status != 0) return status; - if (mode == MODE_RUN) break; + struct child_s { + struct child_s *next; + pid_t pid; + } *child_processes = NULL; + + // (Re)transpile and compile object files, eagerly for files explicitly + // specified and lazily for downstream dependencies: + for (int64_t i = 0; i < dependency_files.entries.length; i++) { + const char *filename = *(char**)(dependency_files.entries.data + i*dependency_files.entries.stride); + bool is_argument_file = (Table$str_get(argument_files, filename) != NULL); + if (mode == MODE_COMPILE_OBJ && !is_argument_file) + continue; + + pid_t pid = fork(); + if (pid == 0) { + status = transpile_code(env, filename, is_argument_file); + if (status != 0) + _exit(status); + status = compile_object_file(filename, is_argument_file); + _exit(status); + } + child_processes = new(struct child_s, .next=child_processes, .pid=pid); + } + + for (; child_processes; child_processes = child_processes->next) { + waitpid(child_processes->pid, &status, 0); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + return EXIT_FAILURE; } if (mode == MODE_COMPILE_OBJ) return 0; - // Lazily (re)compile object files for each dependency: - for (int64_t i = 0; i < dependency_files.entries.length; i++) { - const char *filename = *(char**)(dependency_files.entries.data + i*dependency_files.entries.stride); - status = transpile_code(env, filename, false); - if (status != 0) return status; - status = compile_object_file(filename, false); - if (status != 0) return status; - } - CORD object_files = CORD_EMPTY; for (int64_t i = 0; i < dependency_files.entries.length; i++) { const char *filename = *(char**)(dependency_files.entries.data + i*dependency_files.entries.stride); @@ -439,9 +448,10 @@ int compile_object_file(const char *filename, bool force_recompile) int compile_executable(env_t *base_env, const char *filename, CORD object_files) { - const char *name = file_base_name(filename); - env_t *env = Table$str_get(*base_env->imports, name); - assert(env); + ast_t *ast = parse_file(filename, NULL); + if (!ast) + errx(1, "Could not parse file %s", filename); + env_t *env = load_module_env(base_env, ast); binding_t *main_binding = get_binding(env, "main"); if (!main_binding || main_binding->type->tag != FunctionType) { errx(1, "No main() function has been defined for %s, so it can't be run!", filename);