From 12d46be8c70205662d96fc26aee19fba06fd7ddf Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Thu, 9 Feb 2023 09:35:55 +1100 Subject: switch to multicore version --- pca.c | 33063 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- pca.h | 3 +- 2 files changed, 30581 insertions(+), 2485 deletions(-) diff --git a/pca.c b/pca.c index cd135a5..d292786 100644 --- a/pca.c +++ b/pca.c @@ -40,6 +40,7 @@ int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, struct futhark_context; struct futhark_context *futhark_context_new(struct futhark_context_config *cfg); void futhark_context_free(struct futhark_context *cfg); +void futhark_context_config_set_num_threads(struct futhark_context_config *cfg, int n); void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag); void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag); void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag); @@ -74,7 +75,7 @@ void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); void futhark_context_pause_profiling(struct futhark_context *ctx); void futhark_context_unpause_profiling(struct futhark_context *ctx); int futhark_context_clear_caches(struct futhark_context *ctx); -#define FUTHARK_BACKEND_c +#define FUTHARK_BACKEND_multicore #define FUTHARK_SUCCESS 0 #define FUTHARK_PROGRAM_ERROR 2 #define FUTHARK_OUT_OF_MEMORY 3 @@ -4886,388 +4887,28911 @@ struct memblock { struct constants { int dummy; }; +static int32_t dlossziwithacc_locks_mem_realtype_24530[100151]; +static int32_t dlossziwithacc_locks_mem_realtype_24545[100151]; +static int32_t dlossziwithacc_locks_mem_realtype_24546[100151]; +static int32_t dlossziwithacc_locks_mem_realtype_24613[100151]; +static int32_t dlossziwithacc_locks_mem_realtype_24630[100151]; +static int32_t pcaWithQuantileziwithacc_locks_mem_realtype_24828[100151]; +static int32_t pcaWithQuantileziwithacc_locks_mem_realtype_24843[100151]; +static int32_t pcaWithQuantileziwithacc_locks_mem_realtype_24844[100151]; +static int32_t pcaWithQuantileziwithacc_locks_mem_realtype_24911[100151]; +static int32_t pcaWithQuantileziwithacc_locks_mem_realtype_24928[100151]; static const int num_tuning_params = 0; static const char *tuning_param_names[1]; static const char *tuning_param_vars[1]; static const char *tuning_param_classes[1]; static int64_t *tuning_param_defaults[1]; -// Start of backends/c.h +// start of scheduler.h -struct futhark_context_config { - int in_use; - int debugging; - int profiling; - int logging; - const char *cache_fname; - int num_tuning_params; - int64_t *tuning_params; - const char** tuning_param_names; - const char** tuning_param_vars; - const char** tuning_param_classes; +// First, the API that the generated code will access. In principle, +// we could then compile the scheduler separately and link an object +// file with the generated code. In practice, we will embed all of +// this in the generated code. + +// Scheduler handle. +struct scheduler; + +// Initialise a scheduler (and start worker threads). +static int scheduler_init(struct scheduler *scheduler, + int num_workers, + double kappa); + +// Shut down a scheduler (and destroy worker threads). +static int scheduler_destroy(struct scheduler *scheduler); + +// Figure out the smallest amount of work that amortises task +// creation. +static int determine_kappa(double *kappa); + +// How a segop should be scheduled. 
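// (A hedged reading of how these values are used further down, in is_small and
// chunk_subtask: STATIC splits the iteration space into a fixed set of
// non-chunkable subtasks, while DYNAMIC subtasks are chunkable, so idle workers
// may split them and steal half, and DYNAMIC segops are always spread across
// all worker threads.)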
+enum scheduling { + DYNAMIC, + STATIC }; -static void backend_context_config_setup(struct futhark_context_config* cfg) { - (void)cfg; +// How a given task should be executed. Filled out by the scheduler +// and passed to the segop function +struct scheduler_info { + int64_t iter_pr_subtask; + int64_t remainder; + int nsubtasks; + enum scheduling sched; + int wake_up_threads; + + int64_t *task_time; + int64_t *task_iter; +}; + +// A segop function. This is what you hand the scheduler for +// execution. +typedef int (*segop_fn)(void* args, + int64_t iterations, + int tid, + struct scheduler_info info); + +// A task for the scheduler to execute. +struct scheduler_segop { + void *args; + segop_fn top_level_fn; + segop_fn nested_fn; + int64_t iterations; + enum scheduling sched; + + // Pointers to timer and iter associated with the task + int64_t *task_time; + int64_t *task_iter; + + // For debugging + const char* name; +}; + +static inline int scheduler_prepare_task(struct scheduler *scheduler, + struct scheduler_segop *task); + +typedef int (*parloop_fn)(void* args, + int64_t start, + int64_t end, + int subtask_id, + int tid); + +// A parallel parloop task. +struct scheduler_parloop { + void* args; + parloop_fn fn; + int64_t iterations; + struct scheduler_info info; + + // For debugging + const char* name; +}; + +static inline int scheduler_execute_task(struct scheduler *scheduler, + struct scheduler_parloop *task); + +// Then the API implementation. + +#include + +#if defined(_WIN32) +#include +#elif defined(__APPLE__) +#include +// For getting cpu usage of threads +#include +#include +#elif defined(__linux__) +#include +#include +#include +#elif defined(__EMSCRIPTEN__) +#include +#include +#include +#include +#endif + +/* Multicore Utility functions */ + +/* A wrapper for getting rusage on Linux and MacOS */ +/* TODO maybe figure out this for windows */ +static inline int getrusage_thread(struct rusage *rusage) +{ + int err = -1; +#if defined(__APPLE__) + thread_basic_info_data_t info = { 0 }; + mach_msg_type_number_t info_count = THREAD_BASIC_INFO_COUNT; + kern_return_t kern_err; + + kern_err = thread_info(mach_thread_self(), + THREAD_BASIC_INFO, + (thread_info_t)&info, + &info_count); + if (kern_err == KERN_SUCCESS) { + memset(rusage, 0, sizeof(struct rusage)); + rusage->ru_utime.tv_sec = info.user_time.seconds; + rusage->ru_utime.tv_usec = info.user_time.microseconds; + rusage->ru_stime.tv_sec = info.system_time.seconds; + rusage->ru_stime.tv_usec = info.system_time.microseconds; + err = 0; + } else { + errno = EINVAL; + } +#elif defined(__linux__) || __EMSCRIPTEN__ + err = getrusage(RUSAGE_THREAD, rusage); +#endif + return err; } -static void backend_context_config_teardown(struct futhark_context_config* cfg) { - (void)cfg; +/* returns the number of logical cores */ +static int num_processors() +{ +#if defined(_WIN32) +/* https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/ns-sysinfoapi-system_info */ + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + int ncores = sysinfo.dwNumberOfProcessors; + fprintf(stderr, "Found %d cores on your Windows machine\n Is that correct?\n", ncores); + return ncores; +#elif defined(__APPLE__) + int ncores; + size_t ncores_size = sizeof(ncores); + CHECK_ERRNO(sysctlbyname("hw.logicalcpu", &ncores, &ncores_size, NULL, 0), + "sysctlbyname (hw.logicalcpu)"); + return ncores; +#elif defined(__linux__) + return get_nprocs(); +#elif __EMSCRIPTEN__ + return emscripten_num_logical_cores(); +#else + fprintf(stderr, "operating system not recognised\n"); 
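// (Hedged note: when num_threads is left at its default of 0, a -1 here flows
// through backend_context_setup into scheduler_init, whose
// assert(num_workers > 0) will then fail.)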
+ return -1; +#endif } -int futhark_context_config_set_tuning_param(struct futhark_context_config* cfg, const char *param_name, size_t param_value) { - (void)cfg; (void)param_name; (void)param_value; - return 1; +static unsigned int g_seed; + +// Used to seed the generator. +static inline void fast_srand(unsigned int seed) { + g_seed = seed; } -struct futhark_context { - struct futhark_context_config* cfg; - int detail_memory; - int debugging; - int profiling; - int profiling_paused; - int logging; - lock_t lock; - char *error; - lock_t error_lock; - FILE *log; - struct constants *constants; - struct free_list free_list; - int64_t peak_mem_usage_default; - int64_t cur_mem_usage_default; - struct program* program; +// Compute a pseudorandom integer. +// Output value in range [0, 32767] +static inline unsigned int fast_rand(void) { + g_seed = (214013*g_seed+2531011); + return (g_seed>>16)&0x7FFF; +} + +struct subtask_queue { + int capacity; // Size of the buffer. + int first; // Index of the start of the ring buffer. + int num_used; // Number of used elements in the buffer. + struct subtask **buffer; + + pthread_mutex_t mutex; // Mutex used for synchronisation. + pthread_cond_t cond; // Condition variable used for synchronisation. + int dead; + +#if defined(MCPROFILE) + /* Profiling fields */ + uint64_t time_enqueue; + uint64_t time_dequeue; + uint64_t n_dequeues; + uint64_t n_enqueues; +#endif }; -int backend_context_setup(struct futhark_context* ctx) { - (void)ctx; +/* A subtask that can be executed by a worker */ +struct subtask { + /* The parloop function */ + parloop_fn fn; + /* Execution parameters */ + void* args; + int64_t start, end; + int id; + + /* Dynamic scheduling parameters */ + int chunkable; + int64_t chunk_size; + + /* Shared variables across subtasks */ + volatile int *counter; // Counter for ongoing subtasks + // Shared task timers and iterators + int64_t *task_time; + int64_t *task_iter; + + /* For debugging */ + const char *name; +}; + + +struct worker { + pthread_t thread; + struct scheduler *scheduler; /* Reference to the scheduler struct the worker belongs to*/ + struct subtask_queue q; + int dead; + int tid; /* Just a thread id */ + + /* "thread local" time fields used for online algorithm */ + uint64_t timer; + uint64_t total; + int nested; /* How nested the current computation is */ + + // Profiling fields + int output_usage; /* Whether to dump thread usage */ + uint64_t time_spent_working; /* Time spent in parloop functions */ +}; + +static inline void output_worker_usage(struct worker *worker) +{ + struct rusage usage; + CHECK_ERRNO(getrusage_thread(&usage), "getrusage_thread"); + struct timeval user_cpu_time = usage.ru_utime; + struct timeval sys_cpu_time = usage.ru_stime; + fprintf(stderr, "tid: %2d - work time %10llu us - user time: %10llu us - sys: %10llu us\n", + worker->tid, + (long long unsigned)worker->time_spent_working / 1000, + (long long unsigned)(user_cpu_time.tv_sec * 1000000 + user_cpu_time.tv_usec), + (long long unsigned)(sys_cpu_time.tv_sec * 1000000 + sys_cpu_time.tv_usec)); +} + +/* Doubles the size of the queue */ +static inline int subtask_queue_grow_queue(struct subtask_queue *subtask_queue) { + + int new_capacity = 2 * subtask_queue->capacity; +#ifdef MCDEBUG + fprintf(stderr, "Growing queue to %d\n", subtask_queue->capacity * 2); +#endif + + struct subtask **new_buffer = calloc(new_capacity, sizeof(struct subtask*)); + for (int i = 0; i < subtask_queue->num_used; i++) { + new_buffer[i] = subtask_queue->buffer[(subtask_queue->first + i) % 
subtask_queue->capacity]; + } + + free(subtask_queue->buffer); + subtask_queue->buffer = new_buffer; + subtask_queue->capacity = new_capacity; + subtask_queue->first = 0; + return 0; } -void backend_context_teardown(struct futhark_context* ctx) { - (void)ctx; -} +// Initialise a job queue with the given capacity. The queue starts out +// empty. Returns non-zero on error. +static inline int subtask_queue_init(struct subtask_queue *subtask_queue, int capacity) +{ + assert(subtask_queue != NULL); + memset(subtask_queue, 0, sizeof(struct subtask_queue)); + + subtask_queue->capacity = capacity; + subtask_queue->buffer = calloc(capacity, sizeof(struct subtask*)); + if (subtask_queue->buffer == NULL) { + return -1; + } + + CHECK_ERRNO(pthread_mutex_init(&subtask_queue->mutex, NULL), "pthread_mutex_init"); + CHECK_ERRNO(pthread_cond_init(&subtask_queue->cond, NULL), "pthread_cond_init"); -int futhark_context_sync(struct futhark_context* ctx) { - (void)ctx; return 0; } -// End of backends/c.h - -struct program { }; -static void setup_program(struct futhark_context *ctx) +// Destroy the job queue. Blocks until the queue is empty before it +// is destroyed. +static inline int subtask_queue_destroy(struct subtask_queue *subtask_queue) { - (void) ctx; - - int error = 0; - - (void) error; - ctx->program = malloc(sizeof(struct program)); + assert(subtask_queue != NULL); + + CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); + + while (subtask_queue->num_used != 0) { + CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait"); + } + + // Queue is now empty. Let's kill it! + subtask_queue->dead = 1; + free(subtask_queue->buffer); + CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + + return 0; } -static void teardown_program(struct futhark_context *ctx) + +static inline void dump_queue(struct worker *worker) { - (void) ctx; - - int error = 0; - - (void) error; - free(ctx->program); + struct subtask_queue *subtask_queue = &worker->q; + CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); + for (int i = 0; i < subtask_queue->num_used; i++) { + struct subtask * subtask = subtask_queue->buffer[(subtask_queue->first + i) % subtask_queue->capacity]; + printf("queue tid %d with %d task %s\n", worker->tid, i, subtask->name); + } + CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); } -int memblock_unref(struct futhark_context *ctx, struct memblock *block, const char *desc) + +// Push an element onto the end of the job queue. Blocks if the +// subtask_queue is full (its size is equal to its capacity). Returns +// non-zero on error. It is an error to push a job onto a queue that +// has been destroyed. 
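// (Note on the body below: when the queue is full, the implementation grows the
// ring buffer with subtask_queue_grow_queue rather than blocking, so in
// practice only enqueueing onto a dead queue makes this call return non-zero.)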
+static inline int subtask_queue_enqueue(struct worker *worker, struct subtask *subtask ) { - if (block->references != NULL) { - *block->references -= 1; - if (ctx->detail_memory) - fprintf(ctx->log, "Unreferencing block %s (allocated as %s) in %s: %d references remaining.\n", desc, block->desc, "default space", *block->references); - if (*block->references == 0) { - ctx->cur_mem_usage_default -= block->size; - host_free(ctx, (size_t) block->size, desc, (void *) block->mem); - free(block->references); - if (ctx->detail_memory) - fprintf(ctx->log, "%lld bytes freed (now allocated: %lld bytes)\n", (long long) block->size, (long long) ctx->cur_mem_usage_default); - } - block->references = NULL; + assert(worker != NULL); + struct subtask_queue *subtask_queue = &worker->q; + +#ifdef MCPROFILE + uint64_t start = get_wall_time(); +#endif + + CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); + // Wait until there is room in the subtask_queue. + while (subtask_queue->num_used == subtask_queue->capacity && !subtask_queue->dead) { + if (subtask_queue->num_used == subtask_queue->capacity) { + CHECK_ERR(subtask_queue_grow_queue(subtask_queue), "subtask_queue_grow_queue"); + continue; } - return 0; + CHECK_ERR(pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex), "pthread_cond_wait"); + } + + if (subtask_queue->dead) { + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + return -1; + } + + // If we made it past the loop, there is room in the subtask_queue. + subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used) % subtask_queue->capacity] = subtask; + subtask_queue->num_used++; + +#ifdef MCPROFILE + uint64_t end = get_wall_time(); + subtask_queue->time_enqueue += (end - start); + subtask_queue->n_enqueues++; +#endif + // Broadcast a reader (if any) that there is now an element. 
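// (The single condition variable is shared by blocking dequeuers, destroy() and
// would-be blocked producers, hence a broadcast rather than a single signal.)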
+ CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + + return 0; } -int memblock_alloc(struct futhark_context *ctx, struct memblock *block, int64_t size, const char *desc) + + +/* Like subtask_queue_dequeue, but with two differences: + 1) the subtask is stolen from the __front__ of the queue + 2) returns immediately if there is no subtasks queued, + as we dont' want to block on another workers queue and +*/ +static inline int subtask_queue_steal(struct worker *worker, + struct subtask **subtask) { - if (size < 0) - futhark_panic(1, "Negative allocation of %lld bytes attempted for %s in %s.\n", (long long) size, desc, "default space", ctx->cur_mem_usage_default); - - int ret = memblock_unref(ctx, block, desc); - - if (ret != FUTHARK_SUCCESS) - return ret; - if (ctx->detail_memory) - fprintf(ctx->log, "Allocating %lld bytes for %s in %s (then allocated: %lld bytes)", (long long) size, desc, "default space", (long long) ctx->cur_mem_usage_default + size); - if (ctx->cur_mem_usage_default > ctx->peak_mem_usage_default) { - ctx->peak_mem_usage_default = ctx->cur_mem_usage_default; - if (ctx->detail_memory) - fprintf(ctx->log, " (new peak).\n"); - } else if (ctx->detail_memory) - fprintf(ctx->log, ".\n"); - host_alloc(ctx, (size_t) size, desc, (size_t *) &size, (void *) &block->mem); - if (ctx->error == NULL) { - block->references = (int *) malloc(sizeof(int)); - *block->references = 1; - block->size = size; - block->desc = desc; - ctx->cur_mem_usage_default += size; - return FUTHARK_SUCCESS; - } else { - // We are naively assuming that any memory allocation error is due to OOM. - lock_lock(&ctx->error_lock); - - char *old_error = ctx->error; - - ctx->error = msgprintf("Failed to allocate memory in %s.\nAttempted allocation: %12lld bytes\nCurrently allocated: %12lld bytes\n%s", "default space", (long long) size, (long long) ctx->cur_mem_usage_default, old_error); - free(old_error); - lock_unlock(&ctx->error_lock); - return FUTHARK_OUT_OF_MEMORY; - } + struct subtask_queue *subtask_queue = &worker->q; + assert(subtask_queue != NULL); + +#ifdef MCPROFILE + uint64_t start = get_wall_time(); +#endif + CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); + + if (subtask_queue->num_used == 0) { + CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + return 1; + } + + if (subtask_queue->dead) { + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + return -1; + } + + // Tasks gets stolen from the "front" + struct subtask *cur_back = subtask_queue->buffer[subtask_queue->first]; + struct subtask *new_subtask = NULL; + int remaining_iter = cur_back->end - cur_back->start; + // If subtask is chunkable, we steal half of the iterations + if (cur_back->chunkable && remaining_iter > 1) { + int64_t half = remaining_iter / 2; + new_subtask = malloc(sizeof(struct subtask)); + *new_subtask = *cur_back; + new_subtask->start = cur_back->end - half; + cur_back->end = new_subtask->start; + __atomic_fetch_add(cur_back->counter, 1, __ATOMIC_RELAXED); + } else { + new_subtask = cur_back; + subtask_queue->num_used--; + subtask_queue->first = (subtask_queue->first + 1) % subtask_queue->capacity; + } + *subtask = new_subtask; + + if (*subtask == NULL) { + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock"); + return 1; + } 
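// (At this point *subtask is either the whole task taken from the front of the
// victim's queue, or the back half of a chunkable task; in the latter case the
// victim keeps the front half and its join counter was already bumped above.)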
+ +#ifdef MCPROFILE + uint64_t end = get_wall_time(); + subtask_queue->time_dequeue += (end - start); + subtask_queue->n_dequeues++; +#endif + + // Broadcast a writer (if any) that there is now room for more. + CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + + return 0; } -int memblock_set(struct futhark_context *ctx, struct memblock *lhs, struct memblock *rhs, const char *lhs_desc) + + +// Pop an element from the back of the job queue. +// Optional argument can be provided to block or not +static inline int subtask_queue_dequeue(struct worker *worker, + struct subtask **subtask, int blocking) { - int ret = memblock_unref(ctx, lhs, lhs_desc); - - if (rhs->references != NULL) - (*rhs->references)++; - *lhs = *rhs; - return ret; + assert(worker != NULL); + struct subtask_queue *subtask_queue = &worker->q; + +#ifdef MCPROFILE + uint64_t start = get_wall_time(); +#endif + + CHECK_ERR(pthread_mutex_lock(&subtask_queue->mutex), "pthread_mutex_lock"); + if (subtask_queue->num_used == 0 && !blocking) { + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + return 1; + } + // Try to steal some work while the subtask_queue is empty + while (subtask_queue->num_used == 0 && !subtask_queue->dead) { + pthread_cond_wait(&subtask_queue->cond, &subtask_queue->mutex); + } + + if (subtask_queue->dead) { + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + return -1; + } + + // dequeue pops from the back + *subtask = subtask_queue->buffer[(subtask_queue->first + subtask_queue->num_used - 1) % subtask_queue->capacity]; + subtask_queue->num_used--; + + if (*subtask == NULL) { + assert(!"got NULL ptr"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthred_mutex_unlock"); + return -1; + } + +#ifdef MCPROFILE + uint64_t end = get_wall_time(); + subtask_queue->time_dequeue += (end - start); + subtask_queue->n_dequeues++; +#endif + + // Broadcast a writer (if any) that there is now room for more. + CHECK_ERR(pthread_cond_broadcast(&subtask_queue->cond), "pthread_cond_broadcast"); + CHECK_ERR(pthread_mutex_unlock(&subtask_queue->mutex), "pthread_mutex_unlock"); + + return 0; } -void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag) + +static inline int subtask_queue_is_empty(struct subtask_queue *subtask_queue) { - cfg->profiling = cfg->logging = cfg->debugging = flag; + return subtask_queue->num_used == 0; } -void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag) -{ - cfg->profiling = flag; + +/* Scheduler definitions */ + +struct scheduler { + struct worker *workers; + int num_threads; + int minimum_chunk_size; + + + // If there is work to steal => active_work > 0 + volatile int active_work; + + // Only one error can be returned at the time now. 
Maybe we can + // provide a stack like structure for pushing errors onto if we wish + // to backpropagte multiple errors + volatile int error; + + // kappa time unit in nanoseconds + double kappa; +}; + + +// Thread local variable worker struct +// Note that, accesses to tls variables are expensive +// Minimize direct references to this variable +__thread struct worker* worker_local = NULL; + +static int64_t total_now(int64_t total, int64_t time) { + return total + (get_wall_time_ns() - time); } -void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag) -{ - cfg->logging = flag; + +static int random_other_worker(struct scheduler *scheduler, int my_id) { + int my_num_workers = scheduler->num_threads; + assert(my_num_workers != 1); + int i = fast_rand() % (my_num_workers - 1); + if (i >= my_id) { + i++; + } +#ifdef MCDEBUG + assert(i >= 0); + assert(i < my_num_workers); + assert(i != my_id); +#endif + + return i; } -void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f) + +static inline int64_t compute_chunk_size(int64_t minimum_chunk_size, double kappa, struct subtask* subtask) { - cfg->cache_fname = f; + double C = (double)*subtask->task_time / (double)*subtask->task_iter; + if (C == 0.0F) C += DBL_EPSILON; + return smax64((int64_t)(kappa / C), minimum_chunk_size); } -int futhark_get_tuning_param_count(void) + +/* Takes a chunk from subtask and enqueues the remaining iterations onto the worker's queue */ +/* A no-op if the subtask is not chunkable */ +static inline struct subtask* chunk_subtask(struct worker* worker, struct subtask *subtask) { - return num_tuning_params; + if (subtask->chunkable) { + // Do we have information from previous runs avaliable + if (*subtask->task_iter > 0) { + subtask->chunk_size = compute_chunk_size(worker->scheduler->minimum_chunk_size, + worker->scheduler->kappa, + subtask); + assert(subtask->chunk_size > 0); + } + int64_t remaining_iter = subtask->end - subtask->start; + assert(remaining_iter > 0); + if (remaining_iter > subtask->chunk_size) { + struct subtask *new_subtask = malloc(sizeof(struct subtask)); + *new_subtask = *subtask; + // increment the subtask join counter to account for new subtask + __atomic_fetch_add(subtask->counter, 1, __ATOMIC_RELAXED); + // Update range parameters + subtask->end = subtask->start + subtask->chunk_size; + new_subtask->start = subtask->end; + subtask_queue_enqueue(worker, new_subtask); + } + } + return subtask; } -const char *futhark_get_tuning_param_name(int i) + +static inline int run_subtask(struct worker* worker, struct subtask* subtask) { - return tuning_param_names[i]; + assert(subtask != NULL); + assert(worker != NULL); + + subtask = chunk_subtask(worker, subtask); + worker->total = 0; + worker->timer = get_wall_time_ns(); +#if defined(MCPROFILE) + int64_t start = worker->timer; +#endif + worker->nested++; + int err = subtask->fn(subtask->args, subtask->start, subtask->end, + subtask->id, + worker->tid); + worker->nested--; + // Some error occured during some other subtask + // so we just clean-up and return + if (worker->scheduler->error != 0) { + // Even a failed task counts as finished. 
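// (The join counter must still be decremented below, otherwise the join loop in
// scheduler_execute_parloop, which spins on join_counter != 0, would never
// terminate.)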
+ __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED); + free(subtask); + return 0; + } + if (err != 0) { + __atomic_store_n(&worker->scheduler->error, err, __ATOMIC_RELAXED); + } + // Total sequential time spent + int64_t time_elapsed = total_now(worker->total, worker->timer); +#if defined(MCPROFILE) + worker->time_spent_working += get_wall_time_ns() - start; +#endif + int64_t iter = subtask->end - subtask->start; + // report measurements + // These updates should really be done using a single atomic CAS operation + __atomic_fetch_add(subtask->task_time, time_elapsed, __ATOMIC_RELAXED); + __atomic_fetch_add(subtask->task_iter, iter, __ATOMIC_RELAXED); + // We need a fence here, since if the counter is decremented before either + // of the two above are updated bad things can happen, e.g. if they are stack-allocated + __atomic_thread_fence(__ATOMIC_SEQ_CST); + __atomic_fetch_sub(subtask->counter, 1, __ATOMIC_RELAXED); + free(subtask); + return 0; } -const char *futhark_get_tuning_param_class(int i) + + +static inline int is_small(struct scheduler_segop *task, struct scheduler *scheduler, int *nsubtasks) { - return tuning_param_classes[i]; + int64_t time = *task->task_time; + int64_t iter = *task->task_iter; + + if (task->sched == DYNAMIC || iter == 0) { + *nsubtasks = scheduler->num_threads; + return 0; + } + + // Estimate the constant C + double C = (double)time / (double)iter; + double cur_task_iter = (double) task->iterations; + + // Returns true if the task is small i.e. + // if the number of iterations times C is smaller + // than the overhead of subtask creation + if (C == 0.0F || C * cur_task_iter < scheduler->kappa) { + *nsubtasks = 1; + return 1; + } + + // Else compute how many subtasks this tasks should create + int64_t min_iter_pr_subtask = smax64(scheduler->kappa / C, 1); + *nsubtasks = smin64(smax64(task->iterations / min_iter_pr_subtask, 1), scheduler->num_threads); + + return 0; } -char *futhark_context_report(struct futhark_context *ctx) + +// TODO make this prettier +static inline struct subtask* create_subtask(parloop_fn fn, + void* args, + const char* name, + volatile int* counter, + int64_t *timer, + int64_t *iter, + int64_t start, int64_t end, + int chunkable, + int64_t chunk_size, + int id) { - if (futhark_context_sync(ctx) != 0) - return NULL; - - struct str_builder builder; - - str_builder_init(&builder); - { } - if (ctx->profiling) { } - return builder.str; + struct subtask* subtask = malloc(sizeof(struct subtask)); + if (subtask == NULL) { + assert(!"malloc failed in create_subtask"); + return NULL; + } + subtask->fn = fn; + subtask->args = args; + + subtask->counter = counter; + subtask->task_time = timer; + subtask->task_iter = iter; + + subtask->start = start; + subtask->end = end; + subtask->id = id; + subtask->chunkable = chunkable; + subtask->chunk_size = chunk_size; + + subtask->name = name; + return subtask; } -char *futhark_context_get_error(struct futhark_context *ctx) -{ - char *error = ctx->error; - - ctx->error = NULL; - return error; + +static int dummy_counter = 0; +static int64_t dummy_timer = 0; +static int64_t dummy_iter = 0; + +static int dummy_fn(void *args, int64_t start, int64_t end, int subtask_id, int tid) { + (void)args; + (void)start; + (void)end; + (void)subtask_id; + (void)tid; + return 0; } -void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f) -{ - ctx->log = f; + +// Wake up threads, who are blocking by pushing a dummy task +// onto their queue +static inline void wake_up_threads(struct scheduler 
*scheduler, int start_tid, int end_tid) { + +#if defined(MCDEBUG) + assert(start_tid >= 1); + assert(end_tid <= scheduler->num_threads); +#endif + for (int i = start_tid; i < end_tid; i++) { + struct subtask *subtask = create_subtask(dummy_fn, NULL, "dummy_fn", + &dummy_counter, + &dummy_timer, &dummy_iter, + 0, 0, + 0, 0, + 0); + CHECK_ERR(subtask_queue_enqueue(&scheduler->workers[i], subtask), "subtask_queue_enqueue"); + } } -void futhark_context_pause_profiling(struct futhark_context *ctx) + +static inline int is_finished(struct worker *worker) { + return worker->dead && subtask_queue_is_empty(&worker->q); +} + +// Try to steal from a random queue +static inline int steal_from_random_worker(struct worker* worker) { - ctx->profiling_paused = 1; + int my_id = worker->tid; + struct scheduler* scheduler = worker->scheduler; + int k = random_other_worker(scheduler, my_id); + struct worker *worker_k = &scheduler->workers[k]; + struct subtask* subtask = NULL; + int retval = subtask_queue_steal(worker_k, &subtask); + if (retval == 0) { + subtask_queue_enqueue(worker, subtask); + return 1; + } + return 0; } -void futhark_context_unpause_profiling(struct futhark_context *ctx) + + +static inline void *scheduler_worker(void* args) { - ctx->profiling_paused = 0; + struct worker *worker = (struct worker*) args; + struct scheduler *scheduler = worker->scheduler; + worker_local = worker; + struct subtask *subtask = NULL; + + while(!is_finished(worker)) { + if (!subtask_queue_is_empty(&worker->q)) { + int retval = subtask_queue_dequeue(worker, &subtask, 0); + if (retval == 0) { + assert(subtask != NULL); + CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); + } // else someone stole our work + + } else if (scheduler->active_work) { /* steal */ + while (!is_finished(worker) && scheduler->active_work) { + if (steal_from_random_worker(worker)) { + break; + } + } + } else { /* go back to sleep and wait for work */ + int retval = subtask_queue_dequeue(worker, &subtask, 1); + if (retval == 0) { + assert(subtask != NULL); + CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); + } + } + } + + assert(subtask_queue_is_empty(&worker->q)); +#if defined(MCPROFILE) + if (worker->output_usage) + output_worker_usage(worker); +#endif + return NULL; } -int futhark_context_clear_caches(struct futhark_context *ctx) + + +static inline int scheduler_execute_parloop(struct scheduler *scheduler, + struct scheduler_parloop *task, + int64_t *timer) { - lock_lock(&ctx->lock); - ctx->peak_mem_usage_default = 0; - lock_unlock(&ctx->lock); - return ctx->error != NULL; + + struct worker *worker = worker_local; + + struct scheduler_info info = task->info; + int64_t iter_pr_subtask = info.iter_pr_subtask; + int64_t remainder = info.remainder; + int nsubtasks = info.nsubtasks; + volatile int join_counter = nsubtasks; + + // Shared timer used to sum up all + // sequential work from each subtask + int64_t task_timer = 0; + int64_t task_iter = 0; + + enum scheduling sched = info.sched; + /* If each subtasks should be processed in chunks */ + int chunkable = sched == STATIC ? 
0 : 1; + int64_t chunk_size = scheduler->minimum_chunk_size; // The initial chunk size when no info is avaliable + + + if (info.wake_up_threads || sched == DYNAMIC) + __atomic_add_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED); + + int64_t start = 0; + int64_t end = iter_pr_subtask + (int64_t)(remainder != 0); + for (int subtask_id = 0; subtask_id < nsubtasks; subtask_id++) { + struct subtask *subtask = create_subtask(task->fn, task->args, task->name, + &join_counter, + &task_timer, &task_iter, + start, end, + chunkable, chunk_size, + subtask_id); + assert(subtask != NULL); + // In most cases we will never have more subtasks than workers, + // but there can be exceptions (e.g. the kappa tuning function). + struct worker *subtask_worker = + worker->nested + ? &scheduler->workers[worker->tid] + : &scheduler->workers[subtask_id % scheduler->num_threads]; + CHECK_ERR(subtask_queue_enqueue(subtask_worker, subtask), + "subtask_queue_enqueue"); + // Update range params + start = end; + end += iter_pr_subtask + ((subtask_id + 1) < remainder); + } + + if (info.wake_up_threads) { + wake_up_threads(scheduler, nsubtasks, scheduler->num_threads); + } + + // Join (wait for subtasks to finish) + while(join_counter != 0) { + if (!subtask_queue_is_empty(&worker->q)) { + struct subtask *subtask = NULL; + int err = subtask_queue_dequeue(worker, &subtask, 0); + if (err == 0 ) { + CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); + } + } else { + if (steal_from_random_worker(worker)) { + struct subtask *subtask = NULL; + int err = subtask_queue_dequeue(worker, &subtask, 0); + if (err == 0) { + CHECK_ERR(run_subtask(worker, subtask), "run_subtask"); + } + } + } + } + + + if (info.wake_up_threads || sched == DYNAMIC) { + __atomic_sub_fetch(&scheduler->active_work, nsubtasks, __ATOMIC_RELAXED); + } + + // Write back timing results of all sequential work + (*timer) += task_timer; + return scheduler->error; } -// Start of context.h -// Eventually it would be nice to move the context definition in here -// instead of generating it in the compiler. For now it defines -// various helper functions that must be available. +static inline int scheduler_execute_task(struct scheduler *scheduler, + struct scheduler_parloop *task) +{ -// Internal functions. + struct worker *worker = worker_local; -static void set_error(struct futhark_context* ctx, char *error) { - lock_lock(&ctx->error_lock); - if (ctx->error == NULL) { - ctx->error = error; + int err = 0; + + // How much sequential work was performed by the task + int64_t task_timer = 0; + + /* Execute task sequential or parallel based on decision made earlier */ + if (task->info.nsubtasks == 1) { + int64_t start = get_wall_time_ns(); + err = task->fn(task->args, 0, task->iterations, 0, worker->tid); + int64_t end = get_wall_time_ns(); + task_timer = end - start; + worker->time_spent_working += task_timer; + // Report time measurements + // TODO the update of both of these should really be a single atomic!! + __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED); + __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED); } else { - free(error); + // Add "before" time if we already are inside a task + int64_t time_before = 0; + if (worker->nested > 0) { + time_before = total_now(worker->total, worker->timer); + } + + err = scheduler_execute_parloop(scheduler, task, &task_timer); + + // Report time measurements + // TODO the update of both of these should really be a single atomic!! 
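// (Why it matters: compute_chunk_size and is_small derive C as
// *task_time / *task_iter, so a reader that lands between the two separate
// updates below can observe a momentarily inconsistent ratio.)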
+ __atomic_fetch_add(task->info.task_time, task_timer, __ATOMIC_RELAXED); + __atomic_fetch_add(task->info.task_iter, task->iterations, __ATOMIC_RELAXED); + + // Update timers to account for new timings + worker->total = time_before + task_timer; + worker->timer = get_wall_time_ns(); } - lock_unlock(&ctx->error_lock); -} -// XXX: should be static, but used in ispc_util.h -void lexical_realloc_error(struct futhark_context* ctx, size_t new_size) { - set_error(ctx, - msgprintf("Failed to allocate memory.\nAttempted allocation: %12lld bytes\n", - (long long) new_size)); + + return err; } -static int lexical_realloc(struct futhark_context *ctx, - unsigned char **ptr, - int64_t *old_size, - int64_t new_size) { - unsigned char *new = realloc(*ptr, (size_t)new_size); - if (new == NULL) { - lexical_realloc_error(ctx, new_size); - return FUTHARK_OUT_OF_MEMORY; +/* Decide on how schedule the incoming task i.e. how many subtasks and + to run sequential or (potentially nested) parallel code body */ +static inline int scheduler_prepare_task(struct scheduler* scheduler, + struct scheduler_segop *task) +{ + assert(task != NULL); + + struct worker *worker = worker_local; + struct scheduler_info info; + info.task_time = task->task_time; + info.task_iter = task->task_iter; + + int nsubtasks; + // Decide if task should be scheduled sequentially + if (is_small(task, scheduler, &nsubtasks)) { + info.iter_pr_subtask = task->iterations; + info.remainder = 0; + info.nsubtasks = nsubtasks; + return task->top_level_fn(task->args, task->iterations, worker->tid, info); } else { - *ptr = new; - *old_size = new_size; - return FUTHARK_SUCCESS; + info.iter_pr_subtask = task->iterations / nsubtasks; + info.remainder = task->iterations % nsubtasks; + info.sched = task->sched; + switch (task->sched) { + case STATIC: + info.nsubtasks = info.iter_pr_subtask == 0 ? info.remainder : ((task->iterations - info.remainder) / info.iter_pr_subtask); + break; + case DYNAMIC: + // As any thread can take any subtasks, we are being safe with using + // an upper bound on the number of tasks such that the task allocate enough memory + info.nsubtasks = info.iter_pr_subtask == 0 ? info.remainder : nsubtasks; + break; + default: + assert(!"Got unknown scheduling"); + } } -} -static void free_all_in_free_list(struct futhark_context* ctx) { - fl_mem mem; - free_list_pack(&ctx->free_list); - while (free_list_first(&ctx->free_list, (fl_mem*)&mem) == 0) { - free((void*)mem); + info.wake_up_threads = 0; + // We only use the nested parallel segop function if we can't exchaust all cores + // using the outer most level + if (task->nested_fn != NULL && info.nsubtasks < scheduler->num_threads && info.nsubtasks == task->iterations) { + if (worker->nested == 0) + info.wake_up_threads = 1; + return task->nested_fn(task->args, task->iterations, worker->tid, info); } -} -static int is_small_alloc(size_t size) { - return size < 1024*1024; + return task->top_level_fn(task->args, task->iterations, worker->tid, info); } -static void host_alloc(struct futhark_context* ctx, - size_t size, const char* tag, size_t* size_out, void** mem_out) { - if (is_small_alloc(size) || free_list_find(&ctx->free_list, size, tag, size_out, (fl_mem*)mem_out) != 0) { - *size_out = size; - *mem_out = malloc(size); +// Now some code for finding the proper value of kappa on a given +// machine (the smallest amount of work that amortises the cost of +// task creation). 
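// (Illustrative arithmetic only, with a made-up per-iteration cost: using the
// default kappa of 5.1 us = 5100 ns from backend_context_setup and a measured
// C of 2 ns per iteration, compute_chunk_size above gives
// max(5100 / 2, minimum_chunk_size) = 2550 iterations per chunk, i.e. every
// dequeued or stolen chunk carries roughly 5.1 us of work, enough to amortise
// the cost of creating it.)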
+ +struct tuning_struct { + int32_t *free_tuning_res; + int32_t *array; +}; + +// Reduction function over an integer array +static int tuning_loop(void *args, int64_t start, int64_t end, + int flat_tid, int tid) { + (void)flat_tid; + (void)tid; + + int err = 0; + struct tuning_struct *tuning_struct = (struct tuning_struct *) args; + int32_t *array = tuning_struct->array; + int32_t *tuning_res = tuning_struct->free_tuning_res; + + int32_t sum = 0; + for (int i = start; i < end; i++) { + int32_t y = array[i]; + sum = add32(sum, y); } + *tuning_res = sum; + return err; } -static void host_free(struct futhark_context* ctx, - size_t size, const char* tag, void* mem) { - // Small allocations are handled by malloc()s own free list. The - // threshold here is kind of arbitrary, but seems to work OK. - // Larger allocations are mmap()ed/munmapped() every time, which is - // very slow, and Futhark programs tend to use a few very large - // allocations. - if (is_small_alloc(size)) { - free(mem); - } else { - free_list_insert(&ctx->free_list, size, (fl_mem)mem, tag); +// The main entry point for the tuning process. Sets the provided +// variable ``kappa``. +static int determine_kappa(double *kappa) { + int err = 0; + + int64_t iterations = 100000000; + int64_t tuning_time = 0; + int64_t tuning_iter = 0; + + int32_t *array = malloc(sizeof(int32_t) * iterations); + for (int64_t i = 0; i < iterations; i++) { + array[i] = fast_rand(); + } + + int64_t start_tuning = get_wall_time_ns(); + /* **************************** */ + /* Run sequential reduce first' */ + /* **************************** */ + int64_t tuning_sequentiual_start = get_wall_time_ns(); + struct tuning_struct tuning_struct; + int32_t tuning_res; + tuning_struct.free_tuning_res = &tuning_res; + tuning_struct.array = array; + + err = tuning_loop(&tuning_struct, 0, iterations, 0, 0); + int64_t tuning_sequentiual_end = get_wall_time_ns(); + int64_t sequential_elapsed = tuning_sequentiual_end - tuning_sequentiual_start; + + double C = (double)sequential_elapsed / (double)iterations; + fprintf(stderr, " Time for sequential run is %lld - Found C %f\n", (long long)sequential_elapsed, C); + + /* ********************** */ + /* Now run tuning process */ + /* ********************** */ + // Setup a scheduler with a single worker + struct scheduler scheduler; + scheduler.num_threads = 1; + scheduler.workers = malloc(sizeof(struct worker)); + worker_local = &scheduler.workers[0]; + worker_local->tid = 0; + CHECK_ERR(subtask_queue_init(&scheduler.workers[0].q, 1024), + "failed to init queue for worker %d\n", 0); + + // Start tuning for kappa + double kappa_tune = 1000; // Initial kappa is 1 us + double ratio; + int64_t time_elapsed; + while(1) { + int64_t min_iter_pr_subtask = (int64_t) (kappa_tune / C) == 0 ? 
1 : (kappa_tune / C); + int nsubtasks = iterations / min_iter_pr_subtask; + struct scheduler_info info; + info.iter_pr_subtask = min_iter_pr_subtask; + + info.nsubtasks = iterations / min_iter_pr_subtask; + info.remainder = iterations % min_iter_pr_subtask; + info.task_time = &tuning_time; + info.task_iter = &tuning_iter; + info.sched = STATIC; + + struct scheduler_parloop parloop; + parloop.name = "tuning_loop"; + parloop.fn = tuning_loop; + parloop.args = &tuning_struct; + parloop.iterations = iterations; + parloop.info = info; + + int64_t tuning_chunked_start = get_wall_time_ns(); + int determine_kappa_err = + scheduler_execute_task(&scheduler, + &parloop); + assert(determine_kappa_err == 0); + int64_t tuning_chunked_end = get_wall_time_ns(); + time_elapsed = tuning_chunked_end - tuning_chunked_start; + + ratio = (double)time_elapsed / (double)sequential_elapsed; + if (ratio < 1.055) { + break; + } + kappa_tune += 100; // Increase by 100 ns at the time + fprintf(stderr, "nsubtask %d - kappa %f - ratio %f\n", nsubtasks, kappa_tune, ratio); } + + int64_t end_tuning = get_wall_time_ns(); + fprintf(stderr, "tuning took %lld ns and found kappa %f - time %lld - ratio %f\n", + (long long)end_tuning - start_tuning, + kappa_tune, + (long long)time_elapsed, + ratio); + *kappa = kappa_tune; + + // Clean-up + CHECK_ERR(subtask_queue_destroy(&scheduler.workers[0].q), "failed to destroy queue"); + free(array); + free(scheduler.workers); + return err; +} + +static int scheduler_init(struct scheduler *scheduler, + int num_workers, + double kappa) { +#ifdef FUTHARK_BACKEND_ispc + int64_t get_gang_size(); + scheduler->minimum_chunk_size = get_gang_size(); +#else + scheduler->minimum_chunk_size = 1; +#endif + + assert(num_workers > 0); + + scheduler->kappa = kappa; + scheduler->num_threads = num_workers; + scheduler->active_work = 0; + scheduler->error = 0; + + scheduler->workers = calloc(num_workers, sizeof(struct worker)); + + const int queue_capacity = 1024; + + worker_local = &scheduler->workers[0]; + worker_local->tid = 0; + worker_local->scheduler = scheduler; + CHECK_ERR(subtask_queue_init(&worker_local->q, queue_capacity), + "failed to init queue for worker %d\n", 0); + + for (int i = 1; i < num_workers; i++) { + struct worker *cur_worker = &scheduler->workers[i]; + memset(cur_worker, 0, sizeof(struct worker)); + cur_worker->tid = i; + cur_worker->output_usage = 0; + cur_worker->scheduler = scheduler; + CHECK_ERR(subtask_queue_init(&cur_worker->q, queue_capacity), + "failed to init queue for worker %d\n", i); + + CHECK_ERR(pthread_create(&cur_worker->thread, + NULL, + &scheduler_worker, + cur_worker), + "Failed to create worker %d\n", i); + } + + return 0; } -struct futhark_context_config* futhark_context_config_new(void) { - struct futhark_context_config* cfg = malloc(sizeof(struct futhark_context_config)); - if (cfg == NULL) { - return NULL; +static int scheduler_destroy(struct scheduler *scheduler) { + // We assume that this function is called by the thread controlling + // the first worker, which is why we treat scheduler->workers[0] + // specially here. + + // First mark them all as dead. 
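// (The loops below start at i = 1 because index 0 is the calling thread's own
// worker slot; its queue is destroyed last, after the other threads have been
// joined.)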
+ for (int i = 1; i < scheduler->num_threads; i++) { + struct worker *cur_worker = &scheduler->workers[i]; + cur_worker->dead = 1; } - cfg->in_use = 0; - cfg->debugging = 0; - cfg->profiling = 0; - cfg->logging = 0; - cfg->cache_fname = NULL; - cfg->num_tuning_params = num_tuning_params; - cfg->tuning_params = malloc(cfg->num_tuning_params * sizeof(int64_t)); - memcpy(cfg->tuning_params, tuning_param_defaults, - cfg->num_tuning_params * sizeof(int64_t)); - cfg->tuning_param_names = tuning_param_names; - cfg->tuning_param_vars = tuning_param_vars; - cfg->tuning_param_classes = tuning_param_classes; - backend_context_config_setup(cfg); - return cfg; + + // Then destroy their task queues (this will wake up the threads and + // make them do their shutdown). + for (int i = 1; i < scheduler->num_threads; i++) { + struct worker *cur_worker = &scheduler->workers[i]; + subtask_queue_destroy(&cur_worker->q); + } + + // Then actually wait for them to stop. + for (int i = 1; i < scheduler->num_threads; i++) { + struct worker *cur_worker = &scheduler->workers[i]; + CHECK_ERR(pthread_join(scheduler->workers[i].thread, NULL), "pthread_join"); + } + + // And then destroy our own queue. + subtask_queue_destroy(&scheduler->workers[0].q); + + free(scheduler->workers); + + return 0; } -void futhark_context_config_free(struct futhark_context_config* cfg) { - assert(!cfg->in_use); - backend_context_config_teardown(cfg); - free(cfg->tuning_params); - free(cfg); +// End of scheduler.h + +// Start of backends/multicore.h + +struct futhark_context_config { + int in_use; + int debugging; + int profiling; + int logging; + const char *cache_fname; + int num_tuning_params; + int64_t *tuning_params; + const char** tuning_param_names; + const char** tuning_param_vars; + const char** tuning_param_classes; + // Uniform fields above. + + int num_threads; +}; + +static void backend_context_config_setup(struct futhark_context_config* cfg) { + cfg->num_threads = 0; +} + +static void backend_context_config_teardown(struct futhark_context_config* cfg) { + (void)cfg; +} + +void futhark_context_config_set_num_threads(struct futhark_context_config *cfg, int n) { + cfg->num_threads = n; +} + +int futhark_context_config_set_tuning_param(struct futhark_context_config* cfg, const char *param_name, size_t param_value) { + (void)cfg; (void)param_name; (void)param_value; + return 1; +} + +struct futhark_context { + struct futhark_context_config* cfg; + int detail_memory; + int debugging; + int profiling; + int profiling_paused; + int logging; + lock_t lock; + char *error; + lock_t error_lock; + FILE *log; + struct constants *constants; + struct free_list free_list; + int64_t peak_mem_usage_default; + int64_t cur_mem_usage_default; + struct program* program; + // Uniform fields above. + + struct scheduler scheduler; + int total_runs; + long int total_runtime; + int64_t tuning_timing; + int64_t tuning_iter; +}; + +int backend_context_setup(struct futhark_context* ctx) { + // Initialize rand() + fast_srand(time(0)); + + int tune_kappa = 0; + double kappa = 5.1f * 1000; + + if (tune_kappa) { + if (determine_kappa(&kappa) != 0) { + ctx->error = strdup("Failed to determine kappa."); + return 1; + } + } + + if (scheduler_init(&ctx->scheduler, + ctx->cfg->num_threads > 0 ? 
+ ctx->cfg->num_threads : num_processors(), + kappa) != 0) { + ctx->error = strdup("Failed to initialise scheduler."); + return 1; + } + + return 0; +} + +void backend_context_teardown(struct futhark_context* ctx) { + (void)scheduler_destroy(&ctx->scheduler); +} + +int futhark_context_sync(struct futhark_context* ctx) { + (void)ctx; + return 0; +} + +// End of backends/multicore.h + +struct program { + int64_t *futhark_mc_segmap_parloop_24514_total_runtime; + int *futhark_mc_segmap_parloop_24514_runs; + int64_t *futhark_mc_segmap_parloop_24514_iter; + int64_t futhark_mc_segmap_parloop_24514_total_total_runtime; + int futhark_mc_segmap_parloop_24514_total_runs; + int64_t futhark_mc_segmap_parloop_24514_total_iter; + int64_t *futhark_mc_segmap_task_24512_total_runtime; + int *futhark_mc_segmap_task_24512_runs; + int64_t *futhark_mc_segmap_task_24512_iter; + int64_t futhark_mc_segmap_task_24512_total_time; + int64_t futhark_mc_segmap_task_24512_total_iter; + int64_t *futhark_mc_segmap_parloop_24521_total_runtime; + int *futhark_mc_segmap_parloop_24521_runs; + int64_t *futhark_mc_segmap_parloop_24521_iter; + int64_t futhark_mc_segmap_parloop_24521_total_total_runtime; + int futhark_mc_segmap_parloop_24521_total_runs; + int64_t futhark_mc_segmap_parloop_24521_total_iter; + int64_t *futhark_mc_segmap_task_24519_total_runtime; + int *futhark_mc_segmap_task_24519_runs; + int64_t *futhark_mc_segmap_task_24519_iter; + int64_t futhark_mc_segmap_task_24519_total_time; + int64_t futhark_mc_segmap_task_24519_total_iter; + int64_t *futhark_mc_segmap_parloop_24517_total_runtime; + int *futhark_mc_segmap_parloop_24517_runs; + int64_t *futhark_mc_segmap_parloop_24517_iter; + int64_t futhark_mc_segmap_parloop_24517_total_total_runtime; + int futhark_mc_segmap_parloop_24517_total_runs; + int64_t futhark_mc_segmap_parloop_24517_total_iter; + int64_t *futhark_mc_segmap_nested_task_24515_total_runtime; + int *futhark_mc_segmap_nested_task_24515_runs; + int64_t *futhark_mc_segmap_nested_task_24515_iter; + int64_t *futhark_mc_copy_parloop_24525_total_runtime; + int *futhark_mc_copy_parloop_24525_runs; + int64_t *futhark_mc_copy_parloop_24525_iter; + int64_t futhark_mc_copy_parloop_24525_total_total_runtime; + int futhark_mc_copy_parloop_24525_total_runs; + int64_t futhark_mc_copy_parloop_24525_total_iter; + int64_t *futhark_mc_copy_22728_task_24523_total_runtime; + int *futhark_mc_copy_22728_task_24523_runs; + int64_t *futhark_mc_copy_22728_task_24523_iter; + int64_t futhark_mc_copy_22728_task_24523_total_time; + int64_t futhark_mc_copy_22728_task_24523_total_iter; + int64_t *futhark_mc_copy_parloop_24529_total_runtime; + int *futhark_mc_copy_parloop_24529_runs; + int64_t *futhark_mc_copy_parloop_24529_iter; + int64_t futhark_mc_copy_parloop_24529_total_total_runtime; + int futhark_mc_copy_parloop_24529_total_runs; + int64_t futhark_mc_copy_parloop_24529_total_iter; + int64_t *futhark_mc_copy_22738_task_24527_total_runtime; + int *futhark_mc_copy_22738_task_24527_runs; + int64_t *futhark_mc_copy_22738_task_24527_iter; + int64_t futhark_mc_copy_22738_task_24527_total_time; + int64_t futhark_mc_copy_22738_task_24527_total_iter; + int64_t *futhark_mc_segmap_parloop_24534_total_runtime; + int *futhark_mc_segmap_parloop_24534_runs; + int64_t *futhark_mc_segmap_parloop_24534_iter; + int64_t futhark_mc_segmap_parloop_24534_total_total_runtime; + int futhark_mc_segmap_parloop_24534_total_runs; + int64_t futhark_mc_segmap_parloop_24534_total_iter; + int64_t *futhark_mc_segmap_task_24532_total_runtime; + int 
*futhark_mc_segmap_task_24532_runs; + int64_t *futhark_mc_segmap_task_24532_iter; + int64_t futhark_mc_segmap_task_24532_total_time; + int64_t futhark_mc_segmap_task_24532_total_iter; + int64_t *futhark_mc_segmap_parloop_24563_total_runtime; + int *futhark_mc_segmap_parloop_24563_runs; + int64_t *futhark_mc_segmap_parloop_24563_iter; + int64_t futhark_mc_segmap_parloop_24563_total_total_runtime; + int futhark_mc_segmap_parloop_24563_total_runs; + int64_t futhark_mc_segmap_parloop_24563_total_iter; + int64_t *futhark_mc_segmap_task_24561_total_runtime; + int *futhark_mc_segmap_task_24561_runs; + int64_t *futhark_mc_segmap_task_24561_iter; + int64_t futhark_mc_segmap_task_24561_total_time; + int64_t futhark_mc_segmap_task_24561_total_iter; + int64_t *futhark_mc_segred_stage_1_parloop_24571_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24571_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24571_iter; + int64_t futhark_mc_segred_stage_1_parloop_24571_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24571_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24571_total_iter; + int64_t *futhark_mc_segred_task_24568_total_runtime; + int *futhark_mc_segred_task_24568_runs; + int64_t *futhark_mc_segred_task_24568_iter; + int64_t futhark_mc_segred_task_24568_total_time; + int64_t futhark_mc_segred_task_24568_total_iter; + int64_t *futhark_mc_segmap_parloop_24566_total_runtime; + int *futhark_mc_segmap_parloop_24566_runs; + int64_t *futhark_mc_segmap_parloop_24566_iter; + int64_t futhark_mc_segmap_parloop_24566_total_total_runtime; + int futhark_mc_segmap_parloop_24566_total_runs; + int64_t futhark_mc_segmap_parloop_24566_total_iter; + int64_t *futhark_mc_segmap_nested_task_24564_total_runtime; + int *futhark_mc_segmap_nested_task_24564_runs; + int64_t *futhark_mc_segmap_nested_task_24564_iter; + int64_t *futhark_mc_segmap_parloop_24575_total_runtime; + int *futhark_mc_segmap_parloop_24575_runs; + int64_t *futhark_mc_segmap_parloop_24575_iter; + int64_t futhark_mc_segmap_parloop_24575_total_total_runtime; + int futhark_mc_segmap_parloop_24575_total_runs; + int64_t futhark_mc_segmap_parloop_24575_total_iter; + int64_t *futhark_mc_segmap_task_24573_total_runtime; + int *futhark_mc_segmap_task_24573_runs; + int64_t *futhark_mc_segmap_task_24573_iter; + int64_t futhark_mc_segmap_task_24573_total_time; + int64_t futhark_mc_segmap_task_24573_total_iter; + int64_t *futhark_mc_segred_stage_1_parloop_24583_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24583_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24583_iter; + int64_t futhark_mc_segred_stage_1_parloop_24583_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24583_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24583_total_iter; + int64_t *futhark_mc_segred_task_24580_total_runtime; + int *futhark_mc_segred_task_24580_runs; + int64_t *futhark_mc_segred_task_24580_iter; + int64_t futhark_mc_segred_task_24580_total_time; + int64_t futhark_mc_segred_task_24580_total_iter; + int64_t *futhark_mc_segmap_parloop_24578_total_runtime; + int *futhark_mc_segmap_parloop_24578_runs; + int64_t *futhark_mc_segmap_parloop_24578_iter; + int64_t futhark_mc_segmap_parloop_24578_total_total_runtime; + int futhark_mc_segmap_parloop_24578_total_runs; + int64_t futhark_mc_segmap_parloop_24578_total_iter; + int64_t *futhark_mc_segmap_nested_task_24576_total_runtime; + int *futhark_mc_segmap_nested_task_24576_runs; + int64_t *futhark_mc_segmap_nested_task_24576_iter; + int64_t 
*futhark_mc_segred_stage_1_parloop_24589_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24589_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24589_iter; + int64_t futhark_mc_segred_stage_1_parloop_24589_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24589_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24589_total_iter; + int64_t *futhark_mc_segred_task_24585_total_runtime; + int *futhark_mc_segred_task_24585_runs; + int64_t *futhark_mc_segred_task_24585_iter; + int64_t futhark_mc_segred_task_24585_total_time; + int64_t futhark_mc_segred_task_24585_total_iter; + int64_t *futhark_mc_segred_stage_1_parloop_24595_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24595_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24595_iter; + int64_t futhark_mc_segred_stage_1_parloop_24595_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24595_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24595_total_iter; + int64_t *futhark_mc_segred_task_24591_total_runtime; + int *futhark_mc_segred_task_24591_runs; + int64_t *futhark_mc_segred_task_24591_iter; + int64_t futhark_mc_segred_task_24591_total_time; + int64_t futhark_mc_segred_task_24591_total_iter; + int64_t *futhark_mc_segred_stage_1_parloop_24604_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24604_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24604_iter; + int64_t futhark_mc_segred_stage_1_parloop_24604_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24604_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24604_total_iter; + int64_t *futhark_mc_segred_task_24597_total_runtime; + int *futhark_mc_segred_task_24597_runs; + int64_t *futhark_mc_segred_task_24597_iter; + int64_t futhark_mc_segred_task_24597_total_time; + int64_t futhark_mc_segred_task_24597_total_iter; + int64_t *futhark_mc_copy_parloop_24608_total_runtime; + int *futhark_mc_copy_parloop_24608_runs; + int64_t *futhark_mc_copy_parloop_24608_iter; + int64_t futhark_mc_copy_parloop_24608_total_total_runtime; + int futhark_mc_copy_parloop_24608_total_runs; + int64_t futhark_mc_copy_parloop_24608_total_iter; + int64_t *futhark_mc_copy_23437_task_24606_total_runtime; + int *futhark_mc_copy_23437_task_24606_runs; + int64_t *futhark_mc_copy_23437_task_24606_iter; + int64_t futhark_mc_copy_23437_task_24606_total_time; + int64_t futhark_mc_copy_23437_task_24606_total_iter; + int64_t *futhark_mc_segmap_parloop_24612_total_runtime; + int *futhark_mc_segmap_parloop_24612_runs; + int64_t *futhark_mc_segmap_parloop_24612_iter; + int64_t futhark_mc_segmap_parloop_24612_total_total_runtime; + int futhark_mc_segmap_parloop_24612_total_runs; + int64_t futhark_mc_segmap_parloop_24612_total_iter; + int64_t *futhark_mc_segmap_task_24610_total_runtime; + int *futhark_mc_segmap_task_24610_runs; + int64_t *futhark_mc_segmap_task_24610_iter; + int64_t futhark_mc_segmap_task_24610_total_time; + int64_t futhark_mc_segmap_task_24610_total_iter; + int64_t *futhark_mc_segmap_parloop_24617_total_runtime; + int *futhark_mc_segmap_parloop_24617_runs; + int64_t *futhark_mc_segmap_parloop_24617_iter; + int64_t futhark_mc_segmap_parloop_24617_total_total_runtime; + int futhark_mc_segmap_parloop_24617_total_runs; + int64_t futhark_mc_segmap_parloop_24617_total_iter; + int64_t *futhark_mc_segmap_task_24615_total_runtime; + int *futhark_mc_segmap_task_24615_runs; + int64_t *futhark_mc_segmap_task_24615_iter; + int64_t futhark_mc_segmap_task_24615_total_time; + int64_t futhark_mc_segmap_task_24615_total_iter; + int64_t 
*futhark_mc_segred_stage_1_parloop_24625_total_runtime; + int *futhark_mc_segred_stage_1_parloop_24625_runs; + int64_t *futhark_mc_segred_stage_1_parloop_24625_iter; + int64_t futhark_mc_segred_stage_1_parloop_24625_total_total_runtime; + int futhark_mc_segred_stage_1_parloop_24625_total_runs; + int64_t futhark_mc_segred_stage_1_parloop_24625_total_iter; + int64_t *futhark_mc_segred_task_24622_total_runtime; + int *futhark_mc_segred_task_24622_runs; + int64_t *futhark_mc_segred_task_24622_iter; + int64_t futhark_mc_segred_task_24622_total_time; + int64_t futhark_mc_segred_task_24622_total_iter; + int64_t *futhark_mc_segmap_parloop_24629_total_runtime; + int *futhark_mc_segmap_parloop_24629_runs; + int64_t *futhark_mc_segmap_parloop_24629_iter; + int64_t futhark_mc_segmap_parloop_24629_total_total_runtime; + int futhark_mc_segmap_parloop_24629_total_runs; + int64_t futhark_mc_segmap_parloop_24629_total_iter; + int64_t *futhark_mc_segmap_task_24627_total_runtime; + int *futhark_mc_segmap_task_24627_runs; + int64_t *futhark_mc_segmap_task_24627_iter; + int64_t futhark_mc_segmap_task_24627_total_time; + int64_t futhark_mc_segmap_task_24627_total_iter; + int64_t *futhark_mc_segmap_parloop_24620_total_runtime; + int *futhark_mc_segmap_parloop_24620_runs; + int64_t *futhark_mc_segmap_parloop_24620_iter; + int64_t futhark_mc_segmap_parloop_24620_total_total_runtime; + int futhark_mc_segmap_parloop_24620_total_runs; + int64_t futhark_mc_segmap_parloop_24620_total_iter; +
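// (A minimal host-side usage sketch, not part of the generated diff; it uses
// only API functions declared or defined above, namely futhark_context_config_new,
// futhark_context_config_set_num_threads, futhark_context_new,
// futhark_context_sync, futhark_context_get_error, futhark_context_free and
// futhark_context_config_free, and leaves the pca.h entry points elided.)

#include <stdio.h>
#include <stdlib.h>
#include "pca.h"

int main(void) {
  struct futhark_context_config *cfg = futhark_context_config_new();
  // Leaving this at 0 (the default) means one worker per logical core,
  // via num_processors(); any positive value overrides that.
  futhark_context_config_set_num_threads(cfg, 4);
  struct futhark_context *ctx = futhark_context_new(cfg);

  /* ... call the entry points declared in pca.h here ... */

  if (futhark_context_sync(ctx) != 0) {
    char *err = futhark_context_get_error(ctx);
    fprintf(stderr, "%s\n", err != NULL ? err : "unknown error");
    free(err);
  }

  futhark_context_free(ctx);
  futhark_context_config_free(cfg);
  return 0;
}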