From 6d3684d1b32087b385bebce9ea0fa22cb522ab21 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Thu, 20 Jun 2024 17:08:56 +1000 Subject: update futhark --- pca.c | 27483 +++++++++++++++++++++++++++++--------------------------------- pca.fut | 6 +- pca.h | 7 +- 3 files changed, 12729 insertions(+), 14767 deletions(-) diff --git a/pca.c b/pca.c index a8bd404..f0d92ab 100644 --- a/pca.c +++ b/pca.c @@ -1,4 +1,5 @@ -// Generated by Futhark 0.24.0 (prerelease - include info below when reporting bugs) +// Generated by Futhark 0.25.17. +// Compiled with GHC 9.6.5. // We need to define _GNU_SOURCE before // _any_ headers files are imported to get @@ -11,11 +12,14 @@ #ifdef __clang__ #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-const-variable" #pragma clang diagnostic ignored "-Wparentheses" #pragma clang diagnostic ignored "-Wunused-label" +#pragma clang diagnostic ignored "-Wunused-but-set-variable" #elif __GNUC__ #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-const-variable" #pragma GCC diagnostic ignored "-Wparentheses" #pragma GCC diagnostic ignored "-Wunused-label" #pragma GCC diagnostic ignored "-Wunused-but-set-variable" @@ -51,7 +55,7 @@ const char *futhark_get_tuning_param_class(int); // Arrays struct futhark_f64_2d; struct futhark_f64_2d *futhark_new_f64_2d(struct futhark_context *ctx, const double *data, int64_t dim0, int64_t dim1); -struct futhark_f64_2d *futhark_new_raw_f64_2d(struct futhark_context *ctx, const unsigned char *data, int64_t offset, int64_t dim0, int64_t dim1); +struct futhark_f64_2d *futhark_new_raw_f64_2d(struct futhark_context *ctx, unsigned char *data, int64_t dim0, int64_t dim1); int futhark_free_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr); int futhark_values_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr, 
double *data); unsigned char *futhark_values_raw_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr); @@ -67,11 +71,11 @@ int futhark_entry_pcaWithQuantile(struct futhark_context *ctx, struct futhark_f6 // Miscellaneous int futhark_context_sync(struct futhark_context *ctx); void futhark_context_config_set_cache_file(struct futhark_context_config *cfg, const char *f); -char *futhark_context_report(struct futhark_context *ctx); char *futhark_context_get_error(struct futhark_context *ctx); void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f); void futhark_context_pause_profiling(struct futhark_context *ctx); void futhark_context_unpause_profiling(struct futhark_context *ctx); +char *futhark_context_report(struct futhark_context *ctx); int futhark_context_clear_caches(struct futhark_context *ctx); #define FUTHARK_BACKEND_multicore #define FUTHARK_SUCCESS 0 @@ -93,6 +97,7 @@ int futhark_context_clear_caches(struct futhark_context *ctx); #undef NDEBUG #include #include +#define SCALAR_FUN_ATTR static inline // Start of util.h. // // Various helper functions that are useful in all generated C code. @@ -231,6 +236,44 @@ static void str_builder(struct str_builder *b, const char *s, ...) 
{
   b->used += needed;
 }
+static void str_builder_str(struct str_builder *b, const char *s) { // Append s to the builder, keeping it NUL-terminated.
+  size_t needed = strlen(s);
+  if (b->capacity < b->used + needed + 1) {
+    b->capacity = 2 * (b->used + needed + 1); // a single doubling may be too small for a long s
+    b->str = realloc(b->str, b->capacity);
+  }
+  strcpy(b->str+b->used, s);
+  b->used += needed;
+}
+
+static void str_builder_char(struct str_builder *b, char c) { // Append one character, keeping the buffer NUL-terminated.
+  size_t needed = 1;
+  if (b->capacity < b->used + needed + 1) {
+    b->capacity = 2 * (b->used + needed + 1); // same growth rule as str_builder_str
+    b->str = realloc(b->str, b->capacity);
+  }
+  b->str[b->used] = c;
+  b->str[b->used+1] = 0;
+  b->used += needed;
+}
+
+static void str_builder_json_str(struct str_builder* sb, const char* s) { // Append s wrapped in double quotes.
+  str_builder_char(sb, '"');
+  for (int j = 0; s[j]; j++) {
+    char c = s[j];
+    switch (c) { // NOTE(review): only '\n' and '"' are escaped; '\\' and other control characters pass through unchanged
+    case '\n':
+      str_builder_str(sb, "\\n");
+      break;
+    case '"':
+      str_builder_str(sb, "\\\"");
+      break;
+    default:
+      str_builder_char(sb, c);
+    }
+  }
+  str_builder_char(sb, '"');
+}
 
 static char *strclone(const char *str) {
   size_t size = strlen(str) + 1;
@@ -243,6 +286,25 @@ static char *strclone(const char *str) {
   return copy;
 }
 
+// Joins a NULL-terminated array of strings into one newly allocated string owned by the caller.
+static char *strconcat(const char *src_fragments[]) {
+  size_t src_len = 0;
+  const char **p;
+
+  for (p = src_fragments; *p; p++) {
+    src_len += strlen(*p);
+  }
+
+  char *src = (char*) calloc(src_len + 1, 1); // zero-filled, so an empty fragment list still yields ""
+  size_t n = 0;
+  for (p = src_fragments; *p; p++) {
+    strcpy(src + n, *p);
+    n += strlen(*p);
+  }
+
+  return src;
+}
+
 // End of util.h.
// Start of cache.h @@ -611,7 +673,7 @@ __constant static const unsigned short offset_table[64] = { 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; -static uint16_t float2halfbits(float value) { +SCALAR_FUN_ATTR uint16_t float2halfbits(float value) { union { float x; uint32_t y; } u; u.x = value; uint32_t bits = u.y; @@ -621,7 +683,7 @@ static uint16_t float2halfbits(float value) { return hbits; } -static float halfbits2float(uint16_t value) { +SCALAR_FUN_ATTR float halfbits2float(uint16_t value) { uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; union { uint32_t x; float y; } u; @@ -629,7 +691,7 @@ static float halfbits2float(uint16_t value) { return u.y; } -static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) { +SCALAR_FUN_ATTR uint16_t halfbitsnextafter(uint16_t from, uint16_t to) { int fabs = from & 0x7FFF, tabs = to & 0x7FFF; if(fabs > 0x7C00 || tabs > 0x7C00) { return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200); @@ -932,6 +994,73 @@ static int free_list_first(struct free_list *l, fl_mem *mem_out) { } // End of free_list.h. 
+// Start of event_list.h
+
+typedef int (*event_report_fn)(struct str_builder*, void*); // nonzero return is treated as failure by report_events_in_list
+
+struct event {
+  void* data; // passed as the second argument to f
+  event_report_fn f;
+  const char* name;
+  char *description; // freed by report_events_in_list
+};
+
+struct event_list {
+  struct event *events;
+  int num_events;
+  int capacity;
+};
+
+static void event_list_init(struct event_list *l) { // Start empty with room for 100 events.
+  l->capacity = 100;
+  l->num_events = 0;
+  l->events = calloc(l->capacity, sizeof(struct event));
+}
+
+static void event_list_free(struct event_list *l) { // Frees only the array, not the descriptions.
+  free(l->events);
+}
+
+static void add_event_to_list(struct event_list *l,
+                              const char* name,
+                              char* description,
+                              void* data,
+                              event_report_fn f) { // Append one event, doubling the array when it is full.
+  if (l->num_events == l->capacity) {
+    l->capacity *= 2;
+    l->events = realloc(l->events, l->capacity * sizeof(struct event));
+  }
+  l->events[l->num_events].name = name;
+  l->events[l->num_events].description = description;
+  l->events[l->num_events].data = data;
+  l->events[l->num_events].f = f;
+  l->num_events++;
+}
+
+static int report_events_in_list(struct event_list *l,
+                                 struct str_builder* sb) { // Write the events as comma-separated JSON objects, then reset the list.
+  int ret = 0; // becomes 1 once a reporter fails
+  for (int i = 0; i < l->num_events; i++) {
+    // After a reporter fails, emit nothing more but still release every description.
+    if (ret == 0) {
+      str_builder_str(sb, i != 0 ? ",{\"name\":" : "{\"name\":");
+      str_builder_json_str(sb, l->events[i].name);
+      str_builder_str(sb, ",\"description\":");
+      str_builder_json_str(sb, l->events[i].description);
+      if (l->events[i].f(sb, l->events[i].data) != 0) {
+        ret = 1; // NOTE(review): data of later events is never handed to f; confirm who owns it
+      } else {
+        str_builder_str(sb, "}");
+      }
+    }
+    free(l->events[i].description);
+  }
+  event_list_free(l);
+  event_list_init(l);
+  return ret;
+}
+
+// End of event_list.h
 
 #ifdef _MSC_VER
 #define inline __inline
@@ -965,57 +1094,60 @@ static int free_list_first(struct free_list *l, fl_mem *mem_out) {
 // Double-precision definitions are only included if the preprocessor
 // macro FUTHARK_F64_ENABLED is set.
-static inline uint8_t add8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR int32_t futrts_to_bits32(float x); +SCALAR_FUN_ATTR float futrts_from_bits32(int32_t x); + +SCALAR_FUN_ATTR uint8_t add8(uint8_t x, uint8_t y) { return x + y; } -static inline uint16_t add16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t add16(uint16_t x, uint16_t y) { return x + y; } -static inline uint32_t add32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t add32(uint32_t x, uint32_t y) { return x + y; } -static inline uint64_t add64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t add64(uint64_t x, uint64_t y) { return x + y; } -static inline uint8_t sub8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t sub8(uint8_t x, uint8_t y) { return x - y; } -static inline uint16_t sub16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t sub16(uint16_t x, uint16_t y) { return x - y; } -static inline uint32_t sub32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t sub32(uint32_t x, uint32_t y) { return x - y; } -static inline uint64_t sub64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t sub64(uint64_t x, uint64_t y) { return x - y; } -static inline uint8_t mul8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t mul8(uint8_t x, uint8_t y) { return x * y; } -static inline uint16_t mul16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t mul16(uint16_t x, uint16_t y) { return x * y; } -static inline uint32_t mul32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t mul32(uint32_t x, uint32_t y) { return x * y; } -static inline uint64_t mul64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t mul64(uint64_t x, uint64_t y) { return x * y; } #if ISPC -static inline uint8_t udiv8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { // This strange pattern is used to prevent the ISPC compiler from // causing SIGFPEs and bogus results on divisions where inactive lanes // have 0-valued divisors. 
It ensures that any inactive lane instead @@ -1028,242 +1160,242 @@ static inline uint8_t udiv8(uint8_t x, uint8_t y) { return x / ys; } -static inline uint16_t udiv16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline uint32_t udiv32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline uint64_t udiv64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline uint8_t udiv_up8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { uint8_t ys = 1; foreach_active(i){ ys = y; } - + return (x + y - 1) / ys; } -static inline uint16_t udiv_up16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return (x + y - 1) / ys; } -static inline uint32_t udiv_up32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + return (x + y - 1) / ys; } -static inline uint64_t udiv_up64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return (x + y - 1) / ys; } -static inline uint8_t umod8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { uint8_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline uint16_t umod16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline uint32_t umod32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + 
return x % ys; } -static inline uint64_t umod64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { uint8_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { uint8_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : (x + y - 1) / ys; } -static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : (x + y - 1) / ys; } -static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : (x + y - 1) / ys; } -static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 
0 : (x + y - 1) / ys; } -static inline uint8_t umod_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { uint8_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline uint16_t umod_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { uint16_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline uint32_t umod_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { uint32_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline uint64_t umod_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { uint64_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline int8_t sdiv8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + int8_t q = x / ys; int8_t r = x % ys; return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int16_t sdiv16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + int16_t q = x / ys; int16_t r = x % ys; return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int32_t sdiv32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; @@ -1274,775 +1406,775 @@ static inline int32_t sdiv32(int32_t x, int32_t y) { return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int64_t sdiv64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { int64_t ys = 1; foreach_active(i){ ys = y; } - + int64_t q = x / ys; int64_t r = x % ys; return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0); } -static inline int8_t sdiv_up8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { return sdiv8(x + y - 1, y); } -static inline int16_t sdiv_up16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { return sdiv16(x + y - 1, y); } -static inline int32_t sdiv_up32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { return sdiv32(x + y - 1, y); } -static inline int64_t sdiv_up64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { return sdiv64(x + y - 1, y); } -static inline int8_t smod8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + int8_t r = x % ys; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int16_t smod16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + int16_t r = x % ys; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int32_t smod32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; } - + int32_t r = x % ys; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int64_t smod64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { int64_t ys = 1; foreach_active(i){ ys = y; } - + int64_t r = x % ys; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int8_t sdiv_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : sdiv8(x, y); } -static inline int16_t sdiv_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) { return y == 0 ? 
0 : sdiv16(x, y); } -static inline int32_t sdiv_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : sdiv32(x, y); } -static inline int64_t sdiv_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) { return y == 0 ? 0 : sdiv64(x, y); } -static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) { return sdiv_safe8(x + y - 1, y); } -static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) { return sdiv_safe16(x + y - 1, y); } -static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) { return sdiv_safe32(x + y - 1, y); } -static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) { return sdiv_safe64(x + y - 1, y); } -static inline int8_t smod_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : smod8(x, y); } -static inline int16_t smod_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) { return y == 0 ? 0 : smod16(x, y); } -static inline int32_t smod_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : smod32(x, y); } -static inline int64_t smod_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) { return y == 0 ? 
0 : smod64(x, y); } -static inline int8_t squot8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline int16_t squot16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline int32_t squot32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline int64_t squot64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) { int64_t ys = 1; foreach_active(i){ ys = y; } - + return x / ys; } -static inline int8_t srem8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline int16_t srem16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline int32_t srem32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline int64_t srem64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + return x % ys; } -static inline int8_t squot_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline int16_t squot_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline int32_t squot_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 
0 : x / ys; } -static inline int64_t squot_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) { int64_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x / ys; } -static inline int8_t srem_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) { int8_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline int16_t srem_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) { int16_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline int32_t srem_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) { int32_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } -static inline int64_t srem_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) { int64_t ys = 1; foreach_active(i){ ys = y; } - + return y == 0 ? 0 : x % ys; } #else -static inline uint8_t udiv8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv8(uint8_t x, uint8_t y) { return x / y; } -static inline uint16_t udiv16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv16(uint16_t x, uint16_t y) { return x / y; } -static inline uint32_t udiv32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv32(uint32_t x, uint32_t y) { return x / y; } -static inline uint64_t udiv64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv64(uint64_t x, uint64_t y) { return x / y; } -static inline uint8_t udiv_up8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_up8(uint8_t x, uint8_t y) { return (x + y - 1) / y; } -static inline uint16_t udiv_up16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_up16(uint16_t x, uint16_t y) { return (x + y - 1) / y; } -static inline uint32_t udiv_up32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_up32(uint32_t x, uint32_t y) { return (x + y - 1) / y; } -static inline uint64_t 
udiv_up64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_up64(uint64_t x, uint64_t y) { return (x + y - 1) / y; } -static inline uint8_t umod8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umod8(uint8_t x, uint8_t y) { return x % y; } -static inline uint16_t umod16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umod16(uint16_t x, uint16_t y) { return x % y; } -static inline uint32_t umod32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umod32(uint32_t x, uint32_t y) { return x % y; } -static inline uint64_t umod64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umod64(uint64_t x, uint64_t y) { return x % y; } -static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_safe8(uint8_t x, uint8_t y) { return y == 0 ? 0 : x / y; } -static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_safe16(uint16_t x, uint16_t y) { return y == 0 ? 0 : x / y; } -static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_safe32(uint32_t x, uint32_t y) { return y == 0 ? 0 : x / y; } -static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_safe64(uint64_t x, uint64_t y) { return y == 0 ? 0 : x / y; } -static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { return y == 0 ? 0 : (x + y - 1) / y; } -static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { return y == 0 ? 0 : (x + y - 1) / y; } -static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { return y == 0 ? 0 : (x + y - 1) / y; } -static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { return y == 0 ? 
0 : (x + y - 1) / y; } -static inline uint8_t umod_safe8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umod_safe8(uint8_t x, uint8_t y) { return y == 0 ? 0 : x % y; } -static inline uint16_t umod_safe16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umod_safe16(uint16_t x, uint16_t y) { return y == 0 ? 0 : x % y; } -static inline uint32_t umod_safe32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umod_safe32(uint32_t x, uint32_t y) { return y == 0 ? 0 : x % y; } -static inline uint64_t umod_safe64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umod_safe64(uint64_t x, uint64_t y) { return y == 0 ? 0 : x % y; } -static inline int8_t sdiv8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv8(int8_t x, int8_t y) { int8_t q = x / y; int8_t r = x % y; return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int16_t sdiv16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv16(int16_t x, int16_t y) { int16_t q = x / y; int16_t r = x % y; return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int32_t sdiv32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv32(int32_t x, int32_t y) { int32_t q = x / y; int32_t r = x % y; return q - ((r != 0 && r < 0 != y < 0) ? 1 : 0); } -static inline int64_t sdiv64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv64(int64_t x, int64_t y) { int64_t q = x / y; int64_t r = x % y; return q - ((r != 0 && r < 0 != y < 0) ? 
1 : 0); } -static inline int8_t sdiv_up8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_up8(int8_t x, int8_t y) { return sdiv8(x + y - 1, y); } -static inline int16_t sdiv_up16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_up16(int16_t x, int16_t y) { return sdiv16(x + y - 1, y); } -static inline int32_t sdiv_up32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_up32(int32_t x, int32_t y) { return sdiv32(x + y - 1, y); } -static inline int64_t sdiv_up64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_up64(int64_t x, int64_t y) { return sdiv64(x + y - 1, y); } -static inline int8_t smod8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smod8(int8_t x, int8_t y) { int8_t r = x % y; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int16_t smod16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smod16(int16_t x, int16_t y) { int16_t r = x % y; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int32_t smod32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smod32(int32_t x, int32_t y) { int32_t r = x % y; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int64_t smod64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smod64(int64_t x, int64_t y) { int64_t r = x % y; return r + (r == 0 || (x > 0 && y > 0) || (x < 0 && y < 0) ? 0 : y); } -static inline int8_t sdiv_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : sdiv8(x, y); } -static inline int16_t sdiv_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_safe16(int16_t x, int16_t y) { return y == 0 ? 0 : sdiv16(x, y); } -static inline int32_t sdiv_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : sdiv32(x, y); } -static inline int64_t sdiv_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_safe64(int64_t x, int64_t y) { return y == 0 ? 
0 : sdiv64(x, y); } -static inline int8_t sdiv_up_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t sdiv_up_safe8(int8_t x, int8_t y) { return sdiv_safe8(x + y - 1, y); } -static inline int16_t sdiv_up_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t sdiv_up_safe16(int16_t x, int16_t y) { return sdiv_safe16(x + y - 1, y); } -static inline int32_t sdiv_up_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t sdiv_up_safe32(int32_t x, int32_t y) { return sdiv_safe32(x + y - 1, y); } -static inline int64_t sdiv_up_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t sdiv_up_safe64(int64_t x, int64_t y) { return sdiv_safe64(x + y - 1, y); } -static inline int8_t smod_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smod_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : smod8(x, y); } -static inline int16_t smod_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smod_safe16(int16_t x, int16_t y) { return y == 0 ? 0 : smod16(x, y); } -static inline int32_t smod_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smod_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : smod32(x, y); } -static inline int64_t smod_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smod_safe64(int64_t x, int64_t y) { return y == 0 ? 
0 : smod64(x, y); } -static inline int8_t squot8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t squot8(int8_t x, int8_t y) { return x / y; } -static inline int16_t squot16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t squot16(int16_t x, int16_t y) { return x / y; } -static inline int32_t squot32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t squot32(int32_t x, int32_t y) { return x / y; } -static inline int64_t squot64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t squot64(int64_t x, int64_t y) { return x / y; } -static inline int8_t srem8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t srem8(int8_t x, int8_t y) { return x % y; } -static inline int16_t srem16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t srem16(int16_t x, int16_t y) { return x % y; } -static inline int32_t srem32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t srem32(int32_t x, int32_t y) { return x % y; } -static inline int64_t srem64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t srem64(int64_t x, int64_t y) { return x % y; } -static inline int8_t squot_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t squot_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : x / y; } -static inline int16_t squot_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t squot_safe16(int16_t x, int16_t y) { return y == 0 ? 0 : x / y; } -static inline int32_t squot_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t squot_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : x / y; } -static inline int64_t squot_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t squot_safe64(int64_t x, int64_t y) { return y == 0 ? 0 : x / y; } -static inline int8_t srem_safe8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t srem_safe8(int8_t x, int8_t y) { return y == 0 ? 0 : x % y; } -static inline int16_t srem_safe16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t srem_safe16(int16_t x, int16_t y) { return y == 0 ? 
0 : x % y; } -static inline int32_t srem_safe32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t srem_safe32(int32_t x, int32_t y) { return y == 0 ? 0 : x % y; } -static inline int64_t srem_safe64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t srem_safe64(int64_t x, int64_t y) { return y == 0 ? 0 : x % y; } #endif -static inline int8_t smin8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smin8(int8_t x, int8_t y) { return x < y ? x : y; } -static inline int16_t smin16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smin16(int16_t x, int16_t y) { return x < y ? x : y; } -static inline int32_t smin32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smin32(int32_t x, int32_t y) { return x < y ? x : y; } -static inline int64_t smin64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smin64(int64_t x, int64_t y) { return x < y ? x : y; } -static inline uint8_t umin8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umin8(uint8_t x, uint8_t y) { return x < y ? x : y; } -static inline uint16_t umin16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umin16(uint16_t x, uint16_t y) { return x < y ? x : y; } -static inline uint32_t umin32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umin32(uint32_t x, uint32_t y) { return x < y ? x : y; } -static inline uint64_t umin64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umin64(uint64_t x, uint64_t y) { return x < y ? x : y; } -static inline int8_t smax8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t smax8(int8_t x, int8_t y) { return x < y ? y : x; } -static inline int16_t smax16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t smax16(int16_t x, int16_t y) { return x < y ? y : x; } -static inline int32_t smax32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t smax32(int32_t x, int32_t y) { return x < y ? y : x; } -static inline int64_t smax64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t smax64(int64_t x, int64_t y) { return x < y ? 
y : x; } -static inline uint8_t umax8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t umax8(uint8_t x, uint8_t y) { return x < y ? y : x; } -static inline uint16_t umax16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t umax16(uint16_t x, uint16_t y) { return x < y ? y : x; } -static inline uint32_t umax32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t umax32(uint32_t x, uint32_t y) { return x < y ? y : x; } -static inline uint64_t umax64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t umax64(uint64_t x, uint64_t y) { return x < y ? y : x; } -static inline uint8_t shl8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t shl8(uint8_t x, uint8_t y) { return (uint8_t)(x << y); } -static inline uint16_t shl16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t shl16(uint16_t x, uint16_t y) { return (uint16_t)(x << y); } -static inline uint32_t shl32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t shl32(uint32_t x, uint32_t y) { return x << y; } -static inline uint64_t shl64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t shl64(uint64_t x, uint64_t y) { return x << y; } -static inline uint8_t lshr8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t lshr8(uint8_t x, uint8_t y) { return x >> y; } -static inline uint16_t lshr16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t lshr16(uint16_t x, uint16_t y) { return x >> y; } -static inline uint32_t lshr32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t lshr32(uint32_t x, uint32_t y) { return x >> y; } -static inline uint64_t lshr64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t lshr64(uint64_t x, uint64_t y) { return x >> y; } -static inline int8_t ashr8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR int8_t ashr8(int8_t x, int8_t y) { return x >> y; } -static inline int16_t ashr16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR int16_t ashr16(int16_t x, int16_t y) { return x >> y; } -static inline int32_t ashr32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR int32_t ashr32(int32_t x, int32_t y) { return x >> y; } 
-static inline int64_t ashr64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR int64_t ashr64(int64_t x, int64_t y) { return x >> y; } -static inline uint8_t and8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t and8(uint8_t x, uint8_t y) { return x & y; } -static inline uint16_t and16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t and16(uint16_t x, uint16_t y) { return x & y; } -static inline uint32_t and32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t and32(uint32_t x, uint32_t y) { return x & y; } -static inline uint64_t and64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t and64(uint64_t x, uint64_t y) { return x & y; } -static inline uint8_t or8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t or8(uint8_t x, uint8_t y) { return x | y; } -static inline uint16_t or16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t or16(uint16_t x, uint16_t y) { return x | y; } -static inline uint32_t or32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t or32(uint32_t x, uint32_t y) { return x | y; } -static inline uint64_t or64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t or64(uint64_t x, uint64_t y) { return x | y; } -static inline uint8_t xor8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t xor8(uint8_t x, uint8_t y) { return x ^ y; } -static inline uint16_t xor16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t xor16(uint16_t x, uint16_t y) { return x ^ y; } -static inline uint32_t xor32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t xor32(uint32_t x, uint32_t y) { return x ^ y; } -static inline uint64_t xor64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t xor64(uint64_t x, uint64_t y) { return x ^ y; } -static inline bool ult8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR bool ult8(uint8_t x, uint8_t y) { return x < y; } -static inline bool ult16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR bool ult16(uint16_t x, uint16_t y) { return x < y; } -static inline bool ult32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR bool ult32(uint32_t x, uint32_t y) { 
return x < y; } -static inline bool ult64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR bool ult64(uint64_t x, uint64_t y) { return x < y; } -static inline bool ule8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR bool ule8(uint8_t x, uint8_t y) { return x <= y; } -static inline bool ule16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR bool ule16(uint16_t x, uint16_t y) { return x <= y; } -static inline bool ule32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR bool ule32(uint32_t x, uint32_t y) { return x <= y; } -static inline bool ule64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR bool ule64(uint64_t x, uint64_t y) { return x <= y; } -static inline bool slt8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR bool slt8(int8_t x, int8_t y) { return x < y; } -static inline bool slt16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR bool slt16(int16_t x, int16_t y) { return x < y; } -static inline bool slt32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR bool slt32(int32_t x, int32_t y) { return x < y; } -static inline bool slt64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR bool slt64(int64_t x, int64_t y) { return x < y; } -static inline bool sle8(int8_t x, int8_t y) { +SCALAR_FUN_ATTR bool sle8(int8_t x, int8_t y) { return x <= y; } -static inline bool sle16(int16_t x, int16_t y) { +SCALAR_FUN_ATTR bool sle16(int16_t x, int16_t y) { return x <= y; } -static inline bool sle32(int32_t x, int32_t y) { +SCALAR_FUN_ATTR bool sle32(int32_t x, int32_t y) { return x <= y; } -static inline bool sle64(int64_t x, int64_t y) { +SCALAR_FUN_ATTR bool sle64(int64_t x, int64_t y) { return x <= y; } -static inline uint8_t pow8(uint8_t x, uint8_t y) { +SCALAR_FUN_ATTR uint8_t pow8(uint8_t x, uint8_t y) { uint8_t res = 1, rem = y; while (rem != 0) { @@ -2054,7 +2186,7 @@ static inline uint8_t pow8(uint8_t x, uint8_t y) { return res; } -static inline uint16_t pow16(uint16_t x, uint16_t y) { +SCALAR_FUN_ATTR uint16_t pow16(uint16_t x, uint16_t y) { uint16_t res = 1, rem = y; while (rem != 0) { @@ -2066,7 +2198,7 @@ static inline 
uint16_t pow16(uint16_t x, uint16_t y) { return res; } -static inline uint32_t pow32(uint32_t x, uint32_t y) { +SCALAR_FUN_ATTR uint32_t pow32(uint32_t x, uint32_t y) { uint32_t res = 1, rem = y; while (rem != 0) { @@ -2078,7 +2210,7 @@ static inline uint32_t pow32(uint32_t x, uint32_t y) { return res; } -static inline uint64_t pow64(uint64_t x, uint64_t y) { +SCALAR_FUN_ATTR uint64_t pow64(uint64_t x, uint64_t y) { uint64_t res = 1, rem = y; while (rem != 0) { @@ -2090,35 +2222,35 @@ static inline uint64_t pow64(uint64_t x, uint64_t y) { return res; } -static inline bool itob_i8_bool(int8_t x) { +SCALAR_FUN_ATTR bool itob_i8_bool(int8_t x) { return x != 0; } -static inline bool itob_i16_bool(int16_t x) { +SCALAR_FUN_ATTR bool itob_i16_bool(int16_t x) { return x != 0; } -static inline bool itob_i32_bool(int32_t x) { +SCALAR_FUN_ATTR bool itob_i32_bool(int32_t x) { return x != 0; } -static inline bool itob_i64_bool(int64_t x) { +SCALAR_FUN_ATTR bool itob_i64_bool(int64_t x) { return x != 0; } -static inline int8_t btoi_bool_i8(bool x) { +SCALAR_FUN_ATTR int8_t btoi_bool_i8(bool x) { return x; } -static inline int16_t btoi_bool_i16(bool x) { +SCALAR_FUN_ATTR int16_t btoi_bool_i16(bool x) { return x; } -static inline int32_t btoi_bool_i32(bool x) { +SCALAR_FUN_ATTR int32_t btoi_bool_i32(bool x) { return x; } -static inline int64_t btoi_bool_i64(bool x) { +SCALAR_FUN_ATTR int64_t btoi_bool_i64(bool x) { return x; } @@ -2155,19 +2287,19 @@ static inline int64_t btoi_bool_i64(bool x) { #define zext_i64_i32(x) ((int32_t) (uint64_t) (x)) #define zext_i64_i64(x) ((int64_t) (uint64_t) (x)) -static int8_t abs8(int8_t x) { +SCALAR_FUN_ATTR int8_t abs8(int8_t x) { return (int8_t)abs(x); } -static int16_t abs16(int16_t x) { +SCALAR_FUN_ATTR int16_t abs16(int16_t x) { return (int16_t)abs(x); } -static int32_t abs32(int32_t x) { +SCALAR_FUN_ATTR int32_t abs32(int32_t x) { return abs(x); } -static int64_t abs64(int64_t x) { +SCALAR_FUN_ATTR int64_t abs64(int64_t x) { #if 
defined(__OPENCL_VERSION__) || defined(ISPC) return abs(x); #else @@ -2176,60 +2308,60 @@ static int64_t abs64(int64_t x) { } #if defined(__OPENCL_VERSION__) -static int32_t futrts_popc8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { return popcount(x); } -static int32_t futrts_popc16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { return popcount(x); } -static int32_t futrts_popc32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { return popcount(x); } -static int32_t futrts_popc64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { return popcount(x); } #elif defined(__CUDA_ARCH__) -static int32_t futrts_popc8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc8(int8_t x) { return __popc(zext_i8_i32(x)); } -static int32_t futrts_popc16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc16(int16_t x) { return __popc(zext_i16_i32(x)); } -static int32_t futrts_popc32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc32(int32_t x) { return __popc(x); } -static int32_t futrts_popc64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc64(int64_t x) { return __popcll(x); } #else // Not OpenCL or CUDA, but plain C. 
-static int32_t futrts_popc8(uint8_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc8(uint8_t x) { int c = 0; for (; x; ++c) { x &= x - 1; } return c; } -static int32_t futrts_popc16(uint16_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc16(uint16_t x) { int c = 0; for (; x; ++c) { x &= x - 1; } return c; } -static int32_t futrts_popc32(uint32_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc32(uint32_t x) { int c = 0; for (; x; ++c) { x &= x - 1; } return c; } -static int32_t futrts_popc64(uint64_t x) { +SCALAR_FUN_ATTR int32_t futrts_popc64(uint64_t x) { int c = 0; for (; x; ++c) { x &= x - 1; } return c; @@ -2237,28 +2369,28 @@ static int32_t futrts_popc64(uint64_t x) { #endif #if defined(__OPENCL_VERSION__) -static uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); } -static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); } -static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); } -static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); } -static uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return mul_hi(a, b); } -static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); } -static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); } -static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8 ( uint8_t a, uint8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return mul_hi(a, b); } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return mul_hi(a, b); } 
+SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return mul_hi(a, b); } #elif defined(__CUDA_ARCH__) -static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } -static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } -static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); } -static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); } -static uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } -static uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } -static uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } -static uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return __umulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return __umul64hi(a, b); } +SCALAR_FUN_ATTR uint8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_smul_hi32(int32_t a, int32_t b) { return __mulhi(a, b); } +SCALAR_FUN_ATTR uint64_t futrts_smul_hi64(int64_t a, int64_t b) { return __mul64hi(a, b); } #elif ISPC -static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } -static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } -static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) 
>> 32; } -static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { uint64_t ah = a >> 32; uint64_t al = a & 0xffffffff; uint64_t bh = b >> 32; @@ -2281,10 +2413,10 @@ static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return h; } -static int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } -static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } -static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } -static int64_t futrts_smul_hi64(int64_t a, int64_t b) { +SCALAR_FUN_ATTR int8_t futrts_smul_hi8 ( int8_t a, int8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { uint64_t ah = a >> 32; uint64_t al = a & 0xffffffff; uint64_t bh = b >> 32; @@ -2309,132 +2441,132 @@ static int64_t futrts_smul_hi64(int64_t a, int64_t b) { } #else // Not OpenCL, ISPC, or CUDA, but plain C. 
-static uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } -static uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } -static uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } -static uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; } -static int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } -static int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } -static int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; } -static int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; } +SCALAR_FUN_ATTR uint8_t futrts_umul_hi8(uint8_t a, uint8_t b) { return ((uint16_t)a) * ((uint16_t)b) >> 8; } +SCALAR_FUN_ATTR uint16_t futrts_umul_hi16(uint16_t a, uint16_t b) { return ((uint32_t)a) * ((uint32_t)b) >> 16; } +SCALAR_FUN_ATTR uint32_t futrts_umul_hi32(uint32_t a, uint32_t b) { return ((uint64_t)a) * ((uint64_t)b) >> 32; } +SCALAR_FUN_ATTR uint64_t futrts_umul_hi64(uint64_t a, uint64_t b) { return ((__uint128_t)a) * ((__uint128_t)b) >> 64; } +SCALAR_FUN_ATTR int8_t futrts_smul_hi8(int8_t a, int8_t b) { return ((int16_t)a) * ((int16_t)b) >> 8; } +SCALAR_FUN_ATTR int16_t futrts_smul_hi16(int16_t a, int16_t b) { return ((int32_t)a) * ((int32_t)b) >> 16; } +SCALAR_FUN_ATTR int32_t futrts_smul_hi32(int32_t a, int32_t b) { return ((int64_t)a) * ((int64_t)b) >> 32; } +SCALAR_FUN_ATTR int64_t futrts_smul_hi64(int64_t a, int64_t b) { return ((__int128_t)a) * ((__int128_t)b) >> 64; } #endif #if defined(__OPENCL_VERSION__) -static uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); } -static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); } -static uint32_t 
futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); } -static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); } -static uint8_t futrts_smad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); } -static uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); } -static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); } -static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8 ( uint8_t a, uint8_t b, uint8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8( int8_t a, int8_t b, int8_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return mad_hi(a, b, c); } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return mad_hi(a, b, c); } #else // Not OpenCL -static uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; } -static uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; } -static uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; } -static uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; } -static uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; } -static uint16_t futrts_smad_hi16(int16_t a, int16_t 
b, int16_t c) { return futrts_smul_hi16(a, b) + c; } -static uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; } -static uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; } +SCALAR_FUN_ATTR uint8_t futrts_umad_hi8( uint8_t a, uint8_t b, uint8_t c) { return futrts_umul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t futrts_umad_hi16(uint16_t a, uint16_t b, uint16_t c) { return futrts_umul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_umad_hi32(uint32_t a, uint32_t b, uint32_t c) { return futrts_umul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_umad_hi64(uint64_t a, uint64_t b, uint64_t c) { return futrts_umul_hi64(a, b) + c; } +SCALAR_FUN_ATTR uint8_t futrts_smad_hi8 ( int8_t a, int8_t b, int8_t c) { return futrts_smul_hi8(a, b) + c; } +SCALAR_FUN_ATTR uint16_t futrts_smad_hi16(int16_t a, int16_t b, int16_t c) { return futrts_smul_hi16(a, b) + c; } +SCALAR_FUN_ATTR uint32_t futrts_smad_hi32(int32_t a, int32_t b, int32_t c) { return futrts_smul_hi32(a, b) + c; } +SCALAR_FUN_ATTR uint64_t futrts_smad_hi64(int64_t a, int64_t b, int64_t c) { return futrts_smul_hi64(a, b) + c; } #endif #if defined(__OPENCL_VERSION__) -static int32_t futrts_clzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { return clz(x); } -static int32_t futrts_clzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { return clz(x); } -static int32_t futrts_clzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { return clz(x); } -static int32_t futrts_clzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { return clz(x); } #elif defined(__CUDA_ARCH__) -static int32_t futrts_clzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { return __clz(zext_i8_i32(x)) - 24; } -static int32_t futrts_clzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { return __clz(zext_i16_i32(x)) - 16; } -static int32_t futrts_clzz32(int32_t 
x) { +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { return __clz(x); } -static int32_t futrts_clzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { return __clzll(x); } #elif ISPC -static int32_t futrts_clzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { return count_leading_zeros((int32_t)(uint8_t)x)-24; } -static int32_t futrts_clzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { return count_leading_zeros((int32_t)(uint16_t)x)-16; } -static int32_t futrts_clzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { return count_leading_zeros(x); } -static int32_t futrts_clzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { return count_leading_zeros(x); } #else // Not OpenCL, ISPC or CUDA, but plain C. -static int32_t futrts_clzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz8(int8_t x) { return x == 0 ? 8 : __builtin_clz((uint32_t)zext_i8_i32(x)) - 24; } -static int32_t futrts_clzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz16(int16_t x) { return x == 0 ? 16 : __builtin_clz((uint32_t)zext_i16_i32(x)) - 16; } -static int32_t futrts_clzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz32(int32_t x) { return x == 0 ? 32 : __builtin_clz((uint32_t)x); } -static int32_t futrts_clzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_clzz64(int64_t x) { return x == 0 ? 
64 : __builtin_clzll((uint64_t)x); } #endif #if defined(__OPENCL_VERSION__) -static int32_t futrts_ctzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { int i = 0; for (; i < 8 && (x & 1) == 0; i++, x >>= 1) ; return i; } -static int32_t futrts_ctzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { int i = 0; for (; i < 16 && (x & 1) == 0; i++, x >>= 1) ; return i; } -static int32_t futrts_ctzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { int i = 0; for (; i < 32 && (x & 1) == 0; i++, x >>= 1) ; return i; } -static int32_t futrts_ctzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { int i = 0; for (; i < 64 && (x & 1) == 0; i++, x >>= 1) ; @@ -2443,151 +2575,151 @@ static int32_t futrts_ctzz64(int64_t x) { #elif defined(__CUDA_ARCH__) -static int32_t futrts_ctzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { int y = __ffs(x); return y == 0 ? 8 : y - 1; } -static int32_t futrts_ctzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { int y = __ffs(x); return y == 0 ? 16 : y - 1; } -static int32_t futrts_ctzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { int y = __ffs(x); return y == 0 ? 32 : y - 1; } -static int32_t futrts_ctzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { int y = __ffsll(x); return y == 0 ? 64 : y - 1; } #elif ISPC -static int32_t futrts_ctzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { return x == 0 ? 8 : count_trailing_zeros((int32_t)x); } -static int32_t futrts_ctzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { return x == 0 ? 16 : count_trailing_zeros((int32_t)x); } -static int32_t futrts_ctzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { return count_trailing_zeros(x); } -static int32_t futrts_ctzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { return count_trailing_zeros(x); } #else // Not OpenCL or CUDA, but plain C. 
-static int32_t futrts_ctzz8(int8_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz8(int8_t x) { return x == 0 ? 8 : __builtin_ctz((uint32_t)x); } -static int32_t futrts_ctzz16(int16_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz16(int16_t x) { return x == 0 ? 16 : __builtin_ctz((uint32_t)x); } -static int32_t futrts_ctzz32(int32_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz32(int32_t x) { return x == 0 ? 32 : __builtin_ctz((uint32_t)x); } -static int32_t futrts_ctzz64(int64_t x) { +SCALAR_FUN_ATTR int32_t futrts_ctzz64(int64_t x) { return x == 0 ? 64 : __builtin_ctzll((uint64_t)x); } #endif -static inline float fdiv32(float x, float y) { +SCALAR_FUN_ATTR float fdiv32(float x, float y) { return x / y; } -static inline float fadd32(float x, float y) { +SCALAR_FUN_ATTR float fadd32(float x, float y) { return x + y; } -static inline float fsub32(float x, float y) { +SCALAR_FUN_ATTR float fsub32(float x, float y) { return x - y; } -static inline float fmul32(float x, float y) { +SCALAR_FUN_ATTR float fmul32(float x, float y) { return x * y; } -static inline bool cmplt32(float x, float y) { +SCALAR_FUN_ATTR bool cmplt32(float x, float y) { return x < y; } -static inline bool cmple32(float x, float y) { +SCALAR_FUN_ATTR bool cmple32(float x, float y) { return x <= y; } -static inline float sitofp_i8_f32(int8_t x) { +SCALAR_FUN_ATTR float sitofp_i8_f32(int8_t x) { return (float) x; } -static inline float sitofp_i16_f32(int16_t x) { +SCALAR_FUN_ATTR float sitofp_i16_f32(int16_t x) { return (float) x; } -static inline float sitofp_i32_f32(int32_t x) { +SCALAR_FUN_ATTR float sitofp_i32_f32(int32_t x) { return (float) x; } -static inline float sitofp_i64_f32(int64_t x) { +SCALAR_FUN_ATTR float sitofp_i64_f32(int64_t x) { return (float) x; } -static inline float uitofp_i8_f32(uint8_t x) { +SCALAR_FUN_ATTR float uitofp_i8_f32(uint8_t x) { return (float) x; } -static inline float uitofp_i16_f32(uint16_t x) { +SCALAR_FUN_ATTR float uitofp_i16_f32(uint16_t x) { return (float) x; } -static 
inline float uitofp_i32_f32(uint32_t x) { +SCALAR_FUN_ATTR float uitofp_i32_f32(uint32_t x) { return (float) x; } -static inline float uitofp_i64_f32(uint64_t x) { +SCALAR_FUN_ATTR float uitofp_i64_f32(uint64_t x) { return (float) x; } #ifdef __OPENCL_VERSION__ -static inline float fabs32(float x) { +SCALAR_FUN_ATTR float fabs32(float x) { return fabs(x); } -static inline float fmax32(float x, float y) { +SCALAR_FUN_ATTR float fmax32(float x, float y) { return fmax(x, y); } -static inline float fmin32(float x, float y) { +SCALAR_FUN_ATTR float fmin32(float x, float y) { return fmin(x, y); } -static inline float fpow32(float x, float y) { +SCALAR_FUN_ATTR float fpow32(float x, float y) { return pow(x, y); } #elif ISPC -static inline float fabs32(float x) { +SCALAR_FUN_ATTR float fabs32(float x) { return abs(x); } -static inline float fmax32(float x, float y) { +SCALAR_FUN_ATTR float fmax32(float x, float y) { return isnan(x) ? y : isnan(y) ? x : max(x, y); } -static inline float fmin32(float x, float y) { +SCALAR_FUN_ATTR float fmin32(float x, float y) { return isnan(x) ? y : isnan(y) ? x : min(x, y); } -static inline float fpow32(float a, float b) { +SCALAR_FUN_ATTR float fpow32(float a, float b) { float ret; foreach_active (i) { uniform float r = __stdlib_powf(extract(a, i), extract(b, i)); @@ -2598,46 +2730,46 @@ static inline float fpow32(float a, float b) { #else // Not OpenCL, but CUDA or plain C. 
-static inline float fabs32(float x) { +SCALAR_FUN_ATTR float fabs32(float x) { return fabsf(x); } -static inline float fmax32(float x, float y) { +SCALAR_FUN_ATTR float fmax32(float x, float y) { return fmaxf(x, y); } -static inline float fmin32(float x, float y) { +SCALAR_FUN_ATTR float fmin32(float x, float y) { return fminf(x, y); } -static inline float fpow32(float x, float y) { +SCALAR_FUN_ATTR float fpow32(float x, float y) { return powf(x, y); } #endif -static inline bool futrts_isnan32(float x) { +SCALAR_FUN_ATTR bool futrts_isnan32(float x) { return isnan(x); } #if ISPC -static inline bool futrts_isinf32(float x) { +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { return !isnan(x) && isnan(x - x); } -static inline bool futrts_isfinite32(float x) { +SCALAR_FUN_ATTR bool futrts_isfinite32(float x) { return !isnan(x) && !futrts_isinf32(x); } #else -static inline bool futrts_isinf32(float x) { +SCALAR_FUN_ATTR bool futrts_isinf32(float x) { return isinf(x); } #endif -static inline int8_t fptosi_f32_i8(float x) { +SCALAR_FUN_AT