From a0b5f58d8d2040306006a8b9deded629692ed8f3 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Tue, 7 Feb 2023 15:27:42 +1100 Subject: init --- .gitignore | 2 + LICENSE | 19 + default.nix | 8 + flake.lock | 42 + flake.nix | 16 + meson.build | 7 + pca.c | 8106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ pca.fut | 75 + pca.h | 65 + 9 files changed, 8340 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 default.nix create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 meson.build create mode 100644 pca.c create mode 100644 pca.fut create mode 100644 pca.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aaf410d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +result +build diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..eef30a7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2023 Justin Bedo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..52704ae --- /dev/null +++ b/default.nix @@ -0,0 +1,8 @@ +{ stdenv, meson, ninja }: + +stdenv.mkDerivation { + pname = "codapca"; + version = "0.1"; + src = ./.; + nativeBuildInputs = [meson ninja]; +} diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..3b0759d --- /dev/null +++ b/flake.lock @@ -0,0 +1,42 @@ +{ + "nodes": { + "flake-utils": { + "locked": { + "lastModified": 1667395993, + "narHash": "sha256-nuEHfE/LcWyuSWnS8t12N1wc105Qtau+/OdUAjtQ0rA=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "5aed5285a952e0b949eb3ba02c12fa4fcfef535f", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1675745927, + "narHash": "sha256-+iTd4MU5Mj/ZIqmfNUNK5tiRxq6lf35OFyM9sGOcJlo=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "4ee1a6c3ae9ebf58183e446aa53e5efaac34cbb3", + "type": "github" + }, + "original": { + "owner": "nixos", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..38e252b --- /dev/null +++ b/flake.nix @@ -0,0 +1,16 @@ +{ + inputs.nixpkgs.url = "github:nixos/nixpkgs"; + inputs.flake-utils.url = "github:numtide/flake-utils"; + + outputs = { + self, + nixpkgs, + flake-utils, + }: + flake-utils.lib.eachDefaultSystem (system: let + pkgs = import nixpkgs {inherit system;}; + in rec { + packages = {lib = pkgs.callPackage ./. 
{};}; + defaultPackage = packages.lib; + }); +} diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..0145c5a --- /dev/null +++ b/meson.build @@ -0,0 +1,7 @@ +project('coda-pca', 'c', + version : '0.1', + default_options : ['warning_level=3']) +cc = meson.get_compiler('c') +m_dep = cc.find_library('m', required : false) +install_headers('pca.h') +shared_library('codapca', 'pca.c', dependencies : m_dep, install : true) diff --git a/pca.c b/pca.c new file mode 100644 index 0000000..cd135a5 --- /dev/null +++ b/pca.c @@ -0,0 +1,8106 @@ +// Generated by Futhark 0.24.0 (prerelease - include info below when reporting bugs) + +// We need to define _GNU_SOURCE before +// _any_ headers files are imported to get +// the usage statistics of a thread (i.e. have RUSAGE_THREAD) on GNU/Linux +// https://manpages.courier-mta.org/htmlman2/getrusage.2.html +#ifndef _GNU_SOURCE // Avoid possible double-definition warning. +#define _GNU_SOURCE +#endif + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wparentheses" +#pragma clang diagnostic ignored "-Wunused-label" +#elif __GNUC__ +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wparentheses" +#pragma GCC diagnostic ignored "-Wunused-label" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + +// Headers +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Initialisation +struct futhark_context_config; +struct futhark_context_config *futhark_context_config_new(void); +void futhark_context_config_free(struct futhark_context_config *cfg); +int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value); +struct futhark_context; +struct futhark_context *futhark_context_new(struct 
futhark_context_config *cfg); +void futhark_context_free(struct futhark_context *cfg); +void futhark_context_config_set_debugging(struct futhark_context_config *cfg, int flag); +void futhark_context_config_set_profiling(struct futhark_context_config *cfg, int flag); +void futhark_context_config_set_logging(struct futhark_context_config *cfg, int flag); +int futhark_get_tuning_param_count(void); +const char *futhark_get_tuning_param_name(int); +const char *futhark_get_tuning_param_class(int); + +// Arrays +struct futhark_f64_2d; +struct futhark_f64_2d *futhark_new_f64_2d(struct futhark_context *ctx, const double *data, int64_t dim0, int64_t dim1); +struct futhark_f64_2d *futhark_new_raw_f64_2d(struct futhark_context *ctx, const unsigned char *data, int64_t offset, int64_t dim0, int64_t dim1); +int futhark_free_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr); +int futhark_values_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr, double *data); +unsigned char *futhark_values_raw_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr); +const int64_t *futhark_shape_f64_2d(struct futhark_context *ctx, struct futhark_f64_2d *arr); + +// Opaque values + + + +// Entry points +int futhark_entry_dloss(struct futhark_context *ctx, struct futhark_f64_2d **out0, struct futhark_f64_2d **out1, const struct futhark_f64_2d *in0, const struct futhark_f64_2d *in1, const struct futhark_f64_2d *in2); +int futhark_entry_loss(struct futhark_context *ctx, double *out0, const struct futhark_f64_2d *in0, const struct futhark_f64_2d *in1, const struct futhark_f64_2d *in2); +int futhark_entry_pcaWithQuantile(struct futhark_context *ctx, struct futhark_f64_2d **out0, struct futhark_f64_2d **out1, struct futhark_f64_2d **out2, double *out3, const double in0, const int64_t in1, const struct futhark_f64_2d *in2); + +// Miscellaneous +int futhark_context_sync(struct futhark_context *ctx); +void futhark_context_config_set_cache_file(struct futhark_context_config 
*cfg, const char *f);
+char *futhark_context_report(struct futhark_context *ctx);
+char *futhark_context_get_error(struct futhark_context *ctx);
+void futhark_context_set_logging_file(struct futhark_context *ctx, FILE *f);
+void futhark_context_pause_profiling(struct futhark_context *ctx);
+void futhark_context_unpause_profiling(struct futhark_context *ctx);
+int futhark_context_clear_caches(struct futhark_context *ctx);
+#define FUTHARK_BACKEND_c
+#define FUTHARK_SUCCESS 0
+#define FUTHARK_PROGRAM_ERROR 2
+#define FUTHARK_OUT_OF_MEMORY 3
+
+#ifdef __cplusplus
+}
+#endif
+
+// NOTE(review): the header names on the #include lines were missing from
+// this copy of the patch.  They are restored from the identifiers used
+// below (FILE/fprintf, malloc/exit, va_list, errno, strerror/memcpy,
+// int64_t); <stdbool.h> and <math.h> are presumed -- confirm against the
+// Futhark runtime sources.
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <stdint.h>
+// If NDEBUG is set, the assert() macro will do nothing. Since Futhark
+// (unfortunately) makes use of assert() for error detection (and even some
+// side effects), we want to avoid that.
+#undef NDEBUG
+#include <assert.h>
+#include <stdarg.h>
+// Start of util.h.
+//
+// Various helper functions that are useful in all generated C code.
+
+#include <errno.h>
+#include <string.h>
+
+static const char *fut_progname = "(embedded Futhark)";
+
+static void futhark_panic(int eval, const char *fmt, ...) __attribute__((noreturn));
+static char* msgprintf(const char *s, ...);
+static void* slurp_file(const char *filename, size_t *size);
+static int dump_file(const char *file, const void *buf, size_t n);
+struct str_builder;
+static void str_builder_init(struct str_builder *b);
+static void str_builder(struct str_builder *b, const char *s, ...);
+static char *strclone(const char *str);
+
+// Print "<fut_progname>: " followed by the printf-style message to
+// stderr, then terminate the process with exit status 'eval'.  Never
+// returns.
+static void futhark_panic(int eval, const char *fmt, ...) {
+  va_list ap;
+  va_start(ap, fmt);
+  fprintf(stderr, "%s: ", fut_progname);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+  exit(eval);
+}
+
+// For generating arbitrary-sized error messages.  It is the caller's
+// responsibility to free the buffer at some point.  Returns NULL if
+// the message cannot be formatted or the buffer cannot be allocated.
+static char* msgprintf(const char *s, ...) {
+  va_list vl;
+
+  // First pass: measure only.
+  va_start(vl, s);
+  int len = vsnprintf(NULL, 0, s, vl);
+  va_end(vl);
+  if (len < 0) {
+    return NULL;
+  }
+
+  size_t needed = 1 + (size_t)len;
+  char *buffer = (char*) malloc(needed);
+  if (buffer == NULL) {
+    return NULL;
+  }
+
+  // Second pass: a va_list that has been consumed must be closed with
+  // va_end before it is initialised again.
+  va_start(vl, s);
+  vsnprintf(buffer, needed, s, vl);
+  va_end(vl);
+  return buffer;
+}
+
+// If 'errval' is nonzero, print the printf-style message 'msg' together
+// with the calling function, line, and an error description to stderr,
+// then exit with status 'errval'.  The description is strerror(errno)
+// when 'sets_errno' is nonzero and the decimal value of 'errval'
+// otherwise.  Does nothing when 'errval' is zero.
+static inline void check_err(int errval, int sets_errno, const char *fun, int line,
+                             const char *msg, ...) {
+  if (errval) {
+    // Capture errno before any stdio call has a chance to change it.
+    int saved_errno = errno;
+
+    // Large enough for any 32-bit int, including sign and terminator.
+    char errnum[16];
+    snprintf(errnum, sizeof(errnum), "%d", errval);
+
+    va_list vl;
+    va_start(vl, msg);
+
+    fprintf(stderr, "ERROR: ");
+    vfprintf(stderr, msg, vl);
+    va_end(vl);
+    fprintf(stderr, " in %s() at line %d with error code %s\n",
+            fun, line,
+            sets_errno ? strerror(saved_errno) : errnum);
+    exit(errval);
+  }
+}
+
+#define CHECK_ERR(err, ...) check_err(err, 0, __func__, __LINE__, __VA_ARGS__)
+#define CHECK_ERRNO(err, ...) check_err(err, 1, __func__, __LINE__, __VA_ARGS__)
+
+// Read the rest of an open file into a NUL-terminated string; returns
+// NULL on error.  If 'size' is non-NULL it receives the number of bytes
+// between the current position and the end of the file (0 if that could
+// not be determined).  The caller frees the returned buffer.
+static void* fslurp_file(FILE *f, size_t *size) {
+  if (size) {
+    *size = 0;
+  }
+
+  long start = ftell(f);
+  if (start < 0 || fseek(f, 0, SEEK_END) != 0) {
+    return NULL;
+  }
+  long end = ftell(f);
+  if (end < start || fseek(f, start, SEEK_SET) != 0) {
+    return NULL;
+  }
+
+  size_t src_size = (size_t)(end - start);
+  if (size) {
+    *size = src_size;
+  }
+
+  unsigned char *s = (unsigned char*) malloc(src_size + 1);
+  if (s == NULL) {
+    return NULL;
+  }
+
+  if (fread(s, 1, src_size, f) != src_size) {
+    free(s);
+    return NULL;
+  }
+
+  s[src_size] = '\0';
+  return s;
+}
+
+// Read a file into a NUL-terminated string; returns NULL on error.
+static void* slurp_file(const char *filename, size_t *size) {
+  FILE *f = fopen(filename, "rb"); // To avoid Windows messing with linebreaks.
+  if (f == NULL) return NULL;
+  unsigned char *s = fslurp_file(f, size);
+  fclose(f);
+  return s;
+}
+
+// Dump 'n' bytes from 'buf' into the file at the designated location.
+// Returns 0 on success.
+static int dump_file(const char *file, const void *buf, size_t n) {
+  // Binary mode, matching slurp_file(), so the bytes are written
+  // unmodified on platforms that translate line endings.
+  FILE *f = fopen(file, "wb");
+
+  if (f == NULL) {
+    return 1;
+  }
+
+  if (fwrite(buf, sizeof(char), n, f) != n) {
+    fclose(f); // Do not leak the stream on a short write.
+    return 1;
+  }
+
+  if (fclose(f) != 0) {
+    return 1;
+  }
+
+  return 0;
+}
+
+// A growable, always NUL-terminated string buffer.
+struct str_builder {
+  char *str;
+  size_t capacity; // Size of buffer.
+  size_t used; // Bytes used, *not* including final zero.
+};
+
+// Initialise 'b' to the empty string.  If the initial allocation fails,
+// 'b->str' is NULL and 'b->capacity' is 0; a later str_builder() call
+// will try to allocate again.
+static void str_builder_init(struct str_builder *b) {
+  b->capacity = 10;
+  b->used = 0;
+  b->str = malloc(b->capacity);
+  if (b->str == NULL) {
+    b->capacity = 0;
+    return;
+  }
+  b->str[0] = 0;
+}
+
+// Append the printf-style formatted text to 'b', growing the buffer by
+// doubling as needed.  If formatting or allocation fails, 'b' is left
+// unchanged.
+static void str_builder(struct str_builder *b, const char *s, ...) {
+  va_list vl;
+
+  // First pass: measure only.
+  va_start(vl, s);
+  int len = vsnprintf(NULL, 0, s, vl);
+  va_end(vl);
+  if (len < 0) {
+    return;
+  }
+  size_t needed = (size_t)len;
+
+  size_t capacity = b->capacity == 0 ? 10 : b->capacity;
+  while (capacity < b->used + needed + 1) {
+    capacity *= 2;
+  }
+
+  if (capacity != b->capacity) {
+    // Keep the old pointer until realloc is known to have succeeded.
+    char *grown = realloc(b->str, capacity);
+    if (grown == NULL) {
+      return;
+    }
+    b->str = grown;
+    b->capacity = capacity;
+  }
+
+  va_start(vl, s); // Must re-init.
+  vsnprintf(b->str+b->used, b->capacity-b->used, s, vl);
+  va_end(vl);
+  b->used += needed;
+}
+
+
+// Return a freshly allocated copy of 'str', or NULL if allocation
+// fails.  The caller frees the copy.
+static char *strclone(const char *str) {
+  size_t size = strlen(str) + 1;
+  char *copy = (char*) malloc(size);
+  if (copy == NULL) {
+    return NULL;
+  }
+
+  memcpy(copy, str, size);
+  return copy;
+}
+
+// End of util.h.
+// Start of cache.h
+
+#define CACHE_HASH_SIZE 8 // In 32-bit words.
+
+struct cache_hash {
+  uint32_t hash[CACHE_HASH_SIZE];
+};
+
+// Initialise a blank cache.
+static void cache_hash_init(struct cache_hash *c);
+
+// Hash some bytes and add them to the accumulated hash.
+static void cache_hash(struct cache_hash *out, const char *in, size_t n);
+
+// Try to restore cache contents from a file with the given name.
+// The cache is considered valid only if it contains the given hash.
+// Allocates memory and reads the cache contents, which is returned in
+// *buf with size *buflen.  If the cache is successfully loaded, this
+// function returns 0.  Otherwise it returns nonzero.  Errno is set if
+// the failure to load the cache is due to anything except invalid
+// cache contents.  Note that failing to restore the cache is not
+// necessarily a problem: it might just be invalid or not created yet.
+static int cache_restore(const char *fname, const struct cache_hash *hash,
+                         unsigned char **buf, size_t *buflen);
+
+// Store cache contents in the given file, with the given hash.
+static int cache_store(const char *fname, const struct cache_hash *hash,
+                       const unsigned char *buf, size_t buflen);
+
+// Now for the implementation.
+
+static void cache_hash_init(struct cache_hash *c) {
+  memset(c->hash, 0, CACHE_HASH_SIZE * sizeof(uint32_t));
+}
+
+static void cache_hash(struct cache_hash *out, const char *in, size_t n) {
+  // Adaptation of djb2 for larger output size by storing intermediate
+  // states.
+  uint32_t hash = 5381;
+  for (size_t i = 0; i < n; i++) {
+    hash = ((hash << 5) + hash) + in[i];
+    out->hash[i % CACHE_HASH_SIZE] ^= hash;
+  }
+}
+
+#define CACHE_HEADER_SIZE 8
+static const char cache_header[CACHE_HEADER_SIZE] = "FUTHARK\0";
+
+// File layout, as written by cache_store(): the 8-byte header, an
+// int64_t holding the total file size, CACHE_HASH_SIZE 32-bit hash
+// words, then the payload.
+static int cache_restore(const char *fname, const struct cache_hash *hash,
+                         unsigned char **buf, size_t *buflen) {
+  FILE *f = fopen(fname, "rb");
+
+  if (f == NULL) {
+    return 1;
+  }
+
+  // From here on, a nonzero errno on failure means an I/O or allocation
+  // problem; invalid contents leave it at zero.
+  errno = 0;
+
+  char f_header[CACHE_HEADER_SIZE];
+
+  if (fread(f_header, sizeof(char), CACHE_HEADER_SIZE, f) != CACHE_HEADER_SIZE) {
+    goto error;
+  }
+
+  if (memcmp(f_header, cache_header, CACHE_HEADER_SIZE) != 0) {
+    errno = 0;
+    goto error;
+  }
+
+  if (fseek(f, 0, SEEK_END) != 0) {
+    goto error;
+  }
+  int64_t f_size = (int64_t)ftell(f);
+  if (f_size < 0) {
+    goto error;
+  }
+  if (fseek(f, CACHE_HEADER_SIZE, SEEK_SET) != 0) {
+    goto error;
+  }
+
+  int64_t expected_size;
+
+  if (fread(&expected_size, sizeof(int64_t), 1, f) != 1) {
+    goto error;
+  }
+
+  if (f_size != expected_size) {
+    errno = 0;
+    goto error;
+  }
+
+  int32_t f_hash[CACHE_HASH_SIZE];
+
+  if (fread(f_hash, sizeof(int32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
+    goto error;
+  }
+
+  // Compare the whole hash.  CACHE_HASH_SIZE counts 32-bit words, not
+  // bytes, so it must not be used directly as the memcmp length.
+  if (memcmp(f_hash, hash->hash, sizeof(f_hash)) != 0) {
+    errno = 0;
+    goto error;
+  }
+
+  // The three reads above succeeded, so the file is at least as long as
+  // the fixed-size prefix and this subtraction cannot wrap.
+  *buflen = (size_t)f_size - CACHE_HEADER_SIZE - sizeof(int64_t) - sizeof(f_hash);
+  // Request at least one byte so that an empty payload does not depend
+  // on what malloc(0) returns.
+  *buf = malloc(*buflen > 0 ? *buflen : 1);
+  if (*buf == NULL) {
+    goto error;
+  }
+  if (fread(*buf, sizeof(char), *buflen, f) != *buflen) {
+    free(*buf);
+    *buf = NULL; // Do not hand a dangling pointer back to the caller.
+    goto error;
+  }
+
+  fclose(f);
+
+  return 0;
+
+ error:
+  {
+    // fclose() may itself change errno; keep the value that describes
+    // the actual failure.
+    int saved_errno = errno;
+    fclose(f);
+    errno = saved_errno;
+  }
+  return 1;
+}
+
+// Writes the header, the total file size, the hash and then 'buflen'
+// bytes from 'buf'.  Returns 0 on success and 1 on any failure,
+// including a failure to flush the data when closing the file.
+static int cache_store(const char *fname, const struct cache_hash *hash,
+                       const unsigned char *buf, size_t buflen) {
+  FILE *f = fopen(fname, "wb");
+
+  if (f == NULL) {
+    return 1;
+  }
+
+  if (fwrite(cache_header, CACHE_HEADER_SIZE, 1, f) != 1) {
+    goto error;
+  }
+
+  int64_t size = CACHE_HEADER_SIZE + sizeof(int64_t) + CACHE_HASH_SIZE*sizeof(int32_t) + buflen;
+
+  if (fwrite(&size, sizeof(size), 1, f) != 1) {
+    goto error;
+  }
+
+  if (fwrite(hash->hash, sizeof(int32_t), CACHE_HASH_SIZE, f) != CACHE_HASH_SIZE) {
+    goto error;
+  }
+
+  if (fwrite(buf, sizeof(unsigned char), buflen, f) != buflen) {
+    goto error;
+  }
+
+  // Buffered data is only written out here, so a failing close means
+  // the cache file is incomplete.
+  if (fclose(f) != 0) {
+    return 1;
+  }
+
+  return 0;
+
+ error:
+  fclose(f);
+  return 1;
+}
+
+// End of cache.h
+// Start of half.h.
+
+// Conversion functions are from http://half.sourceforge.net/, but
+// translated to C.
+//
+// Copyright (c) 2012-2021 Christian Rau
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef __OPENCL_VERSION__ +#define __constant +#endif + +__constant static const uint16_t base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; + +__constant static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + +__constant static const uint32_t mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 
0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 
0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 
+ 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 
0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 
0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 
0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 
0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 
0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 
0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 
0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 
0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; +__constant static const uint32_t exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 
0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; +__constant static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + +static uint16_t float2halfbits(float value) { + union { float x; uint32_t y; } u; + u.x = value; + uint32_t bits = u.y; + + uint16_t hbits = base_table[bits>>23] + (uint16_t)((bits&0x7FFFFF)>>shift_table[bits>>23]);; + + return hbits; +} + +static float halfbits2float(uint16_t value) { + uint32_t bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; + + union { uint32_t x; float y; } u; + u.x = bits; + return u.y; +} + +static uint16_t halfbitsnextafter(uint16_t from, uint16_t to) { + int fabs = from & 0x7FFF, tabs = to & 0x7FFF; + if(fabs > 0x7C00 || tabs > 0x7C00) { + return ((from&0x7FFF)>0x7C00) ? (from|0x200) : (to|0x200); + } + if(from == to || !(fabs|tabs)) { + return to; + } + if(!fabs) { + return (to&0x8000)+1; + } + unsigned int out = + from + + (((from>>15)^(unsigned int)((from^(0x8000|(0x8000-(from>>15))))<(to^(0x8000|(0x8000-(to>>15))))))<<1) + - 1; + return out; +} + +// End of half.h. +// Start of timing.h. + +// The function get_wall_time() returns the wall time in microseconds +// (with an unspecified offset). 
#ifdef _WIN32

#include <windows.h>

// Returns the wall time in microseconds (with an unspecified offset),
// derived from the Win32 performance counter.
static int64_t get_wall_time(void) {
  LARGE_INTEGER counter, freq;
  // Each call is made outside assert() so that it still happens when
  // the program is built with NDEBUG.
  BOOL ok = QueryPerformanceFrequency(&freq);
  assert(ok);
  ok = QueryPerformanceCounter(&counter);
  assert(ok);
  (void)ok;
  return ((double)counter.QuadPart / freq.QuadPart) * 1000000;
}

#else
// Assuming POSIX

// NOTE(review): the header names on the #include lines of this
// section were missing from this copy of the file; they are restored
// from the functions the code calls (clock_gettime, gettimeofday).
#include <time.h>
#include <sys/time.h>

// Returns the wall time in microseconds (with an unspecified offset).
static int64_t get_wall_time(void) {
  struct timeval tv;
  // Not inside assert(): the call must survive NDEBUG builds.
  int r = gettimeofday(&tv, NULL);
  assert(r == 0);
  (void)r;
  // Widen before multiplying so the product cannot overflow when
  // tv_sec is narrower than 64 bits.
  return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}

// Returns the wall time in nanoseconds (with an unspecified offset).
static int64_t get_wall_time_ns(void) {
  struct timespec ts;
#if defined(CLOCK_REALTIME) || !defined(TIME_UTC)
  int r = clock_gettime(CLOCK_REALTIME, &ts);
  assert(r == 0);
#else
  // <time.h> did not expose the POSIX clock interface (for instance a
  // strict ISO C build with no feature-test macro), but the C11
  // equivalent is available.
  int r = timespec_get(&ts, TIME_UTC);
  assert(r == TIME_UTC);
#endif
  (void)r;
  return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

#endif

// End of timing.h.
// Start of lock.h.

// A very simple cross-platform implementation of locks.  Uses
// pthread mutexes on Unix and Win32 mutex handles on Windows.
// Futhark's host-level code is not multithreaded, but user code may
// be, so we need some mechanism for ensuring atomic access to API
// functions.  This is that mechanism.  It is not exposed to user code
// at all, so we do not have to worry about name collisions.

#ifdef _WIN32

typedef HANDLE lock_t;

// Initialise *lock with a fresh, unowned, unnamed mutex.
static void create_lock(lock_t *lock) {
  *lock = CreateMutex(NULL,  // Default security attributes.
                      FALSE, // Initially unlocked.
                      NULL); // Unnamed.
}

// Block until the lock is acquired.
static void lock_lock(lock_t *lock) {
  // The wait itself must not live inside assert(), or NDEBUG builds
  // would never take the lock.
  DWORD r = WaitForSingleObject(*lock, INFINITE);
  assert(r == WAIT_OBJECT_0);
  (void)r;
}

// Release a lock previously taken with lock_lock().
static void lock_unlock(lock_t *lock) {
  BOOL ok = ReleaseMutex(*lock);
  assert(ok);
  (void)ok;
}

// Release the resources held by the lock.
static void free_lock(lock_t *lock) {
  CloseHandle(*lock);
}

#else
// Assuming POSIX

#include <pthread.h>

typedef pthread_mutex_t lock_t;

// Initialise *lock with default mutex attributes.
static void create_lock(lock_t *lock) {
  int r = pthread_mutex_init(lock, NULL);
  assert(r == 0);
  (void)r;
}

// Block until the lock is acquired.
static void lock_lock(lock_t *lock) {
  int r = pthread_mutex_lock(lock);
  assert(r == 0);
  (void)r;
}

// Release a lock previously taken with lock_lock().
static void lock_unlock(lock_t *lock) {
  int r = pthread_mutex_unlock(lock);
  assert(r == 0);
  (void)r;
}

static void free_lock(lock_t *lock) {
  // Nothing to do for pthreads.
  (void)lock;
}

#endif

// End of lock.h.
// Start of free_list.h.

typedef uintptr_t fl_mem;

// An entry in the free list.  May be invalid, to avoid having to
// deallocate entries as soon as they are removed.  There is also a
// tag, to help with memory reuse.
struct free_list_entry {
  size_t size;
  fl_mem mem;
  const char *tag;
  unsigned char valid;
};

struct free_list {
  struct free_list_entry *entries; // Pointer to entries.
  int capacity;                    // Number of entries.
  int used;                        // Number of valid entries.
  lock_t lock;                     // Thread safety.
};

// Set up an empty free list with the default capacity.  Aborts the
// process if the entry array cannot be allocated, since every later
// operation dereferences it.
static void free_list_init(struct free_list *l) {
  l->capacity = 30; // Picked arbitrarily.
  l->used = 0;
  l->entries = malloc(sizeof *l->entries * (size_t)l->capacity);
  if (l->entries == NULL) {
    abort();
  }
  for (int i = 0; i < l->capacity; i++) {
    l->entries[i].valid = 0;
  }
  create_lock(&l->lock);
}

// Remove invalid entries from the free list.
static void free_list_pack(struct free_list *l) {
  lock_lock(&l->lock);
  int p = 0;
  // Slide every valid entry down to the front, preserving order.
  for (int i = 0; i < l->capacity; i++) {
    if (l->entries[i].valid) {
      l->entries[p] = l->entries[i];
      if (i > p) {
        l->entries[i].valid = 0;
      }
      p++;
    }
  }

  // Now p is the number of used elements.  We don't want it to go
  // less than the default capacity (although in practice it's OK as
  // long as it doesn't become 1).
  if (p < 30) {
    p = 30;
  }

  // Only commit the smaller array if realloc() succeeded.  On failure
  // the old, larger array is still owned by the list and every slot
  // past the packed prefix has been marked invalid above, so the list
  // stays consistent with its old capacity.
  struct free_list_entry *shrunk =
    realloc(l->entries, (size_t)p * sizeof *shrunk);
  if (shrunk != NULL) {
    l->entries = shrunk;
    l->capacity = p;
  }
  lock_unlock(&l->lock);
}

// Release the entry array and the lock.  The list must already be
// empty (checked with assert); the memory blocks it tracked are not
// touched here.
static void free_list_destroy(struct free_list *l) {
  assert(l->used == 0);
  free(l->entries);
  free_lock(&l->lock);
}

// Not part of the interface, so no locking.
+static int free_list_find_invalid(struct free_list *l) { + int i; + for (i = 0; i < l->capacity; i++) { + if (!l->entries[i].valid) { + break; + } + } + return i; +} + +static void free_list_insert(struct free_list *l, size_t size, fl_mem mem, const char *tag) { + lock_lock(&l->lock); + int i = free_list_find_invalid(l); + + if (i == l->capacity) { + // List is full; so we have to grow it. + int new_capacity = l->capacity * 2 * sizeof(struct free_list_entry); + l->entries = realloc(l->entries, new_capacity); + for (int j = 0; j < l->capacity; j++) { + l->entries[j+l->capacity].valid = 0; + } + l->capacity *= 2; + } + + // Now 'i' points to the first invalid entry. + l->entries[i].valid = 1; + l->entries[i].size = size; + l->entries[i].mem = mem; + l->entries[i].tag = tag; + + l->used++; + lock_unlock(&l->lock); +} + +// Determine whether this entry in the free list is acceptable for +// satisfying the request. Not public, so no locking. +static bool free_list_acceptable(size_t size, const char* tag, struct free_list_entry *entry) { + // We check not just the hard requirement (is the entry acceptable + // and big enough?) but also put a cap on how much wasted space + // (internal fragmentation) we allow. This is necessarily a + // heuristic, and a crude one. + + if (!entry->valid) { + return false; + } + + if (size > entry->size) { + return false; + } + + // We know the block fits. Now the question is whether it is too + // big. Our policy is as follows: + // + // 1) We don't care about wasted space below 4096 bytes (to avoid + // churn in tiny allocations). + // + // 2) If the tag matches, we allow _any_ amount of wasted space. + // + // 3) Otherwise we allow up to 50% wasted space. 
+ + if (entry->size < 4096) { + return true; + } + + if (entry->tag == tag) { + return true; + } + + if (entry->size < size * 2) { + return true; + } + + return false; +} + +// Find and remove a memory block of the indicated tag, or if that +// does not exist, another memory block with exactly the desired size. +// Returns 0 on success. +static int free_list_find(struct free_list *l, size_t size, const char *tag, + size_t *size_out, fl_mem *mem_out) { + lock_lock(&l->lock); + int size_match = -1; + int i; + int ret = 1; + for (i = 0; i < l->capacity; i++) { + if (free_list_acceptable(size, tag, &l->entries[i]) && + (size_match < 0 || l->entries[i].size < l->entries[size_match].size)) { + // If this entry is valid, has sufficient size, and is smaller than the + // best entry found so far, use this entry. + size_match = i; + } + } + + if (size_match >= 0) { + l->entries[size_match].valid = 0; + *size_out = l->entries[size_match].size; + *mem_out = l->entries[size_match].mem; + l->used--; + ret = 0; + } + lock_unlock(&l->lock); + return ret; +} + +// Remove the first block in the free list. Returns 0 if a block was +// removed, and nonzero if the free list was already empty. +static int free_list_first(struct free_list *l, fl_mem *mem_out) { + lock_lock(&l->lock); + int ret = 1; + for (int i = 0; i < l->capacity; i++) { + if (l->entries[i].valid) { + l->entries[i].valid = 0; + *mem_out = l->entries[i].mem; + l->used--; + ret = 0; + break; + } + } + lock_unlock(&l->lock); + return ret; +} + +// End of free_list.h. + +#ifdef _MSC_VER +#define inline __inline +#endif +#include +#include +#include +#include +#include + + + +#define FUTHARK_F64_ENABLED + +// Start of scalar.h. + +// Implementation of the primitive scalar operations. Very +// repetitive. This code is inserted directly into both CUDA and +// OpenCL programs, as well as the CPU code, so it has some #ifdefs to +// work everywhere. 
Some operations are defined as macros because +// this allows us to use them as constant expressions in things like +// array sizes and static initialisers. + +// Some of the #ifdefs are because OpenCL uses type-generic functions +// for some operations (e.g. sqrt), while C and CUDA sensibly use +// distinct functions for different precisions (e.g. sqrtf() and +// sqrt()). This is quite annoying. Due to C's unfortunate casting +// rules, it is also really easy to accidentally implement +// floating-point functions in the wrong precision, so be careful. + +// Double-precision definitions are only included if the preprocessor +// macro FUTHARK_F64_ENABLED is set. + +static inline uint8_t add8(uint8_t x, uint8_t y) { + return x + y; +} + +static inline uint16_t add16(uint16_t x, uint16_t y) { + return x + y; +} + +static inline uint32_t add32(uint32_t x, uint32_t y) { + return x + y; +} + +static inline uint64_t add64(uint64_t x, uint64_t y) { + return x + y; +} + +static inline uint8_t sub8(uint8_t x, uint8_t y) { + return x - y; +} + +static inline uint16_t sub16(uint16_t x, uint16_t y) { + return x - y; +} + +static inline uint32_t sub32(uint32_t x, uint32_t y) { + return x - y; +} + +static inline uint64_t sub64(uint64_t x, uint64_t y) { + return x - y; +} + +static inline uint8_t mul8(uint8_t x, uint8_t y) { + return x * y; +} + +static inline uint16_t mul16(uint16_t x, uint16_t y) { + return x * y; +} + +static inline uint32_t mul32(uint32_t x, uint32_t y) { + return x * y; +} + +static inline uint64_t mul64(uint64_t x, uint64_t y) { + return x * y; +} + +#if ISPC + +static inline uint8_t udiv8(uint8_t x, uint8_t y) { + // This strange pattern is used to prevent the ISPC compiler from + // causing SIGFPEs and bogus results on divisions where inactive lanes + // have 0-valued divisors. It ensures that any inactive lane instead + // has a divisor of 1. 
https://github.com/ispc/ispc/issues/2292 + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +static inline uint16_t udiv16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x / ys; +} + +static inline uint32_t udiv32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +static inline uint64_t udiv64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x / ys; +} + +static inline uint8_t udiv_up8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return (x + y - 1) / ys; +} + +static inline uint16_t udiv_up16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +static inline uint32_t udiv_up32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +static inline uint64_t udiv_up64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return (x + y - 1) / ys; +} + +static inline uint8_t umod8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +static inline uint16_t umod16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + + return x % ys; +} + +static inline uint32_t umod32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +static inline uint64_t umod64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return x % ys; +} + +static inline uint8_t udiv_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +static inline uint16_t udiv_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 
0 : x / ys; +} + +static inline uint32_t udiv_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +static inline uint64_t udiv_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x / ys; +} + +static inline uint8_t udiv_up_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +static inline uint16_t udiv_up_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +static inline uint32_t udiv_up_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +static inline uint64_t udiv_up_safe64(uint64_t x, uint64_t y) { + uint64_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : (x + y - 1) / ys; +} + +static inline uint8_t umod_safe8(uint8_t x, uint8_t y) { + uint8_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +static inline uint16_t umod_safe16(uint16_t x, uint16_t y) { + uint16_t ys = 1; + foreach_active(i){ + ys = y; + } + + return y == 0 ? 0 : x % ys; +} + +static inline uint32_t umod_safe32(uint32_t x, uint32_t y) { + uint32_t ys = 1; + foreach_active(i){ +