diff options
Diffstat (limited to 'dot.c')
-rw-r--r-- | dot.c | 173 |
1 files changed, 173 insertions, 0 deletions
@@ -0,0 +1,173 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. + +#include<math.h> +#include<fcntl.h> +#include<stdint.h> +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<sys/mman.h> +#include<sys/stat.h> +#include<sys/types.h> +#include<unistd.h> +#include<zlib.h> +#include"dict.h" + +void +usage(char *cmd) +{ + fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd); + exit(-1); +} + +void +transpose64(uint64_t A[64]) { + int j, k; + uint64_t m, t; + + m = 0x00000000FFFFFFFFul; + for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j)) + for (k = 0; k < 64; k = (k + j + 1) & ~j) { + t = (A[k] ^ (A[k+j] >> j)) & m; + A[k] = A[k] ^ t; + A[k+j] = A[k+j] ^ (t << j); + } +} + +uint64_t * +map(char *path) +{ + struct stat sb; + int fd = open(path, O_RDONLY); + if(fd == -1) + error("open"); + if(fstat(fd, &sb) == -1) + error("fstat"); + if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ + fprintf(stderr, "incompatible bloom table\n"); + exit(-1); + } + void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + if(addr == MAP_FAILED) + error("mmap"); + return addr; +} + +double * +mapent() +{ + char path[PATH_MAX]; + static char *template = "dot.XXXXXX"; + strcpy(path, template); + int fd = mkstemp(path); + if(fd == -1) + error("open"); + ssize_t sz = 64L * BLOOMSIZE; + sz += 63 - (sz % 64); + if(ftruncate(fd, sz * sizeof(double)) == -1) + error("ftruncate"); + void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0); + if(addr == MAP_FAILED) + error("mmap"); + //bzero(addr, 64L * BLOOMSIZE * sizeof(double)); + unlink(path); + return addr; +} + +int +main(int argc, char **argv) +{ + enum {DOT, JAC, ENT} mode = DOT; + + ARGBEGIN{ + case 'h': + usage(argv0); + case 'j': + mode = JAC; + break; + case 'd': + mode = DOT; + break; + case 'e': + mode = ENT; + break; + default: + usage(argv0); + } ARGEND + + if(argc < 2) + usage(argv0); + + + uint64_t *blooms[argc]; + + for(int i = 0; i < argc; i++) + blooms[i] = map(argv[i]); + + if(mode == ENT){ + double *ent = mapent(); + + for(int k = 0; k < BLOOMSIZE; k++) + for(int i = 0; i < argc; i+=64){ + uint64_t tmp[64]; + for(int j = 0; j < 64; j++) + tmp[j] = i + j < argc ? blooms[i + j][k] : 0; + transpose64(tmp); + for(int j = 0; j < 64; j++){ + ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]); + } + } + + for(int i = 0; i < 64 * BLOOMSIZE; i++){ + double p = ent[i] / (double)argc; + ent[i] = p == 0 ? 0 : -p * log(p); + } + fprintf(stderr, "entropy calculated\n"); + + for(int i = 0; i < argc; i++) + for(int j = i; j < argc; j++){ + double dot = 0; + for(int k = 0; k < BLOOMSIZE; k++){ + uint64_t word = blooms[i][k] & blooms[j][k]; + for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word)) + dot += ent[l + 64 * k]; + } + printf("%g\n", dot); + } + + return 0; + } + + for(int i = 0; i < argc; i++) + for(int j = i; j < argc; j++){ + uint64_t dot = 0; + uint64_t un = 0; + for(int k = 0; k < BLOOMSIZE; k++){ + dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]); + un += __builtin_popcountll(blooms[i][k] | blooms[j][k]); + } + switch(mode){ + case JAC: + printf("%g\n", (double)dot / (double)un); + break; + default: + printf("%lu\n", dot); + break; + } + } + + return 0; +} |