// Copyright © 2016 by Justin Bedő // // Permission to use, copy, modify, and/or distribute this software for // any purpose with or without fee is hereby granted, provided that the // above copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL // WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR // BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES // OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, // WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, // ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS // SOFTWARE. #include #include #include #include #include #include #include #include #include #include #include #include"dict.h" void usage(char *cmd) { fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd); exit(-1); } void transpose64(uint64_t A[64]) { int j, k; uint64_t m, t; m = 0x00000000FFFFFFFFul; for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j)) for (k = 0; k < 64; k = (k + j + 1) & ~j) { t = (A[k] ^ (A[k+j] >> j)) & m; A[k] = A[k] ^ t; A[k+j] = A[k+j] ^ (t << j); } } uint64_t * map(char *path) { struct stat sb; int fd = open(path, O_RDONLY); if(fd == -1) error("open"); if(fstat(fd, &sb) == -1) error("fstat"); if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ fprintf(stderr, "incompatible bloom table\n"); exit(-1); } void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); if(addr == MAP_FAILED) error("mmap"); return addr; } double * mapent() { char path[PATH_MAX]; static char *template = "dot.XXXXXX"; strcpy(path, template); int fd = mkstemp(path); if(fd == -1) error("open"); ssize_t sz = 64L * BLOOMSIZE; sz += 63 - (sz % 64); if(ftruncate(fd, sz * sizeof(double)) == -1) error("ftruncate"); void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0); if(addr == MAP_FAILED) error("mmap"); //bzero(addr, 64L * BLOOMSIZE * sizeof(double)); unlink(path); return addr; } int main(int argc, char **argv) { enum {DOT, JAC, ENT} mode = DOT; ARGBEGIN{ case 'h': usage(argv0); case 'j': mode = JAC; break; case 'd': mode = DOT; break; case 'e': mode = ENT; break; default: usage(argv0); } ARGEND if(argc < 2) usage(argv0); uint64_t *blooms[argc]; for(int i = 0; i < argc; i++) blooms[i] = map(argv[i]); if(mode == ENT){ double *ent = mapent(); for(int k = 0; k < BLOOMSIZE; k++) for(int i = 0; i < argc; i+=64){ uint64_t tmp[64]; for(int j = 0; j < 64; j++) tmp[j] = i + j < argc ? blooms[i + j][k] : 0; transpose64(tmp); for(int j = 0; j < 64; j++){ ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]); } } for(int i = 0; i < 64 * BLOOMSIZE; i++){ double p = ent[i] / (double)argc; ent[i] = p == 0 ? 0 : -p * log(p); } fprintf(stderr, "entropy calculated\n"); for(int i = 0; i < argc; i++) for(int j = i; j < argc; j++){ double dot = 0; for(int k = 0; k < BLOOMSIZE; k++){ uint64_t word = blooms[i][k] & blooms[j][k]; for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word)) dot += ent[l + 64 * k]; } printf("%g\n", dot); } return 0; } for(int i = 0; i < argc; i++) for(int j = i; j < argc; j++){ uint64_t dot = 0; uint64_t un = 0; for(int k = 0; k < BLOOMSIZE; k++){ dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]); un += __builtin_popcountll(blooms[i][k] | blooms[j][k]); } switch(mode){ case JAC: printf("%g\n", (double)dot / (double)un); break; default: printf("%lu\n", dot); break; } } return 0; }