summaryrefslogtreecommitdiff
path: root/dot.c
diff options
context:
space:
mode:
Diffstat (limited to 'dot.c')
-rw-r--r--dot.c173
1 files changed, 173 insertions, 0 deletions
diff --git a/dot.c b/dot.c
new file mode 100644
index 0000000..e4375e2
--- /dev/null
+++ b/dot.c
@@ -0,0 +1,173 @@
+// Copyright © 2016 by Justin Bedő
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR
+// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+// SOFTWARE.
+
+#include<math.h>
+#include<fcntl.h>
+#include<stdint.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<sys/mman.h>
+#include<sys/stat.h>
+#include<sys/types.h>
+#include<unistd.h>
+#include<zlib.h>
+#include"dict.h"
+
+void
+usage(char *cmd)
+{
+ fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd);
+ exit(-1);
+}
+
+void
+transpose64(uint64_t A[64]) {
+ int j, k;
+ uint64_t m, t;
+
+ m = 0x00000000FFFFFFFFul;
+ for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j))
+ for (k = 0; k < 64; k = (k + j + 1) & ~j) {
+ t = (A[k] ^ (A[k+j] >> j)) & m;
+ A[k] = A[k] ^ t;
+ A[k+j] = A[k+j] ^ (t << j);
+ }
+}
+
+uint64_t *
+map(char *path)
+{
+ struct stat sb;
+ int fd = open(path, O_RDONLY);
+ if(fd == -1)
+ error("open");
+ if(fstat(fd, &sb) == -1)
+ error("fstat");
+ if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){
+ fprintf(stderr, "incompatible bloom table\n");
+ exit(-1);
+ }
+ void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
+ if(addr == MAP_FAILED)
+ error("mmap");
+ return addr;
+}
+
+double *
+mapent()
+{
+ char path[PATH_MAX];
+ static char *template = "dot.XXXXXX";
+ strcpy(path, template);
+ int fd = mkstemp(path);
+ if(fd == -1)
+ error("open");
+ ssize_t sz = 64L * BLOOMSIZE;
+ sz += 63 - (sz % 64);
+ if(ftruncate(fd, sz * sizeof(double)) == -1)
+ error("ftruncate");
+ void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0);
+ if(addr == MAP_FAILED)
+ error("mmap");
+ //bzero(addr, 64L * BLOOMSIZE * sizeof(double));
+ unlink(path);
+ return addr;
+}
+
+int
+main(int argc, char **argv)
+{
+ enum {DOT, JAC, ENT} mode = DOT;
+
+ ARGBEGIN{
+ case 'h':
+ usage(argv0);
+ case 'j':
+ mode = JAC;
+ break;
+ case 'd':
+ mode = DOT;
+ break;
+ case 'e':
+ mode = ENT;
+ break;
+ default:
+ usage(argv0);
+ } ARGEND
+
+ if(argc < 2)
+ usage(argv0);
+
+
+ uint64_t *blooms[argc];
+
+ for(int i = 0; i < argc; i++)
+ blooms[i] = map(argv[i]);
+
+ if(mode == ENT){
+ double *ent = mapent();
+
+ for(int k = 0; k < BLOOMSIZE; k++)
+ for(int i = 0; i < argc; i+=64){
+ uint64_t tmp[64];
+ for(int j = 0; j < 64; j++)
+ tmp[j] = i + j < argc ? blooms[i + j][k] : 0;
+ transpose64(tmp);
+ for(int j = 0; j < 64; j++){
+ ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]);
+ }
+ }
+
+ for(int i = 0; i < 64 * BLOOMSIZE; i++){
+ double p = ent[i] / (double)argc;
+ ent[i] = p == 0 ? 0 : -p * log(p);
+ }
+ fprintf(stderr, "entropy calculated\n");
+
+ for(int i = 0; i < argc; i++)
+ for(int j = i; j < argc; j++){
+ double dot = 0;
+ for(int k = 0; k < BLOOMSIZE; k++){
+ uint64_t word = blooms[i][k] & blooms[j][k];
+ for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word))
+ dot += ent[l + 64 * k];
+ }
+ printf("%g\n", dot);
+ }
+
+ return 0;
+ }
+
+ for(int i = 0; i < argc; i++)
+ for(int j = i; j < argc; j++){
+ uint64_t dot = 0;
+ uint64_t un = 0;
+ for(int k = 0; k < BLOOMSIZE; k++){
+ dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]);
+ un += __builtin_popcountll(blooms[i][k] | blooms[j][k]);
+ }
+ switch(mode){
+ case JAC:
+ printf("%g\n", (double)dot / (double)un);
+ break;
+ default:
+ printf("%lu\n", dot);
+ break;
+ }
+ }
+
+ return 0;
+}