From e0ddb4b25a399d1769165806d5919c6af8c173cf Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Wed, 3 Aug 2016 01:39:24 +0000 Subject: initial implementation --- CMakeLists.txt | 8 +++ LICENSE | 13 +++++ dict.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ dict.h | 54 ++++++++++++++++++ dot.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 420 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 dict.c create mode 100644 dict.h create mode 100644 dot.c diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f374de4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.4) +project(genodict) +find_package(ZLIB) +add_executable(dict dict.c dict.h) +add_executable(dot dot.c dict.h) +target_link_libraries(dict ${ZLIB_LIBRARIES}) +target_link_libraries(dot m) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -O3") diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c462bea --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright © 2016 by Justin Bedő + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR +IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/dict.c b/dict.c new file mode 100644 index 0000000..7bb0e63 --- /dev/null +++ b/dict.c @@ -0,0 +1,172 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include"dict.h" + +//#define BITMIX + +typedef enum { + TINSEQ, + TOUTSEQ +} TState; + +typedef struct { + gzFile f; + TState s; +} tokeniser; + +tokeniser* +new_tokeniser(char *path) +{ + tokeniser *tok = malloc(sizeof(tokeniser)); + if(tok == NULL) + error("new_tokeniser"); + tok->f = gzopen(path, "rb"); + if(tok->f == NULL) + error("gzopen"); + tok->s = TOUTSEQ; + return tok; +} + +int +getTok(tokeniser *tok) +{ + int c; + switch(tok->s){ + case TOUTSEQ: + c = gzgetc(tok->f); + while(c != -1 && c != '@') c = gzgetc(tok->f); + while(c != -1 && c != '\n') c = gzgetc(tok->f); + tok->s = TINSEQ; + case TINSEQ: + c = gzgetc(tok->f); + while(c != -1 && c == '\n') c = gzgetc(tok->f); + switch(c){ + case -1: + return -1; + case 'A': + case 'a': + return 0; + case 'C': + case 'c': + return 1; + case 'G': + case 'g': + return 2; + case 'T': + case 't': + return 3; + case '+': + tok->s = TOUTSEQ; + default: + return -2; + } + } +} + +void +usage(char *name) +{ + fprintf(stderr, "usage: %s \n", name); + exit(-1); +} + +inline +uint64_t bitmix(uint64_t x) +{ + x ^= x >> 31; + x *= 0x7fb5d329728ea185; + x ^= x >> 27; + x *= 0x81dadef4bc2dd44d; + x ^= x >> 33; + return x; +} + +int +main(int argc, char **argv) +{ + if(argc != 3) + usage(argv[0]); + + tokeniser *t = new_tokeniser(argv[1]); + + // map bloom filter + uint64_t *bloom; + if(access(argv[2], F_OK) == -1){ + int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if(bfd == -1) + error("open"); + if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1) + error("ftruncate"); + bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); + if(bloom == MAP_FAILED) + error("mmap"); + bzero(bloom, BLOOMSIZE * sizeof(uint64_t)); + }else{ + struct stat sb; + int bfd = open(argv[2], O_RDWR); + if(bfd == -1) + error("open"); + if(fstat(bfd, &sb) == -1) + error("fstat"); + if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ + fprintf(stderr, "incompatible bloom table\n"); + return -1; + } + bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); + if(bloom == MAP_FAILED) + error("mmap"); + } + + int tok; + uint64_t term = 0; + int bits; + + while((tok = getTok(t)) != -1){ + if(tok < 0){ + term = 0; + continue; + } + + term ^= term << 3; + term += tok; + //term = (term * 16777619) ^ tok; + +#ifdef BITMIX + bits = bitmix(term); +#else + bits = term; +#endif + if(!bit(bloom,bits)){ + term = 0; + set(bloom,bits); + } + } + + int cnt = 0; + for(int i = 0; i < BLOOMSIZE; i++) + cnt += __builtin_popcountll(bloom[i]); + printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64); +} diff --git a/dict.h b/dict.h new file mode 100644 index 0000000..4e4aac5 --- /dev/null +++ b/dict.h @@ -0,0 +1,54 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. + +#ifndef __dict_h +#define __dict_h + +#define error(msg) \ + do { perror(msg); exit(EXIT_FAILURE); } while (0) + +#define BLOOMDEG 23 +#define BLOOMSIZE (1 << BLOOMDEG) +#define bit(bf, i) \ + ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] & (1UL << ((i) & 63))) +#define set(bf, i) \ + ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] |= (1UL << ((i) & 63))) + +// Command line args +char *argv0; +#define SET(x) ((x) = 0) +#define USED(x) ((void)(x)) +#define ARGBEGIN for((argv0?0:(argv0=*argv)),argv++,argc--;\ + argv[0] && argv[0][0]=='-' && argv[0][1];\ + argc--, argv++) {\ + char *_args, *_argt;\ + char _argc;\ + _args = &argv[0][1];\ + if(_args[0]=='-' && _args[1]==0){\ + argc--; argv++; break;\ + }\ + _argc = 0;\ + while(*_args && (_args += (_argc = *_args, 1)))\ + switch(_argc) +#define ARGEND SET(_argt);USED(_argt);USED(_argc);USED(_args);}USED(argv);USED(argc); +#define ARGF() (_argt=_args, _args="",\ + (*_argt? _argt: argv[1]? (argc--, *++argv): 0)) +#define EARGF(x) (_argt=_args, _args="",\ + (*_argt? _argt: argv[1]? (argc--, *++argv): ((x), abort(), (char*)0))) + +#define ARGC() _argc + + +#endif diff --git a/dot.c b/dot.c new file mode 100644 index 0000000..e4375e2 --- /dev/null +++ b/dot.c @@ -0,0 +1,173 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include"dict.h" + +void +usage(char *cmd) +{ + fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd); + exit(-1); +} + +void +transpose64(uint64_t A[64]) { + int j, k; + uint64_t m, t; + + m = 0x00000000FFFFFFFFul; + for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j)) + for (k = 0; k < 64; k = (k + j + 1) & ~j) { + t = (A[k] ^ (A[k+j] >> j)) & m; + A[k] = A[k] ^ t; + A[k+j] = A[k+j] ^ (t << j); + } +} + +uint64_t * +map(char *path) +{ + struct stat sb; + int fd = open(path, O_RDONLY); + if(fd == -1) + error("open"); + if(fstat(fd, &sb) == -1) + error("fstat"); + if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ + fprintf(stderr, "incompatible bloom table\n"); + exit(-1); + } + void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + if(addr == MAP_FAILED) + error("mmap"); + return addr; +} + +double * +mapent() +{ + char path[PATH_MAX]; + static char *template = "dot.XXXXXX"; + strcpy(path, template); + int fd = mkstemp(path); + if(fd == -1) + error("open"); + ssize_t sz = 64L * BLOOMSIZE; + sz += 63 - (sz % 64); + if(ftruncate(fd, sz * sizeof(double)) == -1) + error("ftruncate"); + void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0); + if(addr == MAP_FAILED) + error("mmap"); + //bzero(addr, 64L * BLOOMSIZE * sizeof(double)); + unlink(path); + return addr; +} + +int +main(int argc, char **argv) +{ + enum {DOT, JAC, ENT} mode = DOT; + + ARGBEGIN{ + case 'h': + usage(argv0); + case 'j': + mode = JAC; + break; + case 'd': + mode = DOT; + break; + case 'e': + mode = ENT; + break; + default: + usage(argv0); + } ARGEND + + if(argc < 2) + usage(argv0); + + + uint64_t *blooms[argc]; + + for(int i = 0; i < argc; i++) + blooms[i] = map(argv[i]); + + if(mode == ENT){ + double *ent = mapent(); + + for(int k = 0; k < BLOOMSIZE; k++) + for(int i = 0; i < argc; i+=64){ + uint64_t tmp[64]; + for(int j = 0; j < 64; j++) + tmp[j] = i + j < argc ? blooms[i + j][k] : 0; + transpose64(tmp); + for(int j = 0; j < 64; j++){ + ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]); + } + } + + for(int i = 0; i < 64 * BLOOMSIZE; i++){ + double p = ent[i] / (double)argc; + ent[i] = p == 0 ? 0 : -p * log(p); + } + fprintf(stderr, "entropy calculated\n"); + + for(int i = 0; i < argc; i++) + for(int j = i; j < argc; j++){ + double dot = 0; + for(int k = 0; k < BLOOMSIZE; k++){ + uint64_t word = blooms[i][k] & blooms[j][k]; + for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word)) + dot += ent[l + 64 * k]; + } + printf("%g\n", dot); + } + + return 0; + } + + for(int i = 0; i < argc; i++) + for(int j = i; j < argc; j++){ + uint64_t dot = 0; + uint64_t un = 0; + for(int k = 0; k < BLOOMSIZE; k++){ + dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]); + un += __builtin_popcountll(blooms[i][k] | blooms[j][k]); + } + switch(mode){ + case JAC: + printf("%g\n", (double)dot / (double)un); + break; + default: + printf("%lu\n", dot); + break; + } + } + + return 0; +} -- cgit v1.2.3