summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2016-08-03 01:39:24 +0000
committerJustin Bedo <cu@cua0.org>2016-08-03 01:56:42 +0000
commite0ddb4b25a399d1769165806d5919c6af8c173cf (patch)
treebe7c26a1d643ad266b1a8797d1e06919cdec4634
initial implementation
-rw-r--r--CMakeLists.txt8
-rw-r--r--LICENSE13
-rw-r--r--dict.c172
-rw-r--r--dict.h54
-rw-r--r--dot.c173
5 files changed, 420 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f374de4
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.4)
+project(genodict)
+find_package(ZLIB)
+add_executable(dict dict.c dict.h)
+add_executable(dot dot.c dict.h)
+target_link_libraries(dict ${ZLIB_LIBRARIES})
+target_link_libraries(dot m)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -O3")
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c462bea
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,13 @@
+Copyright © 2016 by Justin Bedő
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/dict.c b/dict.c
new file mode 100644
index 0000000..7bb0e63
--- /dev/null
+++ b/dict.c
@@ -0,0 +1,172 @@
+// Copyright © 2016 by Justin Bedő
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR
+// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+// SOFTWARE.
+//
+#include<fcntl.h>
+#include<stdint.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<sys/mman.h>
+#include<sys/stat.h>
+#include<sys/types.h>
+#include<unistd.h>
+#include<zlib.h>
+#include"dict.h"
+
+//#define BITMIX
+
+typedef enum {
+ TINSEQ,
+ TOUTSEQ
+} TState;
+
+typedef struct {
+ gzFile f;
+ TState s;
+} tokeniser;
+
+tokeniser*
+new_tokeniser(char *path)
+{
+ tokeniser *tok = malloc(sizeof(tokeniser));
+ if(tok == NULL)
+ error("new_tokeniser");
+ tok->f = gzopen(path, "rb");
+ if(tok->f == NULL)
+ error("gzopen");
+ tok->s = TOUTSEQ;
+ return tok;
+}
+
+int
+getTok(tokeniser *tok)
+{
+ int c;
+ switch(tok->s){
+ case TOUTSEQ:
+ c = gzgetc(tok->f);
+ while(c != -1 && c != '@') c = gzgetc(tok->f);
+ while(c != -1 && c != '\n') c = gzgetc(tok->f);
+ tok->s = TINSEQ;
+ case TINSEQ:
+ c = gzgetc(tok->f);
+ while(c != -1 && c == '\n') c = gzgetc(tok->f);
+ switch(c){
+ case -1:
+ return -1;
+ case 'A':
+ case 'a':
+ return 0;
+ case 'C':
+ case 'c':
+ return 1;
+ case 'G':
+ case 'g':
+ return 2;
+ case 'T':
+ case 't':
+ return 3;
+ case '+':
+ tok->s = TOUTSEQ;
+ default:
+ return -2;
+ }
+ }
+}
+
+void
+usage(char *name)
+{
+ fprintf(stderr, "usage: %s <input fastq> <bloom output>\n", name);
+ exit(-1);
+}
+
+inline
+uint64_t bitmix(uint64_t x)
+{
+ x ^= x >> 31;
+ x *= 0x7fb5d329728ea185;
+ x ^= x >> 27;
+ x *= 0x81dadef4bc2dd44d;
+ x ^= x >> 33;
+ return x;
+}
+
+int
+main(int argc, char **argv)
+{
+ if(argc != 3)
+ usage(argv[0]);
+
+ tokeniser *t = new_tokeniser(argv[1]);
+
+ // map bloom filter
+ uint64_t *bloom;
+ if(access(argv[2], F_OK) == -1){
+ int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if(bfd == -1)
+ error("open");
+ if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1)
+ error("ftruncate");
+ bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0);
+ if(bloom == MAP_FAILED)
+ error("mmap");
+ bzero(bloom, BLOOMSIZE * sizeof(uint64_t));
+ }else{
+ struct stat sb;
+ int bfd = open(argv[2], O_RDWR);
+ if(bfd == -1)
+ error("open");
+ if(fstat(bfd, &sb) == -1)
+ error("fstat");
+ if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){
+ fprintf(stderr, "incompatible bloom table\n");
+ return -1;
+ }
+ bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0);
+ if(bloom == MAP_FAILED)
+ error("mmap");
+ }
+
+ int tok;
+ uint64_t term = 0;
+ int bits;
+
+ while((tok = getTok(t)) != -1){
+ if(tok < 0){
+ term = 0;
+ continue;
+ }
+
+ term ^= term << 3;
+ term += tok;
+ //term = (term * 16777619) ^ tok;
+
+#ifdef BITMIX
+ bits = bitmix(term);
+#else
+ bits = term;
+#endif
+ if(!bit(bloom,bits)){
+ term = 0;
+ set(bloom,bits);
+ }
+ }
+
+ int cnt = 0;
+ for(int i = 0; i < BLOOMSIZE; i++)
+ cnt += __builtin_popcountll(bloom[i]);
+ printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64);
+}
diff --git a/dict.h b/dict.h
new file mode 100644
index 0000000..4e4aac5
--- /dev/null
+++ b/dict.h
@@ -0,0 +1,54 @@
+// Copyright © 2016 by Justin Bedő
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR
+// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+// SOFTWARE.
+
+#ifndef __dict_h
+#define __dict_h
+
+#define error(msg) \
+ do { perror(msg); exit(EXIT_FAILURE); } while (0)
+
+#define BLOOMDEG 23
+#define BLOOMSIZE (1 << BLOOMDEG)
+#define bit(bf, i) \
+ ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] & (1UL << ((i) & 63)))
+#define set(bf, i) \
+ ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] |= (1UL << ((i) & 63)))
+
+// Command line args
+char *argv0;
+#define SET(x) ((x) = 0)
+#define USED(x) ((void)(x))
+#define ARGBEGIN for((argv0?0:(argv0=*argv)),argv++,argc--;\
+ argv[0] && argv[0][0]=='-' && argv[0][1];\
+ argc--, argv++) {\
+ char *_args, *_argt;\
+ char _argc;\
+ _args = &argv[0][1];\
+ if(_args[0]=='-' && _args[1]==0){\
+ argc--; argv++; break;\
+ }\
+ _argc = 0;\
+ while(*_args && (_args += (_argc = *_args, 1)))\
+ switch(_argc)
+#define ARGEND SET(_argt);USED(_argt);USED(_argc);USED(_args);}USED(argv);USED(argc);
+#define ARGF() (_argt=_args, _args="",\
+ (*_argt? _argt: argv[1]? (argc--, *++argv): 0))
+#define EARGF(x) (_argt=_args, _args="",\
+ (*_argt? _argt: argv[1]? (argc--, *++argv): ((x), abort(), (char*)0)))
+
+#define ARGC() _argc
+
+
+#endif
diff --git a/dot.c b/dot.c
new file mode 100644
index 0000000..e4375e2
--- /dev/null
+++ b/dot.c
@@ -0,0 +1,173 @@
+// Copyright © 2016 by Justin Bedő
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR
+// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+// SOFTWARE.
+
+#include<math.h>
+#include<fcntl.h>
+#include<stdint.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<sys/mman.h>
+#include<sys/stat.h>
+#include<sys/types.h>
+#include<unistd.h>
+#include<zlib.h>
+#include"dict.h"
+
+void
+usage(char *cmd)
+{
+ fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd);
+ exit(-1);
+}
+
+void
+transpose64(uint64_t A[64]) {
+ int j, k;
+ uint64_t m, t;
+
+ m = 0x00000000FFFFFFFFul;
+ for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j))
+ for (k = 0; k < 64; k = (k + j + 1) & ~j) {
+ t = (A[k] ^ (A[k+j] >> j)) & m;
+ A[k] = A[k] ^ t;
+ A[k+j] = A[k+j] ^ (t << j);
+ }
+}
+
+uint64_t *
+map(char *path)
+{
+ struct stat sb;
+ int fd = open(path, O_RDONLY);
+ if(fd == -1)
+ error("open");
+ if(fstat(fd, &sb) == -1)
+ error("fstat");
+ if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){
+ fprintf(stderr, "incompatible bloom table\n");
+ exit(-1);
+ }
+ void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
+ if(addr == MAP_FAILED)
+ error("mmap");
+ return addr;
+}
+
+double *
+mapent()
+{
+ char path[PATH_MAX];
+ static char *template = "dot.XXXXXX";
+ strcpy(path, template);
+ int fd = mkstemp(path);
+ if(fd == -1)
+ error("open");
+ ssize_t sz = 64L * BLOOMSIZE;
+ sz += 63 - (sz % 64);
+ if(ftruncate(fd, sz * sizeof(double)) == -1)
+ error("ftruncate");
+ void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0);
+ if(addr == MAP_FAILED)
+ error("mmap");
+ //bzero(addr, 64L * BLOOMSIZE * sizeof(double));
+ unlink(path);
+ return addr;
+}
+
+int
+main(int argc, char **argv)
+{
+ enum {DOT, JAC, ENT} mode = DOT;
+
+ ARGBEGIN{
+ case 'h':
+ usage(argv0);
+ case 'j':
+ mode = JAC;
+ break;
+ case 'd':
+ mode = DOT;
+ break;
+ case 'e':
+ mode = ENT;
+ break;
+ default:
+ usage(argv0);
+ } ARGEND
+
+ if(argc < 2)
+ usage(argv0);
+
+
+ uint64_t *blooms[argc];
+
+ for(int i = 0; i < argc; i++)
+ blooms[i] = map(argv[i]);
+
+ if(mode == ENT){
+ double *ent = mapent();
+
+ for(int k = 0; k < BLOOMSIZE; k++)
+ for(int i = 0; i < argc; i+=64){
+ uint64_t tmp[64];
+ for(int j = 0; j < 64; j++)
+ tmp[j] = i + j < argc ? blooms[i + j][k] : 0;
+ transpose64(tmp);
+ for(int j = 0; j < 64; j++){
+ ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]);
+ }
+ }
+
+ for(int i = 0; i < 64 * BLOOMSIZE; i++){
+ double p = ent[i] / (double)argc;
+ ent[i] = p == 0 ? 0 : -p * log(p);
+ }
+ fprintf(stderr, "entropy calculated\n");
+
+ for(int i = 0; i < argc; i++)
+ for(int j = i; j < argc; j++){
+ double dot = 0;
+ for(int k = 0; k < BLOOMSIZE; k++){
+ uint64_t word = blooms[i][k] & blooms[j][k];
+ for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word))
+ dot += ent[l + 64 * k];
+ }
+ printf("%g\n", dot);
+ }
+
+ return 0;
+ }
+
+ for(int i = 0; i < argc; i++)
+ for(int j = i; j < argc; j++){
+ uint64_t dot = 0;
+ uint64_t un = 0;
+ for(int k = 0; k < BLOOMSIZE; k++){
+ dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]);
+ un += __builtin_popcountll(blooms[i][k] | blooms[j][k]);
+ }
+ switch(mode){
+ case JAC:
+ printf("%g\n", (double)dot / (double)un);
+ break;
+ default:
+ printf("%lu\n", dot);
+ break;
+ }
+ }
+
+ return 0;
+}