summaryrefslogtreecommitdiff
path: root/dict.c
diff options
context:
space:
mode:
Diffstat (limited to 'dict.c')
-rw-r--r--dict.c172
1 files changed, 172 insertions, 0 deletions
diff --git a/dict.c b/dict.c
new file mode 100644
index 0000000..7bb0e63
--- /dev/null
+++ b/dict.c
@@ -0,0 +1,172 @@
+// Copyright © 2016 by Justin Bedő
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR
+// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES
+// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+// SOFTWARE.
+//
+#include<fcntl.h>
+#include<stdint.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<sys/mman.h>
+#include<sys/stat.h>
+#include<sys/types.h>
+#include<unistd.h>
+#include<zlib.h>
+#include"dict.h"
+
+//#define BITMIX
+
+typedef enum {
+ TINSEQ,
+ TOUTSEQ
+} TState;
+
+typedef struct {
+ gzFile f;
+ TState s;
+} tokeniser;
+
+tokeniser*
+new_tokeniser(char *path)
+{
+ tokeniser *tok = malloc(sizeof(tokeniser));
+ if(tok == NULL)
+ error("new_tokeniser");
+ tok->f = gzopen(path, "rb");
+ if(tok->f == NULL)
+ error("gzopen");
+ tok->s = TOUTSEQ;
+ return tok;
+}
+
+int
+getTok(tokeniser *tok)
+{
+ int c;
+ switch(tok->s){
+ case TOUTSEQ:
+ c = gzgetc(tok->f);
+ while(c != -1 && c != '@') c = gzgetc(tok->f);
+ while(c != -1 && c != '\n') c = gzgetc(tok->f);
+ tok->s = TINSEQ;
+ case TINSEQ:
+ c = gzgetc(tok->f);
+ while(c != -1 && c == '\n') c = gzgetc(tok->f);
+ switch(c){
+ case -1:
+ return -1;
+ case 'A':
+ case 'a':
+ return 0;
+ case 'C':
+ case 'c':
+ return 1;
+ case 'G':
+ case 'g':
+ return 2;
+ case 'T':
+ case 't':
+ return 3;
+ case '+':
+ tok->s = TOUTSEQ;
+ default:
+ return -2;
+ }
+ }
+}
+
+void
+usage(char *name)
+{
+ fprintf(stderr, "usage: %s <input fastq> <bloom output>\n", name);
+ exit(-1);
+}
+
+inline
+uint64_t bitmix(uint64_t x)
+{
+ x ^= x >> 31;
+ x *= 0x7fb5d329728ea185;
+ x ^= x >> 27;
+ x *= 0x81dadef4bc2dd44d;
+ x ^= x >> 33;
+ return x;
+}
+
+int
+main(int argc, char **argv)
+{
+ if(argc != 3)
+ usage(argv[0]);
+
+ tokeniser *t = new_tokeniser(argv[1]);
+
+ // map bloom filter
+ uint64_t *bloom;
+ if(access(argv[2], F_OK) == -1){
+ int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if(bfd == -1)
+ error("open");
+ if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1)
+ error("ftruncate");
+ bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0);
+ if(bloom == MAP_FAILED)
+ error("mmap");
+ bzero(bloom, BLOOMSIZE * sizeof(uint64_t));
+ }else{
+ struct stat sb;
+ int bfd = open(argv[2], O_RDWR);
+ if(bfd == -1)
+ error("open");
+ if(fstat(bfd, &sb) == -1)
+ error("fstat");
+ if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){
+ fprintf(stderr, "incompatible bloom table\n");
+ return -1;
+ }
+ bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0);
+ if(bloom == MAP_FAILED)
+ error("mmap");
+ }
+
+ int tok;
+ uint64_t term = 0;
+ int bits;
+
+ while((tok = getTok(t)) != -1){
+ if(tok < 0){
+ term = 0;
+ continue;
+ }
+
+ term ^= term << 3;
+ term += tok;
+ //term = (term * 16777619) ^ tok;
+
+#ifdef BITMIX
+ bits = bitmix(term);
+#else
+ bits = term;
+#endif
+ if(!bit(bloom,bits)){
+ term = 0;
+ set(bloom,bits);
+ }
+ }
+
+ int cnt = 0;
+ for(int i = 0; i < BLOOMSIZE; i++)
+ cnt += __builtin_popcountll(bloom[i]);
+ printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64);
+}