diff options
Diffstat (limited to 'dict.c')
-rw-r--r-- | dict.c | 172 |
1 files changed, 172 insertions, 0 deletions
@@ -0,0 +1,172 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. +// +#include<fcntl.h> +#include<stdint.h> +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<sys/mman.h> +#include<sys/stat.h> +#include<sys/types.h> +#include<unistd.h> +#include<zlib.h> +#include"dict.h" + +//#define BITMIX + +typedef enum { + TINSEQ, + TOUTSEQ +} TState; + +typedef struct { + gzFile f; + TState s; +} tokeniser; + +tokeniser* +new_tokeniser(char *path) +{ + tokeniser *tok = malloc(sizeof(tokeniser)); + if(tok == NULL) + error("new_tokeniser"); + tok->f = gzopen(path, "rb"); + if(tok->f == NULL) + error("gzopen"); + tok->s = TOUTSEQ; + return tok; +} + +int +getTok(tokeniser *tok) +{ + int c; + switch(tok->s){ + case TOUTSEQ: + c = gzgetc(tok->f); + while(c != -1 && c != '@') c = gzgetc(tok->f); + while(c != -1 && c != '\n') c = gzgetc(tok->f); + tok->s = TINSEQ; + case TINSEQ: + c = gzgetc(tok->f); + while(c != -1 && c == '\n') c = gzgetc(tok->f); + switch(c){ + case -1: + return -1; + case 'A': + case 'a': + return 0; + case 'C': + case 'c': + return 1; + case 'G': + case 'g': + return 2; + case 'T': + case 't': + return 3; + case '+': + tok->s = TOUTSEQ; + default: + return -2; + } + } +} + +void +usage(char *name) +{ + fprintf(stderr, "usage: %s <input fastq> <bloom output>\n", name); + exit(-1); +} + +inline +uint64_t bitmix(uint64_t x) +{ + x ^= x >> 31; + x *= 0x7fb5d329728ea185; + x ^= x >> 27; + x *= 0x81dadef4bc2dd44d; + x ^= x >> 33; + return x; +} + +int +main(int argc, char **argv) +{ + if(argc != 3) + usage(argv[0]); + + tokeniser *t = new_tokeniser(argv[1]); + + // map bloom filter + uint64_t *bloom; + if(access(argv[2], F_OK) == -1){ + int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if(bfd == -1) + error("open"); + if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1) + error("ftruncate"); + bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); + if(bloom == MAP_FAILED) + error("mmap"); + bzero(bloom, BLOOMSIZE * sizeof(uint64_t)); + }else{ + struct stat sb; + int bfd = open(argv[2], O_RDWR); + if(bfd == -1) + error("open"); + if(fstat(bfd, &sb) == -1) + error("fstat"); + if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ + fprintf(stderr, "incompatible bloom table\n"); + return -1; + } + bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); + if(bloom == MAP_FAILED) + error("mmap"); + } + + int tok; + uint64_t term = 0; + int bits; + + while((tok = getTok(t)) != -1){ + if(tok < 0){ + term = 0; + continue; + } + + term ^= term << 3; + term += tok; + //term = (term * 16777619) ^ tok; + +#ifdef BITMIX + bits = bitmix(term); +#else + bits = term; +#endif + if(!bit(bloom,bits)){ + term = 0; + set(bloom,bits); + } + } + + int cnt = 0; + for(int i = 0; i < BLOOMSIZE; i++) + cnt += __builtin_popcountll(bloom[i]); + printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64); +} |