// Copyright © 2016 by Justin Bedő // // Permission to use, copy, modify, and/or distribute this software for // any purpose with or without fee is hereby granted, provided that the // above copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL // WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR // BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES // OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, // WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, // ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS // SOFTWARE. // #include #include #include #include #include #include #include #include #include #include #include"dict.h" //#define BITMIX typedef enum { TINSEQ, TOUTSEQ } TState; typedef struct { gzFile f; TState s; } tokeniser; tokeniser* new_tokeniser(char *path) { tokeniser *tok = malloc(sizeof(tokeniser)); if(tok == NULL) error("new_tokeniser"); tok->f = gzopen(path, "rb"); if(tok->f == NULL) error("gzopen"); tok->s = TOUTSEQ; return tok; } // Fastq tokeniser int getTokQ(tokeniser *tok) { int c; switch(tok->s){ case TOUTSEQ: c = gzgetc(tok->f); while(c != -1 && c != '@') c = gzgetc(tok->f); while(c != -1 && c != '\n') c = gzgetc(tok->f); tok->s = TINSEQ; case TINSEQ: c = gzgetc(tok->f); while(c != -1 && c == '\n') c = gzgetc(tok->f); switch(c){ case -1: return -1; case 'A': case 'a': return 0; case 'C': case 'c': return 1; case 'G': case 'g': return 2; case 'T': case 't': return 3; case '+': tok->s = TOUTSEQ; default: return -2; } } } // Fasta tokeniser int getTokA(tokeniser *tok) { int c; switch(tok->s){ case TOUTSEQ: c = gzgetc(tok->f); while(c != -1 && c != '>') c = gzgetc(tok->f); while(c != -1 && c != '\n') c = gzgetc(tok->f); tok->s = TINSEQ; case TINSEQ: c = gzgetc(tok->f); while(c != -1 && c == '\n') c = gzgetc(tok->f); switch(c){ case -1: return -1; case 'A': case 'a': return 0; case 'C': case 'c': return 1; case 'G': case 'g': return 2; case 'T': case 't': return 3; case '>': while(c != -1 && c != '\n') c = gzgetc(tok->f); default: return -2; } } } void usage(void) { fprintf(stderr, "usage: %s [-a] \n", argv0); fprintf(stderr, "\t-a: assume input is fasta, default is fastq\n"); exit(-1); } inline uint64_t bitmix(uint64_t x) { x ^= x >> 31; x *= 0x7fb5d329728ea185; x ^= x >> 27; x *= 0x81dadef4bc2dd44d; x ^= x >> 33; return x; } int main(int argc, char **argv) { int (*getTok)(tokeniser *) = getTokQ; ARGBEGIN{ case 'a': getTok = getTokA; break; case 'h': default: usage(); }ARGEND; if(argc != 2) usage(); tokeniser *t = new_tokeniser(argv[0]); // map bloom filter uint64_t *bloom; if(access(argv[1], F_OK) == -1){ int bfd = open(argv[1], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); if(bfd == -1) error("open"); if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1) error("ftruncate"); bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); if(bloom == MAP_FAILED) error("mmap"); bzero(bloom, BLOOMSIZE * sizeof(uint64_t)); }else{ struct stat sb; int bfd = open(argv[1], O_RDWR); if(bfd == -1) error("open"); if(fstat(bfd, &sb) == -1) error("fstat"); if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ fprintf(stderr, "incompatible bloom table\n"); return -1; } bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); if(bloom == MAP_FAILED) error("mmap"); } int tok; uint64_t term = 0; int bits; while((tok = getTok(t)) != -1){ if(tok < 0){ term = 0; continue; } term ^= term << 3; term += tok; //term = (term * 16777619) ^ tok; #ifdef BITMIX bits = bitmix(term); #else bits = term; #endif if(!bit(bloom,bits)){ term = 0; set(bloom,bits); } } int cnt = 0; for(int i = 0; i < BLOOMSIZE; i++) cnt += __builtin_popcountll(bloom[i]); printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64); }