From 5fd056c15a0ea0eb28d0bc7cac84d6fa7fe0a3f3 Mon Sep 17 00:00:00 2001
From: Justin Bedo
Date: Fri, 30 Sep 2016 13:23:10 +1000
Subject: add fasta parser

---
 dict.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 9 deletions(-)

diff --git a/dict.c b/dict.c
index 7bb0e63..53e18bd 100644
--- a/dict.c
+++ b/dict.c
@@ -50,8 +50,9 @@ new_tokeniser(char *path)
 	return tok;
 }
 
+// Fastq tokeniser
 int
-getTok(tokeniser *tok)
+getTokQ(tokeniser *tok)
 {
 	int c;
 	switch(tok->s){
@@ -86,10 +87,48 @@ getTok(tokeniser *tok)
 	}
 }
 
+// Fasta tokeniser
+int
+getTokA(tokeniser *tok)
+{
+	int c;
+	switch(tok->s){
+	case TOUTSEQ:
+		c = gzgetc(tok->f);
+		while(c != -1 && c != '>') c = gzgetc(tok->f);
+		while(c != -1 && c != '\n') c = gzgetc(tok->f);
+		tok->s = TINSEQ;
+	case TINSEQ:
+		c = gzgetc(tok->f);
+		while(c != -1 && c == '\n') c = gzgetc(tok->f);
+		switch(c){
+		case -1:
+			return -1;
+		case 'A':
+		case 'a':
+			return 0;
+		case 'C':
+		case 'c':
+			return 1;
+		case 'G':
+		case 'g':
+			return 2;
+		case 'T':
+		case 't':
+			return 3;
+		case '>':
+			while(c != -1 && c != '\n') c = gzgetc(tok->f);
+		default:
+			return -2;
+		}
+	}
+}
+
 void
-usage(char *name)
+usage(void)
 {
-	fprintf(stderr, "usage: %s \n", name);
+	fprintf(stderr, "usage: %s [-a] \n", argv0);
+	fprintf(stderr, "\t-a: assume input is fasta, default is fastq\n");
 	exit(-1);
 }
 
@@ -107,15 +146,25 @@ uint64_t bitmix(uint64_t x)
 int
 main(int argc, char **argv)
 {
-	if(argc != 3)
-		usage(argv[0]);
+	int (*getTok)(tokeniser *) = getTokQ;
+	ARGBEGIN{
+	case 'a':
+		getTok = getTokA;
+		break;
+	case 'h':
+	default:
+		usage();
+	}ARGEND;
+
+	if(argc != 2)
+		usage();
 
-	tokeniser *t = new_tokeniser(argv[1]);
+	tokeniser *t = new_tokeniser(argv[0]);
 
 	// map bloom filter
 	uint64_t *bloom;
-	if(access(argv[2], F_OK) == -1){
-		int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+	if(access(argv[1], F_OK) == -1){
+		int bfd = open(argv[1], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
 		if(bfd == -1)
 			error("open");
 		if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1)
@@ -126,7 +175,7 @@ main(int argc, char **argv)
 		bzero(bloom, BLOOMSIZE * sizeof(uint64_t));
 	}else{
 		struct stat sb;
-		int bfd = open(argv[2], O_RDWR);
+		int bfd = open(argv[1], O_RDWR);
 		if(bfd == -1)
 			error("open");
 		if(fstat(bfd, &sb) == -1)
-- 
cgit v1.2.3