diff options
| author | Justin Bedo <cu@cua0.org> | 2016-08-03 01:39:24 +0000 | 
|---|---|---|
| committer | Justin Bedo <cu@cua0.org> | 2016-08-03 01:56:42 +0000 | 
| commit | e0ddb4b25a399d1769165806d5919c6af8c173cf (patch) | |
| tree | be7c26a1d643ad266b1a8797d1e06919cdec4634 | |
initial implementation
| -rw-r--r-- | CMakeLists.txt | 8 | ||||
| -rw-r--r-- | LICENSE | 13 | ||||
| -rw-r--r-- | dict.c | 172 | ||||
| -rw-r--r-- | dict.h | 54 | ||||
| -rw-r--r-- | dot.c | 173 | 
5 files changed, 420 insertions, 0 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..f374de4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.4) +project(genodict) +find_package(ZLIB) +add_executable(dict dict.c dict.h) +add_executable(dot dot.c dict.h) +target_link_libraries(dict ${ZLIB_LIBRARIES}) +target_link_libraries(dot m) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -O3") @@ -0,0 +1,13 @@ +Copyright © 2016 by Justin Bedő + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR +IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. @@ -0,0 +1,172 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. +// +#include<fcntl.h> +#include<stdint.h> +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<sys/mman.h> +#include<sys/stat.h> +#include<sys/types.h> +#include<unistd.h> +#include<zlib.h> +#include"dict.h" + +//#define BITMIX + +typedef enum { +  TINSEQ, +  TOUTSEQ +} TState; + +typedef struct { +  gzFile f; +  TState s; +} tokeniser; + +tokeniser* +new_tokeniser(char *path) +{ +  tokeniser *tok = malloc(sizeof(tokeniser)); +  if(tok == NULL) +    error("new_tokeniser"); +  tok->f = gzopen(path, "rb"); +  if(tok->f == NULL) +    error("gzopen"); +  tok->s = TOUTSEQ; +  return tok; +} + +int +getTok(tokeniser *tok) +{ +  int c; +  switch(tok->s){ +    case TOUTSEQ: +      c = gzgetc(tok->f); +      while(c != -1 && c != '@') c = gzgetc(tok->f); +      while(c != -1 && c != '\n') c = gzgetc(tok->f); +      tok->s = TINSEQ; +    case TINSEQ: +      c = gzgetc(tok->f); +      while(c != -1 && c == '\n') c = gzgetc(tok->f); +      switch(c){ +        case -1: +          return -1; +        case 'A': +        case 'a': +          return 0; +        case 'C': +        case 'c': +          return 1; +        case 'G': +        case 'g': +          return 2; +        case 'T': +        case 't': +          return 3; +        case '+': +          tok->s = TOUTSEQ; +        default: +          return -2; +      } +  } +} + +void +usage(char *name) +{ +  fprintf(stderr, "usage: %s <input fastq> <bloom output>\n", name); +  exit(-1); +} + +inline +uint64_t bitmix(uint64_t x) +{ +  x ^= x >> 31; +  x *= 0x7fb5d329728ea185; +  x ^= x >> 27; +  x *= 0x81dadef4bc2dd44d; +  x ^= x >> 33; +  return x; +} + +int +main(int argc, char **argv) +{ +  if(argc != 3) +    usage(argv[0]); + +  tokeniser *t = new_tokeniser(argv[1]); + +  // map bloom filter +  uint64_t *bloom; +  if(access(argv[2], F_OK) == -1){ +    int bfd = open(argv[2], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); +    if(bfd == -1) +      error("open"); +    if(ftruncate(bfd, BLOOMSIZE * sizeof(uint64_t)) == -1) +      error("ftruncate"); +    bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); +    if(bloom == MAP_FAILED) +      error("mmap"); +    bzero(bloom, BLOOMSIZE * sizeof(uint64_t)); +  }else{ +    struct stat sb; +    int bfd = open(argv[2], O_RDWR); +    if(bfd == -1) +      error("open"); +    if(fstat(bfd, &sb) == -1) +      error("fstat"); +    if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ +      fprintf(stderr, "incompatible bloom table\n"); +      return -1; +    } +    bloom = (uint64_t *)mmap(NULL, BLOOMSIZE * sizeof(uint64_t), PROT_WRITE, MAP_SHARED, bfd, 0); +    if(bloom == MAP_FAILED) +      error("mmap"); +  } + +  int tok; +  uint64_t term = 0; +  int bits; + +  while((tok = getTok(t)) != -1){ +    if(tok < 0){ +      term = 0; +      continue; +    } + +    term ^= term << 3; +    term += tok; +    //term = (term * 16777619) ^ tok; + +#ifdef BITMIX +    bits = bitmix(term); +#else +    bits = term; +#endif +    if(!bit(bloom,bits)){ +      term = 0; +      set(bloom,bits); +    } +  } + +  int cnt = 0; +  for(int i = 0; i < BLOOMSIZE; i++) +    cnt += __builtin_popcountll(bloom[i]); +  printf("%d bits out of %d set\n", cnt, BLOOMSIZE * 64); +} @@ -0,0 +1,54 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. + +#ifndef __dict_h +#define __dict_h + +#define error(msg) \ +  do { perror(msg); exit(EXIT_FAILURE); } while (0) + +#define BLOOMDEG 23 +#define BLOOMSIZE (1 << BLOOMDEG) +#define bit(bf, i) \ +  ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] &  (1UL << ((i) & 63))) +#define set(bf, i) \ +  ((bf)[((i) >> 6) & (BLOOMSIZE - 1)] |= (1UL << ((i) & 63))) + +// Command line args +char *argv0; +#define SET(x) ((x) = 0) +#define USED(x) ((void)(x)) +#define ARGBEGIN  for((argv0?0:(argv0=*argv)),argv++,argc--;\ +          argv[0] && argv[0][0]=='-' && argv[0][1];\ +          argc--, argv++) {\ +        char *_args, *_argt;\ +        char _argc;\ +        _args = &argv[0][1];\ +        if(_args[0]=='-' && _args[1]==0){\ +          argc--; argv++; break;\ +        }\ +        _argc = 0;\ +        while(*_args && (_args += (_argc = *_args, 1)))\ +        switch(_argc) +#define ARGEND    SET(_argt);USED(_argt);USED(_argc);USED(_args);}USED(argv);USED(argc); +#define ARGF()    (_argt=_args, _args="",\ +        (*_argt? _argt: argv[1]? (argc--, *++argv): 0)) +#define EARGF(x)  (_argt=_args, _args="",\ +        (*_argt? _argt: argv[1]? (argc--, *++argv): ((x), abort(), (char*)0))) + +#define ARGC()    _argc + + +#endif @@ -0,0 +1,173 @@ +// Copyright © 2016 by Justin Bedő +// +// Permission to use, copy, modify, and/or distribute this software for +// any purpose with or without fee is hereby granted, provided that the +// above copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR +// BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES +// OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +// WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +// ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +// SOFTWARE. + +#include<math.h> +#include<fcntl.h> +#include<stdint.h> +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<sys/mman.h> +#include<sys/stat.h> +#include<sys/types.h> +#include<unistd.h> +#include<zlib.h> +#include"dict.h" + +void +usage(char *cmd) +{ +  fprintf(stderr, "usage: %s [-dje] dict1 dict2 ...\n", cmd); +  exit(-1); +} + +void +transpose64(uint64_t A[64]) { +  int j, k; +  uint64_t m, t; + +  m = 0x00000000FFFFFFFFul; +  for (j = 32; j != 0; j = j >> 1, m = m ^ (m << j)) +    for (k = 0; k < 64; k = (k + j + 1) & ~j) { +      t = (A[k] ^ (A[k+j] >> j)) & m; +      A[k] = A[k] ^ t; +      A[k+j] = A[k+j] ^ (t << j); +    } +} + +uint64_t * +map(char *path) +{ +  struct stat sb; +  int fd = open(path, O_RDONLY); +  if(fd == -1) +    error("open"); +  if(fstat(fd, &sb) == -1) +    error("fstat"); +  if(sb.st_size != BLOOMSIZE * sizeof(uint64_t)){ +    fprintf(stderr, "incompatible bloom table\n"); +    exit(-1); +  } +  void *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); +  if(addr == MAP_FAILED) +    error("mmap"); +  return addr; +} + +double * +mapent() +{ +  char path[PATH_MAX]; +  static char *template = "dot.XXXXXX"; +  strcpy(path, template); +  int fd = mkstemp(path); +  if(fd == -1) +    error("open"); +  ssize_t sz = 64L * BLOOMSIZE; +  sz += 63 - (sz % 64); +  if(ftruncate(fd, sz * sizeof(double)) == -1) +    error("ftruncate"); +  void *addr = mmap(NULL, 64L * BLOOMSIZE * sizeof(double), PROT_WRITE, MAP_SHARED, fd, 0); +  if(addr == MAP_FAILED) +    error("mmap"); +  //bzero(addr, 64L * BLOOMSIZE * sizeof(double)); +  unlink(path); +  return addr; +} + +int +main(int argc, char **argv) +{ +  enum {DOT, JAC, ENT} mode = DOT; + +  ARGBEGIN{ +    case 'h': +      usage(argv0); +    case 'j': +      mode = JAC; +      break; +    case 'd': +      mode = DOT; +      break; +    case 'e': +      mode = ENT; +      break; +    default: +      usage(argv0); +  } ARGEND + +  if(argc < 2) +    usage(argv0); + + +  uint64_t *blooms[argc]; + +  for(int i = 0; i < argc; i++) +    blooms[i] = map(argv[i]); + +  if(mode == ENT){ +    double *ent = mapent(); + +    for(int k = 0; k < BLOOMSIZE; k++) +      for(int i = 0; i < argc; i+=64){ +        uint64_t tmp[64]; +        for(int j = 0; j < 64; j++) +          tmp[j] = i + j < argc ? blooms[i + j][k] : 0; +        transpose64(tmp); +        for(int j = 0; j < 64; j++){ +          ent[j + 64 * k] += (double)__builtin_popcountll(tmp[j]); +        } +      } + +    for(int i = 0; i < 64 * BLOOMSIZE; i++){ +      double p = ent[i] / (double)argc; +      ent[i] = p == 0 ? 0 : -p * log(p); +    } +    fprintf(stderr, "entropy calculated\n"); + +    for(int i = 0; i < argc; i++) +      for(int j = i; j < argc; j++){ +        double dot = 0; +        for(int k = 0; k < BLOOMSIZE; k++){ +          uint64_t word = blooms[i][k] & blooms[j][k]; +          for(int l = __builtin_ctzll(word); word; word ^= 1L << l, l = __builtin_ctzll(word)) +            dot += ent[l + 64 * k]; +        } +        printf("%g\n", dot); +      } + +    return 0; +  } + +  for(int i = 0; i < argc; i++) +    for(int j = i; j < argc; j++){ +      uint64_t dot = 0; +      uint64_t un = 0; +      for(int k = 0; k < BLOOMSIZE; k++){ +        dot += __builtin_popcountll(blooms[i][k] & blooms[j][k]); +        un += __builtin_popcountll(blooms[i][k] | blooms[j][k]); +      } +      switch(mode){ +        case JAC: +          printf("%g\n", (double)dot / (double)un); +          break; +        default: +          printf("%lu\n", dot); +          break; +      } +    } + +  return 0; +} | 
