summaryrefslogtreecommitdiff
path: root/dedumi.hs
blob: 01d9b373ef648eb7b1ff3ed97ea08c113179254d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}

module Main where

import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Cuckoo
import Data.FastQ
import Data.Function
import Data.Typeable
import Lens.Micro
import qualified Streamly.Data.Stream as S
import System.Environment
import Prelude hiding (reads)

-- | Hashing of strict 'ByteString' keys for the cuckoo filter.
--
-- The bucket hash is the salted FNV-1a hash and the fingerprint is the salted
-- SipHash; both receive the raw value unwrapped from the filter's 'Salt'.
--
-- NOTE(review): neither the class nor the type is defined in this module, so
-- this is an orphan instance.
instance CuckooFilterHash ByteString where
  {-# INLINE cuckooHash #-}
  cuckooHash (Salt salt) = saltedFnv1aByteString salt

  {-# INLINE cuckooFingerprint #-}
  cuckooFingerprint (Salt salt) = saltedSipHashByteString salt

-- | Number of leading bytes of each read that form its UMI (presumably
-- "unique molecular identifier" -- confirm against project docs).
--
-- This many bytes are taken from the front of each read's nucleotide string
-- to build the deduplication key, and the same number are dropped from the
-- nucleotide and quality strings when trimming.
umiLength :: Int
umiLength = 8

-- | Remove the leading 'umiLength' bytes from the nucleotide and quality
-- strings of both reads of a record.
--
-- The four updates touch distinct fields; they are applied in the order
-- nucs/1, nucs/2, qual/1, qual/2 (rightmost function in the composition runs
-- first).
trim x =
  ( (reads . _2 . qual %~ clip)
      . (reads . _1 . qual %~ clip)
      . (reads . _2 . nucs %~ clip)
      . (reads . _1 . nucs %~ clip)
  )
    x
  where
    -- Drops the UMI prefix from a single byte string.
    clip = B.drop umiLength

-- | Record a read pair's UMI key in the cuckoo filter @f@ and report whether
-- it had been seen before.
--
-- The key is the first 'umiLength' bytes of the first read's nucleotides
-- followed by the first 'umiLength' bytes of the second read's nucleotides.
--
-- Returns 'True' when the filter already reports the key as a member (nothing
-- is inserted), and 'False' when the key was new and has now been inserted.
-- Calls 'error' with @"filter full"@ if the insertion is rejected.
insert' f x = do
  alreadySeen <- member f key
  if alreadySeen
    then pure True
    else do
      stored <- insert f key
      if stored
        then pure False
        else error "filter full"
  where
    umi1 = B.take umiLength (x ^. reads . _1 . nucs)
    umi2 = B.take umiLength (x ^. reads . _2 . nucs)
    key = umi1 <> umi2

-- | Entry point: deduplicate paired reads by UMI and write the trimmed
-- survivors.
--
-- Expects exactly four command-line arguments. The first two are handed to
-- 'parse' and the last two to 'unparse' (presumably the two input and the two
-- output FASTQ paths -- confirm against "Data.FastQ"). Any other argument
-- count raises a user 'IOError' carrying a usage message.
--
-- Only the first record seen for each UMI key is kept; later records with a
-- key the filter already reports as present are dropped.
main :: IO ()
main =
  getArgs >>= \case
    [p1, p2, p3, p4] -> do
      -- NOTE(review): per the cuckoo package docs the type arguments are the
      -- bucket size and fingerprint bits, and the value arguments are the
      -- salt and the capacity -- confirm.
      f <- newCuckooFilter @4 @13 @ByteString 0 20_000_000

      parse p1 p2
        -- insert' answers "was this key already present?", while filterM
        -- keeps elements whose predicate is True; negate so that new keys
        -- are kept and repeats are discarded.
        & S.filterM (fmap not . insert' f)
        & fmap trim
        & unparse p3 p4

      pure ()
    _ -> do
      prog <- getProgName
      ioError . userError $
        "usage: " <> prog <> " <in1> <in2> <out1> <out2>"