blob: aa682f16cc5e8c7eb2bb2dc792d8f8a747038312 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -Wno-orphans #-}
module Main where
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Cuckoo
import Data.FastQ
import Data.Function
import GHC.Prim (RealWorld)
import GHC.TypeLits
import Lens.Micro
import qualified Streamly.Data.Stream as S
import System.Environment
-- | Orphan 'CuckooFilterHash' instance (hence @-Wno-orphans@) that lets
-- strict 'ByteString' values be used as cuckoo-filter keys. The bucket hash
-- is delegated to 'saltedFnv1aByteString' and the fingerprint to
-- 'saltedSipHashByteString', each fed the unwrapped salt.
instance CuckooFilterHash ByteString where
  {-# INLINE cuckooHash #-}
  cuckooHash (Salt salt) = saltedFnv1aByteString salt

  {-# INLINE cuckooFingerprint #-}
  cuckooFingerprint (Salt salt) = saltedSipHashByteString salt
-- | Number of leading bytes that 'trim' removes from the nucleotide and
-- quality strings of both reads; it is also one of the two summands of the
-- per-read key prefix length used by @insert'@.
--
-- NOTE(review): presumably the length, in bases, of the UMI (unique
-- molecular identifier) at the start of each read -- confirm.
umiLength :: Int
umiLength = 8
-- | Number of additional leading bytes, beyond 'umiLength', taken from each
-- read's nucleotide string when @insert'@ builds its filter key.
extraHashBases :: Int
extraHashBases = 4
-- | Drop the first 'umiLength' bytes from the nucleotide and quality strings
-- of both reads in a pair. The four updates touch disjoint fields, so their
-- order does not affect the result.
trim :: ReadPair -> ReadPair
trim =
  dropPrefix (_2 . qual)
    . dropPrefix (_1 . qual)
    . dropPrefix (_2 . nucs)
    . dropPrefix (_1 . nucs)
  where
    -- Apply the prefix drop through whichever field setter is supplied.
    dropPrefix field = over field (B.drop umiLength)
-- | Look up a read pair's key in the filter, inserting it when absent.
--
-- The key is the first @umiLength + extraHashBases@ bytes of the first
-- read's nucleotides followed by the same-length prefix of the second
-- read's nucleotides.
--
-- Returns 'True' when the key was already reported as a member, and 'False'
-- when it was absent and has just been inserted. Calls 'error' with
-- @"filter full"@ when the insertion itself is rejected.
insert' :: (KnownNat b, KnownNat f) => CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
insert' f x = do
  alreadySeen <- member f key
  if alreadySeen
    then pure True
    else do
      stored <- insert f key
      if stored
        then pure False
        else error "filter full"
  where
    prefixOf = B.take (umiLength + extraHashBases)
    key = prefixOf (x ^. _1 . nucs) <> prefixOf (x ^. _2 . nucs)
-- | Entry point. Expects exactly four command-line arguments: the two input
-- paths handed to 'parse' and the two output paths handed to 'unparse'.
--
-- Read pairs are streamed from the inputs; a pair is kept only the first time
-- its key is encountered (as judged by the cuckoo filter via @insert'@), then
-- passed through 'trim' and written out.
--
-- Raises an 'IOError' carrying a usage message when the argument count is
-- not four, instead of an opaque pattern-match failure.
main :: IO ()
main =
  getArgs >>= \case
    [p1, p2, p3, p4] -> do
      -- NOTE(review): the two value arguments are presumably the salt (0)
      -- and the filter capacity (200 million entries) -- confirm against
      -- the cuckoo package.
      f <- newCuckooFilter @4 @13 @ByteString 0 200_000_000
      parse p1 p2
        -- insert' answers True for keys that were ALREADY in the filter,
        -- while S.filterM keeps the elements whose predicate is True. The
        -- result is negated so first occurrences are kept and later
        -- duplicates are dropped (previously the reverse happened).
        & S.filterM (fmap not . insert' f)
        & fmap trim
        & unparse p3 p4
    _ -> do
      prog <- getProgName
      ioError (userError ("usage: " <> prog <> " IN1 IN2 OUT1 OUT2"))
|