summary refs log tree commit diff
path: root/dedumi.hs
diff options
context:
space:
mode:
Diffstat (limited to 'dedumi.hs')
-rw-r--r-- dedumi.hs 54
1 files changed, 54 insertions, 0 deletions
diff --git a/dedumi.hs b/dedumi.hs
index e69de29..01d9b37 100644
--- a/dedumi.hs
+++ b/dedumi.hs
@@ -0,0 +1,54 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE NumericUnderscores #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE TypeFamilies #-}
+
+module Main where
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import Data.Cuckoo
+import Data.FastQ
+import Data.Function
+import Data.Typeable
+import Lens.Micro
+import qualified Streamly.Data.Stream as S
+import System.Environment
+import Prelude hiding (reads)
+
+-- | Lets 'ByteString' values be used as cuckoo-filter keys.
+--
+-- The salt carried by the filter is unwrapped and passed to a salted FNV-1a
+-- hash for 'cuckooHash' and to a salted SipHash for 'cuckooFingerprint'.
+-- Both methods are marked INLINE.
+--
+-- NOTE(review): neither the class nor 'ByteString' is defined in this module,
+-- so this is an orphan instance; that is harmless only while it lives in Main.
+instance CuckooFilterHash ByteString where
+  {-# INLINE cuckooHash #-}
+  cuckooHash (Salt salt) = saltedFnv1aByteString salt
+
+  {-# INLINE cuckooFingerprint #-}
+  cuckooFingerprint (Salt salt) = saltedSipHashByteString salt
+
+-- | Number of leading bytes of each read that are taken as the key prefix and
+-- later dropped from the read (presumably the UMI length in bases — confirm
+-- against the sequencing protocol).
+umiLength :: Int
+umiLength = 8
+
+-- | Remove the first 'umiLength' bytes from both the nucleotide and the
+-- quality field of each of the two reads in a record, so the two fields of a
+-- read are shortened by the same amount.
+--
+-- Fields shorter than 'umiLength' become empty, since 'B.drop' does not fail
+-- on short input.
+trim fragment =
+  fragment
+    & over (reads . _1 . nucs) dropUmi
+    & over (reads . _2 . nucs) dropUmi
+    & over (reads . _1 . qual) dropUmi
+    & over (reads . _2 . qual) dropUmi
+  where
+    -- The same prefix length is stripped from every field.
+    dropUmi = B.drop umiLength
+
+-- | Test-and-insert of a record's key into the filter @f@.
+--
+-- The key is the first 'umiLength' bytes of the first read's nucleotides
+-- followed by the first 'umiLength' bytes of the second read's nucleotides.
+--
+-- Returns 'True' when the filter already reported the key as a member, and
+-- 'False' when the key was not present and has just been inserted. If the
+-- insertion itself reports failure, this calls 'error' with @"filter full"@.
+insert' f x = do
+  alreadySeen <- member f key
+  if alreadySeen
+    then pure True
+    else do
+      stored <- insert f key
+      if stored
+        then pure False
+        else error "filter full"
+  where
+    key =
+      B.take umiLength (x ^. reads . _1 . nucs)
+        <> B.take umiLength (x ^. reads . _2 . nucs)
+
+-- | Entry point.
+--
+-- Expects exactly four command-line arguments: the first two are handed to
+-- 'parse' and the last two to 'unparse' (presumably the paired input and
+-- output FASTQ paths — confirm against Data.FastQ). Any other argument count
+-- raises an 'IOError' carrying a usage message instead of an opaque
+-- pattern-match failure.
+--
+-- Records are streamed through a cuckoo filter keyed on the leading bytes of
+-- both reads (see 'insert''); only the first record seen for each key is kept,
+-- and kept records have that prefix removed by 'trim' before being written.
+main :: IO ()
+main =
+  getArgs >>= \case
+    [p1, p2, p3, p4] -> do
+      -- Salt 0 and a requested capacity of 20 million entries; the type
+      -- applications presumably select bucket size 4 and 13-bit fingerprints
+      -- (TODO confirm against the Data.Cuckoo documentation).
+      f <- newCuckooFilter @4 @13 @ByteString 0 20_000_000
+
+      parse p1 p2
+        -- insert' answers "was this key already in the filter?", while
+        -- filterM keeps the elements whose predicate is True. Negate it so
+        -- that first occurrences are kept and repeats are dropped.
+        & S.filterM (fmap not . insert' f)
+        & fmap trim
+        & unparse p3 p4
+
+      pure ()
+    _ -> do
+      prog <- getProgName
+      ioError (userError ("usage: " <> prog <> " IN1 IN2 OUT1 OUT2"))