summaryrefslogtreecommitdiff
path: root/dedumi.hs
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2024-03-19 16:03:38 +1100
committerJustin Bedo <cu@cua0.org>2024-03-20 16:04:24 +1100
commitafba6bce9c2e8ff77da2848f57f443088861db5d (patch)
tree9242fb357da4ff6ed9684d25ff150f4da1981360 /dedumi.hs
parentb7dcf3b7452e29fc4366852fe8ff6e3fa94ae555 (diff)
add benchmarking
Diffstat (limited to 'dedumi.hs')
-rw-r--r--dedumi.hs74
1 files changed, 2 insertions, 72 deletions
diff --git a/dedumi.hs b/dedumi.hs
index 3317a53..d26d48a 100644
--- a/dedumi.hs
+++ b/dedumi.hs
@@ -1,78 +1,8 @@
-{-# LANGUAGE DataKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE StandaloneDeriving #-}
-{-# LANGUAGE TypeApplications #-}
-{-# LANGUAGE TypeFamilies #-}
-{-# LANGUAGE TypeOperators #-}
-{-# OPTIONS_GHC -Wno-orphans #-}
-
module Main where
-import Data.ByteString (ByteString)
-import qualified Data.ByteString as B
-import Data.Cuckoo
-import Data.FastQ
-import Data.Function
-import GHC.Prim (RealWorld)
-import GHC.TypeLits
-import Lens.Micro
+import Dedumi
import Options.Generic
-import qualified Streamly.Data.Stream as S
-
-newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic)
-
-data Options w = Options
- { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
- extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
- filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
- input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
- input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
- output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
- output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
- }
- deriving (Generic)
-
-instance ParseFields a => ParseRecord (NoLabel a)
-
-instance ParseFields a => ParseFields (NoLabel a) where
- parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)
-
-instance ParseRecord (Options Wrapped)
-
-instance CuckooFilterHash ByteString where
- cuckooHash (Salt s) = saltedFnv1aByteString s
- cuckooFingerprint (Salt s) = saltedSipHashByteString s
- {-# INLINE cuckooHash #-}
- {-# INLINE cuckooFingerprint #-}
-
-trim :: Int -> ReadPair -> ReadPair
-trim sz x =
- x
- & _1 . nucs %~ B.drop sz
- & _2 . nucs %~ B.drop sz
- & _1 . qual %~ B.drop sz
- & _2 . qual %~ B.drop sz
-
-insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
-insert' sz f x =
- let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
- in member f y >>= \case
- True -> pure False
- False ->
- insert f y >>= \case
- True -> pure True
- False -> error "filter full"
main :: IO ()
-main = do
- opts <- unwrapRecord "UMI deduplication"
-
- f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)
-
- parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
- & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
- & fmap (trim . fromIntegral $ umiLength opts)
- & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)
+main = unwrapRecord "UMI deduplication" >>= go