From afba6bce9c2e8ff77da2848f57f443088861db5d Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Tue, 19 Mar 2024 16:03:38 +1100 Subject: add benchmarking --- dedumi.hs | 74 ++------------------------------------------------------------- 1 file changed, 2 insertions(+), 72 deletions(-) (limited to 'dedumi.hs') diff --git a/dedumi.hs b/dedumi.hs index 3317a53..d26d48a 100644 --- a/dedumi.hs +++ b/dedumi.hs @@ -1,78 +1,8 @@ -{-# LANGUAGE DataKinds #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE StandaloneDeriving #-} -{-# LANGUAGE TypeApplications #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE TypeOperators #-} -{-# OPTIONS_GHC -Wno-orphans #-} - module Main where -import Data.ByteString (ByteString) -import qualified Data.ByteString as B -import Data.Cuckoo -import Data.FastQ -import Data.Function -import GHC.Prim (RealWorld) -import GHC.TypeLits -import Lens.Micro +import Dedumi import Options.Generic -import qualified Streamly.Data.Stream as S - -newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic) - -data Options w = Options - { umiLength :: w ::: Natural "length of UMI prefix" "8", - extraHashBases :: w ::: Natural "extra hash bases to use for location proxy" "4", - filterSize :: w ::: Natural "Cuckoo filter size" "200000000", - input1 :: w ::: NoLabel FilePath "input fastq 1 path", - input2 :: w ::: NoLabel FilePath "input fastq 2 path", - output1 :: w ::: NoLabel FilePath "output fastq 1 path", - output2 :: w ::: NoLabel FilePath "output fastq 2 path" - } - deriving (Generic) - -instance ParseFields a => ParseRecord (NoLabel a) - -instance ParseFields a => ParseFields (NoLabel a) where - parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def) - -instance ParseRecord (Options Wrapped) - -instance CuckooFilterHash ByteString where - cuckooHash (Salt s) = saltedFnv1aByteString s - cuckooFingerprint (Salt s) = saltedSipHashByteString s - {-# INLINE cuckooHash #-} - {-# INLINE cuckooFingerprint #-} - -trim :: Int -> ReadPair -> ReadPair -trim sz x = - x - & _1 . nucs %~ B.drop sz - & _2 . nucs %~ B.drop sz - & _1 . qual %~ B.drop sz - & _2 . qual %~ B.drop sz - -insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool -insert' sz f x = - let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs) - in member f y >>= \case - True -> pure False - False -> - insert f y >>= \case - True -> pure True - False -> error "filter full" main :: IO () -main = do - opts <- unwrapRecord "UMI deduplication" - - f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts) - - parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts) - & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f) - & fmap (trim . fromIntegral $ umiLength opts) - & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts) +main = unwrapRecord "UMI deduplication" >>= go -- cgit v1.2.3