diff options
author | Justin Bedo <cu@cua0.org> | 2024-03-19 16:03:38 +1100 |
---|---|---|
committer | Justin Bedo <cu@cua0.org> | 2024-03-20 16:04:24 +1100 |
commit | afba6bce9c2e8ff77da2848f57f443088861db5d (patch) | |
tree | 9242fb357da4ff6ed9684d25ff150f4da1981360 | |
parent | b7dcf3b7452e29fc4366852fe8ff6e3fa94ae555 (diff) |
add benchmarking
-rw-r--r-- | Dedumi.hs | 76 | ||||
-rw-r--r-- | bench.hs | 50 | ||||
-rw-r--r-- | dedumi.hs | 74 | ||||
-rw-r--r-- | package.yaml | 12 |
4 files changed, 140 insertions, 72 deletions
diff --git a/Dedumi.hs b/Dedumi.hs new file mode 100644 index 0000000..008597a --- /dev/null +++ b/Dedumi.hs @@ -0,0 +1,76 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeOperators #-} +{-# OPTIONS_GHC -Wno-orphans #-} + +module Dedumi where + +import Data.ByteString (ByteString) +import qualified Data.ByteString as B +import Data.Cuckoo +import Data.FastQ +import Data.Function +import GHC.Prim (RealWorld) +import GHC.TypeLits +import Lens.Micro +import Options.Generic +import qualified Streamly.Data.Stream as S + +newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic, Show) + +data Options w = Options + { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8", + extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4", + filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000", + input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path", + input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path", + output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path", + output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path" + } + deriving (Generic) + +instance ParseFields a => ParseRecord (NoLabel a) + +instance ParseFields a => ParseFields (NoLabel a) where + parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def) + +instance ParseRecord (Options Wrapped) + +instance CuckooFilterHash ByteString where + cuckooHash (Salt s) = saltedFnv1aByteString s + cuckooFingerprint (Salt s) = saltedSipHashByteString s + {-# INLINE cuckooHash #-} + {-# INLINE cuckooFingerprint #-} + +trim :: Int -> ReadPair -> ReadPair +trim sz x = + x + & _1 . nucs %~ B.drop sz + & _2 . nucs %~ B.drop sz + & _1 . qual %~ B.drop sz + & _2 . qual %~ B.drop sz + +insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool +insert' sz f x = + let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs) + in member f y >>= \case + True -> pure False + False -> + insert f y >>= \case + True -> pure True + False -> error "filter full" + +go opts = do + f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts) + + parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts) + & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f) + & fmap (trim . fromIntegral $ umiLength opts) + & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts) diff --git a/bench.hs b/bench.hs new file mode 100644 index 0000000..561eef1 --- /dev/null +++ b/bench.hs @@ -0,0 +1,50 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Main where + +import Control.Monad +import Criterion +import Criterion.Main +import qualified Data.ByteString.Char8 as B +import Data.FastQ +import Data.Function +import qualified Streamly.Data.Fold as F +import qualified Streamly.Data.Stream as S +import qualified Streamly.Data.Unfold as U +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Gen +import Prelude hiding (Read) +import System.IO.Temp +import Options.Generic +import qualified Dedumi as D +import Data.Maybe +import qualified Data.Text as T +import Debug.Trace + +instance Arbitrary Read where + arbitrary = Read <$> genStr <*> genStr <*> genStr + where + genBS = fmap B.pack . listOf1 . elements + genStr = genBS $ [' ' .. '~'] + +--sizes = [2 ^ i | i <- [11 .. 19]] +sizes = [2*8192] + +setupEnv root size = do + let fileroot = root <> "/" <> show size + a = fileroot <> "a" + b = fileroot <> "b" + rp <- replicateM size $ generate $ arbitrary + S.unfold U.fromList rp & unparse a b + pure (a, b) + +main :: IO () +main = withSystemTempDirectory "dedumi-bench" $ \tmpdir -> + defaultMain + [ bgroup "main" $ map (\s -> env (setupEnv tmpdir s) $ \ ~(a,b) -> bench (show s) (run (T.pack a) (T.pack b))) sizes + ] + +run fq1 fq2 = nfAppIO D.go $ fromJust $ unwrapRecordPure [fq1, fq2, "/dev/null", "/dev/null"] @@ -1,78 +1,8 @@ -{-# LANGUAGE DataKinds #-} -{-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE StandaloneDeriving #-} -{-# LANGUAGE TypeApplications #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE TypeOperators #-} -{-# OPTIONS_GHC -Wno-orphans #-} - module Main where -import Data.ByteString (ByteString) -import qualified Data.ByteString as B -import Data.Cuckoo -import Data.FastQ -import Data.Function -import GHC.Prim (RealWorld) -import GHC.TypeLits -import Lens.Micro +import Dedumi import Options.Generic -import qualified Streamly.Data.Stream as S - -newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic) - -data Options w = Options - { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8", - extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4", - filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000", - input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path", - input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path", - output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path", - output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path" - } - deriving (Generic) - -instance ParseFields a => ParseRecord (NoLabel a) - -instance ParseFields a => ParseFields (NoLabel a) where - parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def) - -instance ParseRecord (Options Wrapped) - -instance CuckooFilterHash ByteString where - cuckooHash (Salt s) = saltedFnv1aByteString s - cuckooFingerprint (Salt s) = saltedSipHashByteString s - {-# INLINE cuckooHash #-} - {-# INLINE cuckooFingerprint #-} - -trim :: Int -> ReadPair -> ReadPair -trim sz x = - x - & _1 . nucs %~ B.drop sz - & _2 . nucs %~ B.drop sz - & _1 . qual %~ B.drop sz - & _2 . qual %~ B.drop sz - -insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool -insert' sz f x = - let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs) - in member f y >>= \case - True -> pure False - False -> - insert f y >>= \case - True -> pure True - False -> error "filter full" main :: IO () -main = do - opts <- unwrapRecord "UMI deduplication" - - f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts) - - parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts) - & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f) - & fmap (trim . fromIntegral $ umiLength opts) - & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts) +main = unwrapRecord "UMI deduplication" >>= go diff --git a/package.yaml b/package.yaml index 515b9ff..56d6dc3 100644 --- a/package.yaml +++ b/package.yaml @@ -19,6 +19,7 @@ executables: ghc-options: [-O2, -fdicts-strict, -fspec-constr-recursive=16, -fmax-worker-args=16, -fplugin=Fusion.Plugin, -Wall, -Wno-name-shadowing] other-modules: - Data.FastQ + - Dedumi tests: tests: @@ -28,3 +29,14 @@ tests: - temporary other-modules: - Data.FastQ + + bench: + main: bench.hs + dependencies: + - QuickCheck + - criterion + - temporary + - text + other-modules: + - Data.FastQ + - Dedumi |