summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2024-03-19 16:03:38 +1100
committerJustin Bedo <cu@cua0.org>2024-03-20 16:04:24 +1100
commitafba6bce9c2e8ff77da2848f57f443088861db5d (patch)
tree9242fb357da4ff6ed9684d25ff150f4da1981360
parentb7dcf3b7452e29fc4366852fe8ff6e3fa94ae555 (diff)
add benchmarking
-rw-r--r--Dedumi.hs76
-rw-r--r--bench.hs50
-rw-r--r--dedumi.hs74
-rw-r--r--package.yaml12
4 files changed, 140 insertions, 72 deletions
diff --git a/Dedumi.hs b/Dedumi.hs
new file mode 100644
index 0000000..008597a
--- /dev/null
+++ b/Dedumi.hs
@@ -0,0 +1,76 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StandaloneDeriving #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeOperators #-}
+{-# OPTIONS_GHC -Wno-orphans #-}
+
+module Dedumi where
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import Data.Cuckoo
+import Data.FastQ
+import Data.Function
+import GHC.Prim (RealWorld)
+import GHC.TypeLits
+import Lens.Micro
+import Options.Generic
+import qualified Streamly.Data.Stream as S
+
+newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic, Show)
+
+data Options w = Options
+ { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
+ extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
+ filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
+ input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
+ input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
+ output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
+ output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
+ }
+ deriving (Generic)
+
+instance ParseFields a => ParseRecord (NoLabel a)
+
+instance ParseFields a => ParseFields (NoLabel a) where
+ parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)
+
+instance ParseRecord (Options Wrapped)
+
+instance CuckooFilterHash ByteString where
+ cuckooHash (Salt s) = saltedFnv1aByteString s
+ cuckooFingerprint (Salt s) = saltedSipHashByteString s
+ {-# INLINE cuckooHash #-}
+ {-# INLINE cuckooFingerprint #-}
+
+trim :: Int -> ReadPair -> ReadPair
+trim sz x =
+ x
+ & _1 . nucs %~ B.drop sz
+ & _2 . nucs %~ B.drop sz
+ & _1 . qual %~ B.drop sz
+ & _2 . qual %~ B.drop sz
+
+insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
+insert' sz f x =
+ let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
+ in member f y >>= \case
+ True -> pure False
+ False ->
+ insert f y >>= \case
+ True -> pure True
+ False -> error "filter full"
+
+go opts = do
+ f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)
+
+ parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
+ & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
+ & fmap (trim . fromIntegral $ umiLength opts)
+ & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)
diff --git a/bench.hs b/bench.hs
new file mode 100644
index 0000000..561eef1
--- /dev/null
+++ b/bench.hs
@@ -0,0 +1,50 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE QuasiQuotes #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Main where
+
+import Control.Monad
+import Criterion
+import Criterion.Main
+import qualified Data.ByteString.Char8 as B
+import Data.FastQ
+import Data.Function
+import qualified Streamly.Data.Fold as F
+import qualified Streamly.Data.Stream as S
+import qualified Streamly.Data.Unfold as U
+import Test.QuickCheck.Arbitrary
+import Test.QuickCheck.Gen
+import Prelude hiding (Read)
+import System.IO.Temp
+import Options.Generic
+import qualified Dedumi as D
+import Data.Maybe
+import qualified Data.Text as T
+import Debug.Trace
+
+instance Arbitrary Read where
+ arbitrary = Read <$> genStr <*> genStr <*> genStr
+ where
+ genBS = fmap B.pack . listOf1 . elements
+ genStr = genBS $ [' ' .. '~']
+
+--sizes = [2 ^ i | i <- [11 .. 19]]
+sizes = [2*8192]
+
+setupEnv root size = do
+ let fileroot = root <> "/" <> show size
+ a = fileroot <> "a"
+ b = fileroot <> "b"
+ rp <- replicateM size $ generate $ arbitrary
+ S.unfold U.fromList rp & unparse a b
+ pure (a, b)
+
+main :: IO ()
+main = withSystemTempDirectory "dedumi-bench" $ \tmpdir ->
+ defaultMain
+ [ bgroup "main" $ map (\s -> env (setupEnv tmpdir s) $ \ ~(a,b) -> bench (show s) (run (T.pack a) (T.pack b))) sizes
+ ]
+
+run fq1 fq2 = nfAppIO D.go $ fromJust $ unwrapRecordPure [fq1, fq2, "/dev/null", "/dev/null"]
diff --git a/dedumi.hs b/dedumi.hs
index 3317a53..d26d48a 100644
--- a/dedumi.hs
+++ b/dedumi.hs
@@ -1,78 +1,8 @@
-{-# LANGUAGE DataKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE FlexibleInstances #-}
-{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE StandaloneDeriving #-}
-{-# LANGUAGE TypeApplications #-}
-{-# LANGUAGE TypeFamilies #-}
-{-# LANGUAGE TypeOperators #-}
-{-# OPTIONS_GHC -Wno-orphans #-}
-
module Main where
-import Data.ByteString (ByteString)
-import qualified Data.ByteString as B
-import Data.Cuckoo
-import Data.FastQ
-import Data.Function
-import GHC.Prim (RealWorld)
-import GHC.TypeLits
-import Lens.Micro
+import Dedumi
import Options.Generic
-import qualified Streamly.Data.Stream as S
-
-newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic)
-
-data Options w = Options
- { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
- extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
- filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
- input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
- input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
- output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
- output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
- }
- deriving (Generic)
-
-instance ParseFields a => ParseRecord (NoLabel a)
-
-instance ParseFields a => ParseFields (NoLabel a) where
- parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)
-
-instance ParseRecord (Options Wrapped)
-
-instance CuckooFilterHash ByteString where
- cuckooHash (Salt s) = saltedFnv1aByteString s
- cuckooFingerprint (Salt s) = saltedSipHashByteString s
- {-# INLINE cuckooHash #-}
- {-# INLINE cuckooFingerprint #-}
-
-trim :: Int -> ReadPair -> ReadPair
-trim sz x =
- x
- & _1 . nucs %~ B.drop sz
- & _2 . nucs %~ B.drop sz
- & _1 . qual %~ B.drop sz
- & _2 . qual %~ B.drop sz
-
-insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
-insert' sz f x =
- let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
- in member f y >>= \case
- True -> pure False
- False ->
- insert f y >>= \case
- True -> pure True
- False -> error "filter full"
main :: IO ()
-main = do
- opts <- unwrapRecord "UMI deduplication"
-
- f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)
-
- parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
- & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
- & fmap (trim . fromIntegral $ umiLength opts)
- & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)
+main = unwrapRecord "UMI deduplication" >>= go
diff --git a/package.yaml b/package.yaml
index 515b9ff..56d6dc3 100644
--- a/package.yaml
+++ b/package.yaml
@@ -19,6 +19,7 @@ executables:
ghc-options: [-O2, -fdicts-strict, -fspec-constr-recursive=16, -fmax-worker-args=16, -fplugin=Fusion.Plugin, -Wall, -Wno-name-shadowing]
other-modules:
- Data.FastQ
+ - Dedumi
tests:
tests:
@@ -28,3 +29,14 @@ tests:
- temporary
other-modules:
- Data.FastQ
+
+ bench:
+ main: bench.hs
+ dependencies:
+ - QuickCheck
+ - criterion
+ - temporary
+ - text
+ other-modules:
+ - Data.FastQ
+ - Dedumi