summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2023-11-07 08:52:24 +1100
committerJustin Bedo <cu@cua0.org>2023-11-07 08:52:24 +1100
commita837c5e2f2d41852313943a326e38c36487c64c3 (patch)
tree3f3dbf12327d0fc23fc3462e59d0ba79019ceaee
parente17f009dc5f1750f47f4ef950b2032b4bd283e67 (diff)
allow configuration via argv
-rw-r--r--dedumi.hs66
-rw-r--r--package.yaml1
2 files changed, 43 insertions, 24 deletions
diff --git a/dedumi.hs b/dedumi.hs
index 5ca07a1..3317a53 100644
--- a/dedumi.hs
+++ b/dedumi.hs
@@ -1,8 +1,12 @@
{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE LambdaCase #-}
-{-# LANGUAGE NumericUnderscores #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -Wno-orphans #-}
module Main where
@@ -15,8 +19,28 @@ import Data.Function
import GHC.Prim (RealWorld)
import GHC.TypeLits
import Lens.Micro
+import Options.Generic
import qualified Streamly.Data.Stream as S
-import System.Environment
+
+newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic)
+
+data Options w = Options
+ { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
+ extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
+ filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
+ input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
+ input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
+ output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
+ output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
+ }
+ deriving (Generic)
+
+instance ParseFields a => ParseRecord (NoLabel a)
+
+instance ParseFields a => ParseFields (NoLabel a) where
+ parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)
+
+instance ParseRecord (Options Wrapped)
instance CuckooFilterHash ByteString where
cuckooHash (Salt s) = saltedFnv1aByteString s
@@ -24,23 +48,17 @@ instance CuckooFilterHash ByteString where
{-# INLINE cuckooHash #-}
{-# INLINE cuckooFingerprint #-}
-umiLength :: Int
-umiLength = 8
-
-extraHashBases :: Int
-extraHashBases = 4
-
-trim :: ReadPair -> ReadPair
-trim x =
+trim :: Int -> ReadPair -> ReadPair
+trim sz x =
x
- & _1 . nucs %~ B.drop umiLength
- & _2 . nucs %~ B.drop umiLength
- & _1 . qual %~ B.drop umiLength
- & _2 . qual %~ B.drop umiLength
-
-insert' :: (KnownNat b, KnownNat f) => CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
-insert' f x =
- let y = B.take (umiLength + extraHashBases) (x ^. _1 . nucs) <> B.take (umiLength + extraHashBases) (x ^. _2 . nucs)
+ & _1 . nucs %~ B.drop sz
+ & _2 . nucs %~ B.drop sz
+ & _1 . qual %~ B.drop sz
+ & _2 . qual %~ B.drop sz
+
+insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
+insert' sz f x =
+ let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
in member f y >>= \case
True -> pure False
False ->
@@ -50,11 +68,11 @@ insert' f x =
main :: IO ()
main = do
- [p1, p2, p3, p4] <- getArgs
+ opts <- unwrapRecord "UMI deduplication"
- f <- newCuckooFilter @4 @13 @ByteString 0 200_000_000
+ f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)
- parse p1 p2
- & S.filterM (insert' f)
- & fmap trim
- & unparse p3 p4
+ parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
+ & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
+ & fmap (trim . fromIntegral $ umiLength opts)
+ & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)
diff --git a/package.yaml b/package.yaml
index f0438a7..033ece3 100644
--- a/package.yaml
+++ b/package.yaml
@@ -11,6 +11,7 @@ dependencies:
- streamly-bytestring
- streamly-core
- ghc-prim
+ - optparse-generic
executables:
dedumi: