summaryrefslogtreecommitdiff
path: root/Dedumi.hs
diff options
context:
space:
mode:
Diffstat (limited to 'Dedumi.hs')
-rw-r--r--Dedumi.hs76
1 files changed, 76 insertions, 0 deletions
diff --git a/Dedumi.hs b/Dedumi.hs
new file mode 100644
index 0000000..008597a
--- /dev/null
+++ b/Dedumi.hs
@@ -0,0 +1,76 @@
+{-# LANGUAGE DataKinds #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StandaloneDeriving #-}
+{-# LANGUAGE TypeApplications #-}
+{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE TypeOperators #-}
+{-# OPTIONS_GHC -Wno-orphans #-}
+
+module Dedumi where
+
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import Data.Cuckoo
+import Data.FastQ
+import Data.Function
+import GHC.Prim (RealWorld)
+import GHC.TypeLits
+import Lens.Micro
+import Options.Generic
+import qualified Streamly.Data.Stream as S
+
+newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic, Show)
+
+data Options w = Options
+ { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
+ extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
+ filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
+ input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
+ input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
+ output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
+ output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
+ }
+ deriving (Generic)
+
+instance ParseFields a => ParseRecord (NoLabel a)
+
+instance ParseFields a => ParseFields (NoLabel a) where
+ parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)
+
+instance ParseRecord (Options Wrapped)
+
+instance CuckooFilterHash ByteString where
+ cuckooHash (Salt s) = saltedFnv1aByteString s
+ cuckooFingerprint (Salt s) = saltedSipHashByteString s
+ {-# INLINE cuckooHash #-}
+ {-# INLINE cuckooFingerprint #-}
+
+trim :: Int -> ReadPair -> ReadPair
+trim sz x =
+ x
+ & _1 . nucs %~ B.drop sz
+ & _2 . nucs %~ B.drop sz
+ & _1 . qual %~ B.drop sz
+ & _2 . qual %~ B.drop sz
+
+insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
+insert' sz f x =
+ let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
+ in member f y >>= \case
+ True -> pure False
+ False ->
+ insert f y >>= \case
+ True -> pure True
+ False -> error "filter full"
+
+go opts = do
+ f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)
+
+ parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
+ & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
+ & fmap (trim . fromIntegral $ umiLength opts)
+ & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)