diff options
Diffstat (limited to 'Dedumi.hs')
-rw-r--r-- | Dedumi.hs | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/Dedumi.hs b/Dedumi.hs new file mode 100644 index 0000000..008597a --- /dev/null +++ b/Dedumi.hs @@ -0,0 +1,76 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE TypeOperators #-} +{-# OPTIONS_GHC -Wno-orphans #-} + +module Dedumi where + +import Data.ByteString (ByteString) +import qualified Data.ByteString as B +import Data.Cuckoo +import Data.FastQ +import Data.Function +import GHC.Prim (RealWorld) +import GHC.TypeLits +import Lens.Micro +import Options.Generic +import qualified Streamly.Data.Stream as S + +newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic, Show) + +data Options w = Options + { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8", + extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4", + filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000", + input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path", + input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path", + output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path", + output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path" + } + deriving (Generic) + +instance ParseFields a => ParseRecord (NoLabel a) + +instance ParseFields a => ParseFields (NoLabel a) where + parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def) + +instance ParseRecord (Options Wrapped) + +instance CuckooFilterHash ByteString where + cuckooHash (Salt s) = saltedFnv1aByteString s + cuckooFingerprint (Salt s) = saltedSipHashByteString s + {-# INLINE cuckooHash #-} + {-# INLINE cuckooFingerprint #-} + +trim :: Int -> ReadPair -> ReadPair +trim sz x = + x + & _1 . nucs %~ B.drop sz + & _2 . nucs %~ B.drop sz + & _1 . qual %~ B.drop sz + & _2 . qual %~ B.drop sz + +insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool +insert' sz f x = + let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs) + in member f y >>= \case + True -> pure False + False -> + insert f y >>= \case + True -> pure True + False -> error "filter full" + +go opts = do + f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts) + + parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts) + & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f) + & fmap (trim . fromIntegral $ umiLength opts) + & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts) |