summaryrefslogtreecommitdiff
path: root/dedumi.hs
blob: 3317a53509c94b48cd957d501179541c87eef064 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# OPTIONS_GHC -Wno-orphans #-}

module Main where

import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Cuckoo
import Data.FastQ
import Data.Function
import GHC.Prim (RealWorld)
import GHC.TypeLits
import Lens.Micro
import Options.Generic
import qualified Streamly.Data.Stream as S

newtype NoLabel a = NoLabel {unNoLabel :: a} deriving (Generic)

data Options w = Options
  { umiLength :: w ::: Natural <?> "length of UMI prefix" <!> "8",
    extraHashBases :: w ::: Natural <?> "extra hash bases to use for location proxy" <!> "4",
    filterSize :: w ::: Natural <?> "Cuckoo filter size" <!> "200000000",
    input1 :: w ::: NoLabel FilePath <?> "input fastq 1 path",
    input2 :: w ::: NoLabel FilePath <?> "input fastq 2 path",
    output1 :: w ::: NoLabel FilePath <?> "output fastq 1 path",
    output2 :: w ::: NoLabel FilePath <?> "output fastq 2 path"
  }
  deriving (Generic)

instance ParseFields a => ParseRecord (NoLabel a)

instance ParseFields a => ParseFields (NoLabel a) where
  parseFields msg _ _ def = fmap NoLabel (parseFields msg Nothing Nothing def)

instance ParseRecord (Options Wrapped)

instance CuckooFilterHash ByteString where
  cuckooHash (Salt s) = saltedFnv1aByteString s
  cuckooFingerprint (Salt s) = saltedSipHashByteString s
  {-# INLINE cuckooHash #-}
  {-# INLINE cuckooFingerprint #-}

trim :: Int -> ReadPair -> ReadPair
trim sz x =
  x
    & _1 . nucs %~ B.drop sz
    & _2 . nucs %~ B.drop sz
    & _1 . qual %~ B.drop sz
    & _2 . qual %~ B.drop sz

insert' :: (KnownNat b, KnownNat f) => Int -> CuckooFilter RealWorld b f ByteString -> ReadPair -> IO Bool
insert' sz f x =
  let y = B.take sz (x ^. _1 . nucs) <> B.take sz (x ^. _2 . nucs)
   in member f y >>= \case
        True -> pure False
        False ->
          insert f y >>= \case
            True -> pure True
            False -> error "filter full"

main :: IO ()
main = do
  opts <- unwrapRecord "UMI deduplication"

  f <- newCuckooFilter @4 @13 @ByteString 0 (filterSize opts)

  parse (unNoLabel $ input1 opts) (unNoLabel $ input2 opts)
    & S.filterM (insert' (fromIntegral $ umiLength opts + extraHashBases opts) f)
    & fmap (trim . fromIntegral $ umiLength opts)
    & unparse (unNoLabel $ output1 opts) (unNoLabel $ output2 opts)