summaryrefslogtreecommitdiff
path: root/Data/FastQ.hs
blob: 91f0d5f1959d34b4592a7c3490ed7f251173309a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}

module Data.FastQ where

import Codec.Compression.GZip
import Control.Monad
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Bifunctor (bimap)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC
import qualified Data.ByteString.Lazy as BSL
import qualified Data.ByteString.Lazy.Internal as BSL
import Data.Function
import Data.Word (Word8)
import Lens.Micro
import Lens.Micro.TH
import Streamly.Data.Array (Array)
import Streamly.Data.Stream (Stream)
import qualified Streamly.Data.Stream as S
import qualified Streamly.External.ByteString as SB
import Streamly.Internal.Data.Stream.Chunked as AS
import Streamly.Internal.Data.Stream.StreamD.Type (Step (..))
import Streamly.Internal.Data.Unfold.Type (Unfold (..))
import System.IO
import System.IO.Unsafe (unsafeInterleaveIO)
import Prelude hiding (Read)

data Read = Read
  { _qual :: ByteString,
    _nucs :: ByteString,
    _header :: ByteString
  }
  deriving (Eq, Show, Ord)

makeLenses ''Read

type ReadPair = (Read, Read)

parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair
parse l r =
  S.zipWith
    parseEntry
    (S.unfold streamFile l & AS.splitOn 64)
    (S.unfold streamFile r & AS.splitOn 64)
  where
    streamFile :: MonadIO m => Unfold m FilePath (Array Word8)
    streamFile = Unfold step seed
      where
        seed = fmap (BSL.tail . decompress) . liftIO . BSL.readFile
        step (BSL.Chunk bs bl) = pure $ Yield (SB.toArray bs) bl
        step BSL.Empty = pure Stop

    parseEntry l r =
      let [hdr, seq, "+", qual] = SB.fromArray l & BC.lines
          [hdr', seq', "+", qual'] = SB.fromArray r & BC.lines
       in (Read qual seq hdr, Read qual' seq' hdr')

unparse :: FilePath -> FilePath -> Stream IO ReadPair -> IO ()
unparse l r str = do
  (compress' -> left, compress' -> right) <- fmap unparse' str & toLazyBS
  lh <- openFile l WriteMode
  rh <- openFile r WriteMode
  writeFiles lh rh left right
  where
    writeFiles l r (BSL.Chunk a as) (BSL.Chunk b bs) = do
      B.hPut l a
      B.hPut r b
      writeFiles l r as bs
    writeFiles l _ a BSL.Empty = BSL.hPut l a
    writeFiles _ r BSL.Empty b = BSL.hPut r b

    unparse' :: ReadPair -> (ByteString, ByteString)
    unparse' read =
      ( BC.unlines [read ^. _1 . header, read ^. _1 . nucs, "+", read ^. _1 . qual],
        BC.unlines [read ^. _2 . header, read ^. _2 . nucs, "+", read ^. _2 . qual]
      )

    toLazyBS :: Stream IO (ByteString, ByteString) -> IO (BSL.ByteString, BSL.ByteString)
    toLazyBS =
      S.foldrM (\(l, r) b -> bimap (BSL.chunk ("@" <> l)) (BSL.chunk ("@" <> r)) <$> unsafeInterleaveIO b) (pure (BSL.Empty, BSL.Empty))

    compress' = compressWith (defaultCompressParams {compressLevel = bestSpeed})