diff options
author | Justin Bedo <cu@cua0.org> | 2023-08-16 22:41:22 +1000 |
---|---|---|
committer | Justin Bedo <cu@cua0.org> | 2023-08-16 22:41:22 +1000 |
commit | 2765051c4533483d11b2ea3fb4c660e74300ef01 (patch) | |
tree | fb02ee59bd6973f46ee66e4947aef24c21bab925 | |
parent | 5dffdecd5112d6c60c6ff285a8dd871086b7bc30 (diff) |
switch to streaming-commons and handle BGZip
-rw-r--r-- | Data/FastQ.hs | 52 | ||||
-rw-r--r-- | package.yaml | 2 |
2 files changed, 30 insertions, 24 deletions
diff --git a/Data/FastQ.hs b/Data/FastQ.hs index 3e2f26c..158db3a 100644 --- a/Data/FastQ.hs +++ b/Data/FastQ.hs @@ -2,12 +2,12 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE StrictData #-} {-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE TupleSections #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} module Data.FastQ where -import Codec.Zlib import Control.Monad import Control.Monad.IO.Class (MonadIO, liftIO) import Data.Bifunctor (bimap) @@ -17,6 +17,7 @@ import qualified Data.ByteString.Char8 as BC import qualified Data.ByteString.Lazy as BSL import qualified Data.ByteString.Lazy.Internal as BSL import Data.Function +import Data.Streaming.Zlib import Data.Word (Word8) import Lens.Micro import Lens.Micro.TH @@ -29,7 +30,6 @@ import Streamly.Internal.Data.Stream.Chunked as AS import Streamly.Internal.Data.Stream.StreamD.Type (Step (..)) import Streamly.Internal.Data.Unfold.Type (Unfold (..)) import System.IO -import System.IO.Unsafe (unsafeInterleaveIO) import Prelude hiding (Read) data Read = Read @@ -49,40 +49,45 @@ parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair parse l r = S.zipWith parseEntry - (S.unfold streamFile l & AS.splitOn 64 & S.drop 1) - (S.unfold streamFile r & AS.splitOn 64 & S.drop 1) + (S.unfold streamBGZFile l & AS.splitOn 64 & S.drop 1) + (S.unfold streamBGZFile r & AS.splitOn 64 & S.drop 1) where - streamFile :: MonadIO m => Unfold m FilePath (Array Word8) - streamFile = Unfold step seed + streamBGZFile :: MonadIO m => Unfold m FilePath (Array Word8) + streamBGZFile = Unfold step seed where seed path = liftIO $ do h <- openFile path ReadMode i <- initInflate gzipWindow pure $ Just (h, i, Nothing) step Nothing = pure Stop - step (Just (h, i, Nothing)) = liftIO $ step' h i - step (Just (h, i, Just popper)) = liftIO $ do - popper >>= \case - Just str -> pure $ Yield (SB.toArray str) (Just (h, i, Just popper)) - Nothing -> step' h i - - step' h i = do + step (Just (h, i, Nothing)) = liftIO $ do chunk <- B.hGet h BSL.defaultChunkSize if chunk == B.empty then do + hClose h str <- finishInflate i pure $ Yield (SB.toArray str) Nothing else do - p <- feedInflate i chunk - p >>= \case - Just str -> pure $ Yield (SB.toArray str) (Just (h, i, Just p)) - Nothing -> step' h i + complete <- isCompleteInflate i + (frag, unused, i) <- + if complete + then do + f <- finishInflate i + u <- getUnusedInflate i + (f,u,) <$> initInflate gzipWindow + else pure ("", "", i) + popper <- feedInflate i $ unused <> chunk + pure $ Yield (SB.toArray frag) (Just (h, i, Just popper)) + step (Just (h, i, Just popper)) = liftIO $ do + popper >>= \case + PRNext str -> pure $ Yield (SB.toArray str) (Just (h, i, Just popper)) + PRDone -> pure $ Yield (SB.toArray "") (Just (h, i, Nothing)) + PRError e -> error $ "parse:" <> show e parseEntry l r = - let [hdr, seq, "+", qual] = SB.fromArray l & BC.lines - [hdr', seq', "+", qual'] = SB.fromArray r & BC.lines - str = error $ "parseEntry:" <> BC.unpack str - in (Read qual seq hdr, Read qual' seq' hdr') + case (SB.fromArray l & BC.lines, SB.fromArray r & BC.lines) of + ([hdr, seq, "+", qual], [hdr', seq', "+", qual']) -> (Read qual seq hdr, Read qual' seq' hdr') + e -> error $ "parseEntry:" <> show e unparse :: FilePath -> FilePath -> Stream IO ReadPair -> IO () unparse l r str = do @@ -108,10 +113,11 @@ unparse l r str = do writePopper h p = p >>= \case - Just str -> do + PRNext str -> do B.hPut h str writePopper h p - Nothing -> pure () + PRDone -> pure () + PRError e -> error $ "parse:" <> show e unparse' :: ReadPair -> (ByteString, ByteString) unparse' read = diff --git a/package.yaml b/package.yaml index 3085a53..bf95141 100644 --- a/package.yaml +++ b/package.yaml @@ -3,7 +3,7 @@ name: dedumi dependencies: - base - fusion-plugin - - zlib-bindings + - streaming-commons - cuckoo - microlens-th - microlens |