From d814e4da1720f01725caa1e82a1ffff7a675720d Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Tue, 19 Mar 2024 16:03:38 +1100 Subject: fix sloppy fastq parsing Previously chunked on @, which prevented it being used in quality scores --- Data/FastQ.hs | 30 ++++++++++++++++++++---------- package.yaml | 1 + test.hs | 21 +++++++++++++-------- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/Data/FastQ.hs b/Data/FastQ.hs index 940c875..cdf8d7c 100644 --- a/Data/FastQ.hs +++ b/Data/FastQ.hs @@ -1,5 +1,6 @@ {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE StrictData #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TupleSections #-} @@ -9,6 +10,7 @@ module Data.FastQ where import Control.Monad +import Control.Applicative import Control.Monad.IO.Class (MonadIO, liftIO) import Data.ByteString (ByteString) import qualified Data.ByteString as B @@ -24,7 +26,7 @@ import qualified Streamly.Data.Fold as F import Streamly.Data.Stream (Stream) import qualified Streamly.Data.Stream as S import qualified Streamly.External.ByteString as SB -import Streamly.Internal.Data.Stream.Chunked as AS +import qualified Streamly.Internal.Data.Stream.Chunked as AS import Streamly.Internal.Data.Stream.StreamD.Type (Step (..)) import Streamly.Internal.Data.Unfold.Type (Unfold (..)) import System.IO @@ -44,14 +46,15 @@ type ReadPair = (Read, Read) gzipWindow :: WindowBits gzipWindow = WindowBits 31 -parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair +parse :: (MonadIO m) => FilePath -> FilePath -> Stream m ReadPair parse l r = S.zipWith - parseEntry - (S.unfold streamBGZFile l & AS.splitOn 64 & S.drop 1) - (S.unfold streamBGZFile r & AS.splitOn 64 & S.drop 1) + (liftA2 (,)) + (S.unfold streamBGZFile l & AS.splitOn 10 & fmap SB.fromArray & S.foldMany parseRead) + (S.unfold streamBGZFile r & AS.splitOn 10 & fmap SB.fromArray & S.foldMany parseRead) + & S.catMaybes where - streamBGZFile :: MonadIO m => Unfold m FilePath (Array Word8) + streamBGZFile :: (MonadIO m) => Unfold m FilePath (Array Word8) streamBGZFile = Unfold step seed where seed path = liftIO $ do @@ -88,10 +91,17 @@ parse l r = PRDone -> pure $ Yield (SB.toArray "") (Just (h, i, Nothing)) PRError e -> error $ "parse:" <> show e - parseEntry l r = - case (SB.fromArray l & BC.lines, SB.fromArray r & BC.lines) of - ([hdr, seq, "+", qual], [hdr', seq', "+", qual']) -> (Read qual seq hdr, Read qual' seq' hdr') - e -> error $ "parseEntry:" <> show e + parseRead = + (liftA4 (\(64 :. hdr) seq "+" qual -> Read qual seq hdr)) + <$> F.one + <*> F.one + <*> F.one + <*> F.one + + liftA4 fn a b c d = fn <$> a <*> b <*> c <*> d + +pattern (:.) :: Word8 -> ByteString -> ByteString +pattern a :. b <- (B.uncons -> Just (a, b)) unparse :: FilePath -> FilePath -> Stream IO ReadPair -> IO () unparse l r str = do diff --git a/package.yaml b/package.yaml index 033ece3..515b9ff 100644 --- a/package.yaml +++ b/package.yaml @@ -25,5 +25,6 @@ tests: main: test.hs dependencies: - QuickCheck + - temporary other-modules: - Data.FastQ diff --git a/test.hs b/test.hs index a8780ff..b0574a1 100644 --- a/test.hs +++ b/test.hs @@ -10,6 +10,7 @@ import Data.Function import qualified Streamly.Data.Fold as F import qualified Streamly.Data.Stream as S import qualified Streamly.Data.Unfold as U +import System.IO.Temp import Test.QuickCheck import Test.QuickCheck.Arbitrary import Test.QuickCheck.Monadic @@ -19,15 +20,19 @@ instance Arbitrary Read where arbitrary = Read <$> genStr <*> genStr <*> genStr where genBS = fmap B.pack . listOf . elements - genStr = genBS $ [' ' .. '*'] ++ [',' .. '?'] ++ ['A' .. '~'] + genStr = genBS $ [' ' .. '~'] main :: IO () main = quickCheckWith stdArgs {maxSuccess = 10000} $ \rp -> monadicIO $ do - rp' <- run $ do - S.unfold U.fromList rp & unparse "a" "b" - cnt <- B.readFile "a" - B.writeFile "a" $ cnt <> cnt - cnt <- B.readFile "b" - B.writeFile "b" $ cnt <> cnt - parse "a" "b" & S.fold F.toList + rp' <- run $ withSystemTempDirectory "dedumi-test" $ \root -> do + let a = root <> "/a" + b = root <> "/b" + a' = root <> "/a2" + b' = root <> "/b2" + S.unfold U.fromList rp & unparse a b + cnt <- B.readFile a + B.writeFile a' $ cnt <> cnt + cnt <- B.readFile b + B.writeFile b' $ cnt <> cnt + parse a' b' & S.fold F.toList assert $ rp <> rp == rp' -- cgit v1.2.3