summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2024-03-19 16:03:38 +1100
committerJustin Bedo <cu@cua0.org>2024-03-20 16:01:18 +1100
commitd814e4da1720f01725caa1e82a1ffff7a675720d (patch)
tree691f82d18b205cef8ec8212c1a9cb66d6de88d59
parenta837c5e2f2d41852313943a326e38c36487c64c3 (diff)
fix sloppy fastq parsing
Previously chunked on @, which prevented it being used in quality scores
-rw-r--r--Data/FastQ.hs30
-rw-r--r--package.yaml1
-rw-r--r--test.hs21
3 files changed, 34 insertions, 18 deletions
diff --git a/Data/FastQ.hs b/Data/FastQ.hs
index 940c875..cdf8d7c 100644
--- a/Data/FastQ.hs
+++ b/Data/FastQ.hs
@@ -1,5 +1,6 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE StrictData #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
@@ -9,6 +10,7 @@
module Data.FastQ where
import Control.Monad
+import Control.Applicative
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
@@ -24,7 +26,7 @@ import qualified Streamly.Data.Fold as F
import Streamly.Data.Stream (Stream)
import qualified Streamly.Data.Stream as S
import qualified Streamly.External.ByteString as SB
-import Streamly.Internal.Data.Stream.Chunked as AS
+import qualified Streamly.Internal.Data.Stream.Chunked as AS
import Streamly.Internal.Data.Stream.StreamD.Type (Step (..))
import Streamly.Internal.Data.Unfold.Type (Unfold (..))
import System.IO
@@ -44,14 +46,15 @@ type ReadPair = (Read, Read)
gzipWindow :: WindowBits
gzipWindow = WindowBits 31
-parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair
+parse :: (MonadIO m) => FilePath -> FilePath -> Stream m ReadPair
parse l r =
S.zipWith
- parseEntry
- (S.unfold streamBGZFile l & AS.splitOn 64 & S.drop 1)
- (S.unfold streamBGZFile r & AS.splitOn 64 & S.drop 1)
+ (liftA2 (,))
+ (S.unfold streamBGZFile l & AS.splitOn 10 & fmap SB.fromArray & S.foldMany parseRead)
+ (S.unfold streamBGZFile r & AS.splitOn 10 & fmap SB.fromArray & S.foldMany parseRead)
+ & S.catMaybes
where
- streamBGZFile :: MonadIO m => Unfold m FilePath (Array Word8)
+ streamBGZFile :: (MonadIO m) => Unfold m FilePath (Array Word8)
streamBGZFile = Unfold step seed
where
seed path = liftIO $ do
@@ -88,10 +91,17 @@ parse l r =
PRDone -> pure $ Yield (SB.toArray "") (Just (h, i, Nothing))
PRError e -> error $ "parse:" <> show e
- parseEntry l r =
- case (SB.fromArray l & BC.lines, SB.fromArray r & BC.lines) of
- ([hdr, seq, "+", qual], [hdr', seq', "+", qual']) -> (Read qual seq hdr, Read qual' seq' hdr')
- e -> error $ "parseEntry:" <> show e
+ parseRead =
+ (liftA4 (\(64 :. hdr) seq "+" qual -> Read qual seq hdr))
+ <$> F.one
+ <*> F.one
+ <*> F.one
+ <*> F.one
+
+ liftA4 fn a b c d = fn <$> a <*> b <*> c <*> d
+
+pattern (:.) :: Word8 -> ByteString -> ByteString
+pattern a :. b <- (B.uncons -> Just (a, b))
unparse :: FilePath -> FilePath -> Stream IO ReadPair -> IO ()
unparse l r str = do
diff --git a/package.yaml b/package.yaml
index 033ece3..515b9ff 100644
--- a/package.yaml
+++ b/package.yaml
@@ -25,5 +25,6 @@ tests:
main: test.hs
dependencies:
- QuickCheck
+ - temporary
other-modules:
- Data.FastQ
diff --git a/test.hs b/test.hs
index a8780ff..b0574a1 100644
--- a/test.hs
+++ b/test.hs
@@ -10,6 +10,7 @@ import Data.Function
import qualified Streamly.Data.Fold as F
import qualified Streamly.Data.Stream as S
import qualified Streamly.Data.Unfold as U
+import System.IO.Temp
import Test.QuickCheck
import Test.QuickCheck.Arbitrary
import Test.QuickCheck.Monadic
@@ -19,15 +20,19 @@ instance Arbitrary Read where
arbitrary = Read <$> genStr <*> genStr <*> genStr
where
genBS = fmap B.pack . listOf . elements
- genStr = genBS $ [' ' .. '*'] ++ [',' .. '?'] ++ ['A' .. '~']
+ genStr = genBS $ [' ' .. '~']
main :: IO ()
main = quickCheckWith stdArgs {maxSuccess = 10000} $ \rp -> monadicIO $ do
- rp' <- run $ do
- S.unfold U.fromList rp & unparse "a" "b"
- cnt <- B.readFile "a"
- B.writeFile "a" $ cnt <> cnt
- cnt <- B.readFile "b"
- B.writeFile "b" $ cnt <> cnt
- parse "a" "b" & S.fold F.toList
+ rp' <- run $ withSystemTempDirectory "dedumi-test" $ \root -> do
+ let a = root <> "/a"
+ b = root <> "/b"
+ a' = root <> "/a2"
+ b' = root <> "/b2"
+ S.unfold U.fromList rp & unparse a b
+ cnt <- B.readFile a
+ B.writeFile a' $ cnt <> cnt
+ cnt <- B.readFile b
+ B.writeFile b' $ cnt <> cnt
+ parse a' b' & S.fold F.toList
assert $ rp <> rp == rp'