From e17f009dc5f1750f47f4ef950b2032b4bd283e67 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Fri, 18 Aug 2023 11:31:44 +1000 Subject: add tests and bugfix for small BGZip files --- Data/FastQ.hs | 38 +++++++++++++++++++++----------------- dedumi.hs | 2 -- package.yaml | 8 ++++++++ test.hs | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 19 deletions(-) create mode 100644 test.hs diff --git a/Data/FastQ.hs b/Data/FastQ.hs index 158db3a..940c875 100644 --- a/Data/FastQ.hs +++ b/Data/FastQ.hs @@ -10,12 +10,10 @@ module Data.FastQ where import Control.Monad import Control.Monad.IO.Class (MonadIO, liftIO) -import Data.Bifunctor (bimap) import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as BC -import qualified Data.ByteString.Lazy as BSL -import qualified Data.ByteString.Lazy.Internal as BSL +import Data.ByteString.Lazy.Internal (defaultChunkSize) import Data.Function import Data.Streaming.Zlib import Data.Word (Word8) @@ -43,6 +41,7 @@ makeLenses ''Read type ReadPair = (Read, Read) +gzipWindow :: WindowBits gzipWindow = WindowBits 31 parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair @@ -61,22 +60,27 @@ parse l r = pure $ Just (h, i, Nothing) step Nothing = pure Stop step (Just (h, i, Nothing)) = liftIO $ do - chunk <- B.hGet h BSL.defaultChunkSize - if chunk == B.empty + complete <- isCompleteInflate i + (frag, unused, i) <- + if complete + then do + f <- finishInflate i + u <- getUnusedInflate i + (f,u,) <$> initInflate gzipWindow + else pure ("", "", i) + if unused == "" then do - hClose h - str <- finishInflate i - pure $ Yield (SB.toArray str) Nothing + chunk <- B.hGet h defaultChunkSize + if chunk == B.empty + then do + hClose h + str <- finishInflate i + pure $ Yield (SB.toArray $ frag <> str) Nothing + else do + popper <- feedInflate i $ chunk + pure $ Yield (SB.toArray frag) (Just (h, i, Just popper)) else do - complete <- isCompleteInflate i - (frag, unused, i) <- - if complete - then do - f <- finishInflate i - u <- getUnusedInflate i - (f,u,) <$> initInflate gzipWindow - else pure ("", "", i) - popper <- feedInflate i $ unused <> chunk + popper <- feedInflate i $ unused pure $ Yield (SB.toArray frag) (Just (h, i, Just popper)) step (Just (h, i, Just popper)) = liftIO $ do popper >>= \case diff --git a/dedumi.hs b/dedumi.hs index 4215026..5ca07a1 100644 --- a/dedumi.hs +++ b/dedumi.hs @@ -15,7 +15,6 @@ import Data.Function import GHC.Prim (RealWorld) import GHC.TypeLits import Lens.Micro -import qualified Streamly.Data.Fold as F import qualified Streamly.Data.Stream as S import System.Environment @@ -58,5 +57,4 @@ main = do parse p1 p2 & S.filterM (insert' f) & fmap trim - -- & S.fold (F.drainMapM print) & unparse p3 p4 diff --git a/package.yaml b/package.yaml index bf95141..f0438a7 100644 --- a/package.yaml +++ b/package.yaml @@ -18,3 +18,11 @@ executables: ghc-options: [-O2, -fdicts-strict, -fspec-constr-recursive=16, -fmax-worker-args=16, -fplugin=Fusion.Plugin, -Wall, -Wno-name-shadowing] other-modules: - Data.FastQ + +tests: + tests: + main: test.hs + dependencies: + - QuickCheck + other-modules: + - Data.FastQ diff --git a/test.hs b/test.hs new file mode 100644 index 0000000..a8780ff --- /dev/null +++ b/test.hs @@ -0,0 +1,33 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE QuasiQuotes #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Main where + +import qualified Data.ByteString.Char8 as B +import Data.FastQ +import Data.Function +import qualified Streamly.Data.Fold as F +import qualified Streamly.Data.Stream as S +import qualified Streamly.Data.Unfold as U +import Test.QuickCheck +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Monadic +import Prelude hiding (Read) + +instance Arbitrary Read where + arbitrary = Read <$> genStr <*> genStr <*> genStr + where + genBS = fmap B.pack . listOf . elements + genStr = genBS $ [' ' .. '*'] ++ [',' .. '?'] ++ ['A' .. '~'] + +main :: IO () +main = quickCheckWith stdArgs {maxSuccess = 10000} $ \rp -> monadicIO $ do + rp' <- run $ do + S.unfold U.fromList rp & unparse "a" "b" + cnt <- B.readFile "a" + B.writeFile "a" $ cnt <> cnt + cnt <- B.readFile "b" + B.writeFile "b" $ cnt <> cnt + parse "a" "b" & S.fold F.toList + assert $ rp <> rp == rp' -- cgit v1.2.3