summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2023-08-02 14:38:45 +1000
committerJustin Bedo <cu@cua0.org>2023-08-02 14:38:45 +1000
commit172f54474f3e92ad4df5c193a8605d4333da33e1 (patch)
treeb034412aab396b5865f4c80aface622b1919ccef
initial fastq streaming
-rw-r--r--Data/FastQ.hs93
-rw-r--r--dedumi.hs0
-rw-r--r--flake.lock26
-rw-r--r--flake.nix12
-rw-r--r--package.yaml15
5 files changed, 146 insertions, 0 deletions
diff --git a/Data/FastQ.hs b/Data/FastQ.hs
new file mode 100644
index 0000000..5f33d27
--- /dev/null
+++ b/Data/FastQ.hs
@@ -0,0 +1,93 @@
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE StrictData #-}
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE ViewPatterns #-}
+
+module Data.FastQ where
+
+import Codec.Compression.GZip
+import Data.Bifunctor (bimap)
+import Control.Monad
+import Control.Monad.IO.Class (MonadIO, liftIO)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.ByteString.Lazy as BSL
+import qualified Data.ByteString.Lazy.Internal as BSL
+import Data.Function
+import Data.Void
+import Data.Word (Word8)
+import Debug.Trace
+import Lens.Micro
+import Lens.Micro.TH
+import Streamly.Data.Array (Array)
+import Streamly.Data.Fold as F
+import Streamly.Data.Stream (Stream)
+import qualified Streamly.Data.Stream as S
+import qualified Streamly.External.ByteString as SB
+import Streamly.Internal.Data.Stream.Chunked as AS
+import Streamly.Internal.Data.Stream.StreamD.Type (Step (..))
+import Streamly.Internal.Data.Unfold.Type (Unfold (..))
+import qualified Streamly.Internal.FileSystem.File as File
+import System.IO
+import System.IO.Unsafe (unsafeInterleaveIO)
+import Prelude hiding (Read, reads)
+
+data Read = Read
+ { _qual :: ByteString,
+ _nucs :: ByteString
+ }
+ deriving (Eq, Show, Ord)
+
+makeLenses ''Read
+
+data ReadPair = ReadPair
+ { _header :: ByteString,
+ _reads :: (Read, Read)
+ }
+ deriving (Eq, Show, Ord)
+
+makeLenses ''ReadPair
+
+parse :: MonadIO m => FilePath -> FilePath -> Stream m ReadPair
+parse l r =
+ S.zipWith
+ parseEntry
+ (S.unfold streamFile l & AS.splitOn 64)
+ (S.unfold streamFile r & AS.splitOn 64)
+ where
+ streamFile :: MonadIO m => Unfold m FilePath (Array Word8)
+ streamFile = Unfold step seed
+ where
+ seed = pure $ BSL.tail . decompress <$> liftIO (BSL.readFile r)
+ step (BSL.Chunk bs bl) = pure $ Yield (SB.toArray bs) bl
+ step BSL.Empty = pure Stop
+
+ parseEntry l r =
+ let [hdr, seq, "+", qual] = SB.fromArray l & BC.lines
+ [_, seq', "+", qual'] = SB.fromArray r & BC.lines
+ in ReadPair hdr (Read qual seq, Read qual' seq')
+
+unparse :: FilePath -> FilePath -> Stream IO ReadPair -> IO ()
+unparse l r str = do
+ (compress -> left, compress -> right) <- fmap unparse' str & toLazyBS
+ lh <- openFile l WriteMode
+ rh <- openFile r WriteMode
+ writeFiles lh rh left right
+ where
+ writeFiles l r (BSL.Chunk a as) (BSL.Chunk b bs) = do
+ B.hPut l a
+ B.hPut r b
+ writeFiles l r as bs
+ writeFiles l _ a BSL.Empty = BSL.hPut l a
+ writeFiles _ r BSL.Empty b = BSL.hPut r b
+
+ unparse' read =
+ ( BC.unlines [read ^. header, read ^. reads . _1 . nucs, "+", read ^. reads . _1 . qual],
+ BC.unlines [read ^. header, read ^. reads . _2 . nucs, "+", read ^. reads . _2 . qual]
+ )
+
+ toLazyBS :: Stream IO (ByteString, ByteString) -> IO (BSL.ByteString, BSL.ByteString)
+ toLazyBS =
+ S.foldrM (\(l, r) b -> bimap (BSL.chunk ("@" <> l)) (BSL.chunk ("@" <> r)) <$> unsafeInterleaveIO b) (pure (BSL.Empty, BSL.Empty))
diff --git a/dedumi.hs b/dedumi.hs
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/dedumi.hs
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..8fb24ce
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,26 @@
+{
+ "nodes": {
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1690875999,
+ "narHash": "sha256-JCkJfcZL1qs24L+Fv6OyFW4wTE6+NCg9I8nDf6npP+A=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "356c6dcdf37cfb4162f534e5dcabadddbfbd6bfa",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "nixpkgs": "nixpkgs"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..e15a476
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,12 @@
+{
+ inputs.nixpkgs.url = "github:nixos/nixpkgs";
+ outputs = {self, nixpkgs}:
+ let
+ system = "x86_64-linux";
+ pkgs = import nixpkgs {inherit system;};
+ in
+ {
+ packages.${system}.default = pkgs.haskellPackages.callCabal2nix "dedumi" ./. {};
+ devShells.${system}.default = self.packages.${system}.default.env;
+ };
+}
diff --git a/package.yaml b/package.yaml
new file mode 100644
index 0000000..b0752ee
--- /dev/null
+++ b/package.yaml
@@ -0,0 +1,15 @@
+name: dedumi
+
+dependencies:
+ - base
+ - zlib
+ - microlens-platform
+ - bytestring
+ - streamly-bytestring
+ - streamly
+ - streamly-core
+ - streamly-archive
+
+executables:
+ dedumi:
+ main: dedumi.hs