From e5a8870358713f4e44e135da913c50230d83d4f1 Mon Sep 17 00:00:00 2001
From: Justin Bedo
Date: Sun, 7 Apr 2019 21:10:25 +1000
Subject: modify outputs to ensure determinism

---
Review notes (free text between "---" and the first "diff --git" line; it is
not part of the commit message):
 * strelka-call / strelka-callSomatic: the cleaned results are copied to
   $out/variants rather than straight into $out, so that the
   ${drv}/variants/*.vcf links created in strelka.nix resolve instead of
   dangling.
 * samtools-merge: the header filter is anchored ('^@PG') so only @PG records
   are dropped, not every header line that happens to contain "@PG".
 * Leading whitespace inside the hunks was reconstructed from a copy whose
   line breaks were lost; use `git apply --ignore-whitespace` if the context
   does not match exactly.

 tools/gridss-callVariants.nix   |  3 +++
 tools/gridss-collectMetrics.nix |  6 ++++++
 tools/platypus-callVariants.nix |  3 +++
 tools/samtools-merge.nix        |  7 ++++++-
 tools/strelka-call.nix          | 13 ++++++++++++-
 tools/strelka-callSomatic.nix   | 13 ++++++++++++-
 tools/strelka.nix               | 12 ++++++------
 7 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/tools/gridss-callVariants.nix b/tools/gridss-callVariants.nix
index ffbb349..f42e3f3 100644
--- a/tools/gridss-callVariants.nix
+++ b/tools/gridss-callVariants.nix
@@ -50,6 +50,9 @@ stage rec {
       ASSEMBLY="$out/gridss.bam" \
       ${optionalString (blacklist != null) ("BLACKLIST=" + blacklist)} \
       ${optionalString (flags != null) flags}
+
+    # The VCF index is non-deterministic
+    rm $out/gridss.vcf.idx
   '';
   passthru.multicore = true;
 }
diff --git a/tools/gridss-collectMetrics.nix b/tools/gridss-collectMetrics.nix
index c62346e..2e3b3a9 100644
--- a/tools/gridss-collectMetrics.nix
+++ b/tools/gridss-collectMetrics.nix
@@ -29,5 +29,11 @@ stage rec {
       O=$out/input \
       AS=true \
       THRESHOLD_COVERAGE=${toString thresholdCoverage}
+
+    # Make the output deterministic by removing timestamps
+    sed -i '/^# Started on:/d' $out/input.*_metrics
+    if [ -e $out/input.insert_size_histogram.pdf ] ; then
+      sed -i 's/(D:[0-9]\+)/(D:19700101000000)/g' $out/input.insert_size_histogram.pdf
+    fi
   '';
 }
diff --git a/tools/platypus-callVariants.nix b/tools/platypus-callVariants.nix
index f3b3e7c..abe2a6b 100644
--- a/tools/platypus-callVariants.nix
+++ b/tools/platypus-callVariants.nix
@@ -35,6 +35,9 @@ stage {
       ${optionalString (flags != null) flags} \
       -o $out \
       --bamFiles=${concatMapStringsSep "," (p: "${filename p}.bam") inputs}
+
+    # Remove timestamps from output
+    sed -i '/^##fileDate/d' $out
   '';
   passthru.filetype = filetype.vcf {ref = ref;};
   passthru.multicore = true;
diff --git a/tools/samtools-merge.nix b/tools/samtools-merge.nix
index 120f825..9b7d133 100644
--- a/tools/samtools-merge.nix
+++ b/tools/samtools-merge.nix
@@ -19,7 +19,12 @@ stage {
   name = "samtools-merge";
   buildInputs = with pkgs; [ samtools ];
   buildCommand = ''
-    samtools merge ${optionalString (flags != null) flags} $out ${concatStringsSep " " inputs}
+    samtools merge ${optionalString (flags != null) flags} out.bam ${concatStringsSep " " inputs}
+
+    # Merge is non-deterministic with PG lines; if files have clashing PG IDs then a random
+    # suffix is appended to make it unique. PG records are stripped below to resolve the
+    # issue; the pattern is anchored so that only header lines starting with @PG are removed.
+    samtools reheader <(samtools view -H out.bam | grep -v '^@PG') out.bam > $out
   '';
   passthru.filetype = (builtins.elemAt inputs 0).filetype;
 }
diff --git a/tools/strelka-call.nix b/tools/strelka-call.nix
index 7836bbe..bc1e202 100644
--- a/tools/strelka-call.nix
+++ b/tools/strelka-call.nix
@@ -38,7 +38,18 @@ stage {
       -m local \
       -j $NIX_BUILD_CORES 2>&1
 
-    cp -r results $out
+    # Strelka writes runtime stats and timestamps; both are stripped for determinism.
+    # Output stays under $out/variants, the path that strelka.nix links into.
+    cd results/variants
+    rm *.tbi genome.vcf.gz
+    for f in *.vcf.gz; do
+      gunzip $f
+      g=$(basename $f .gz)
+      sed -i '/^##fileDate/d' $g
+      sed -i '/^##startTime/d' $g
+    done
+    mkdir -p $out/variants
+    cp -r * $out/variants
   '';
   passthru.multicore = true;
 }
diff --git a/tools/strelka-callSomatic.nix b/tools/strelka-callSomatic.nix
index 256065b..cf4b414 100644
--- a/tools/strelka-callSomatic.nix
+++ b/tools/strelka-callSomatic.nix
@@ -40,7 +40,18 @@ stage {
       -m local \
       -j $NIX_BUILD_CORES
 
-    cp -r results $out
+    # Strelka writes runtime stats and timestamps; both are stripped for determinism.
+    # Output stays under $out/variants, the path that strelka.nix links into.
+    cd results/variants
+    rm *.tbi
+    for f in *.vcf.gz; do
+      gunzip $f
+      g=$(basename $f .gz)
+      sed -i '/^##fileDate/d' $g
+      sed -i '/^##startTime/d' $g
+    done
+    mkdir -p $out/variants
+    cp -r * $out/variants
   '';
   passthru.multicore = true;
 }
diff --git a/tools/strelka.nix b/tools/strelka.nix
index 1cf8961..d3068b5 100644
--- a/tools/strelka.nix
+++ b/tools/strelka.nix
@@ -20,9 +20,9 @@ with types;
     drv: stage {
       name = "strelka-call-variants";
       buildCommand = ''
-        ln -s ${drv}/variants/variants.vcf.gz $out
+        ln -s ${drv}/variants/variants.vcf $out
       '';
-      passthru.filetype = filetype.gz (filetype.vcf {ref=ref;});
+      passthru.filetype = filetype.vcf {ref=ref;};
     };
   /* Extract indels from somatic results
   Type: indels :: somatic results -> vcf
@@ -31,8 +31,8 @@ with types;
     # result of callSomatic
     drv: stage {
       name = "strelka-callVariants-indels";
-      buildCommand = "ln -s ${drv}/variants/somatic.indels.vcf.gz $out";
-      passthru.filetype = filetype.gz (filetype.vcf {ref = ref;});
+      buildCommand = "ln -s ${drv}/variants/somatic.indels.vcf $out";
+      passthru.filetype = filetype.vcf {ref = ref;};
     };
   /* Extract SNVs from somatic results
   Type: snvs :: somatic results -> vcf
@@ -41,7 +41,7 @@ with types;
     # result of callSomatic
     drv: stage {
       name = "strelka-callVariants-snvs";
-      buildCommand = "ln -s ${drv}/variants/somatic.snvs.vcf.gz $out";
-      passthru.filetype = filetype.gz (filetype.vcf {ref = ref;});
+      buildCommand = "ln -s ${drv}/variants/somatic.snvs.vcf $out";
+      passthru.filetype = filetype.vcf {ref = ref;};
     };
 }
-- 
cgit v1.2.3