aboutsummaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2019-04-07 21:10:25 +1000
committerJustin Bedo <cu@cua0.org>2019-04-07 21:10:25 +1000
commite5a8870358713f4e44e135da913c50230d83d4f1 (patch)
tree9678b00f5b683f0d3df31c11ff2c96c0a597cb0f /tools
parente97bc0b22b9f49d95d6b449b2b20e7222b42622d (diff)
modify outputs to ensure determinism
Diffstat (limited to 'tools')
-rw-r--r--tools/gridss-callVariants.nix3
-rw-r--r--tools/gridss-collectMetrics.nix6
-rw-r--r--tools/platypus-callVariants.nix3
-rw-r--r--tools/samtools-merge.nix7
-rw-r--r--tools/strelka-call.nix13
-rw-r--r--tools/strelka-callSomatic.nix13
-rw-r--r--tools/strelka.nix12
7 files changed, 48 insertions, 9 deletions
diff --git a/tools/gridss-callVariants.nix b/tools/gridss-callVariants.nix
index ffbb349..f42e3f3 100644
--- a/tools/gridss-callVariants.nix
+++ b/tools/gridss-callVariants.nix
@@ -50,6 +50,9 @@ stage rec {
ASSEMBLY="$out/gridss.bam" \
${optionalString (blacklist != null) ("BLACKLIST=" + blacklist)} \
${optionalString (flags != null) flags}
+
+ # The VCF index is non-deterministic
+ rm $out/gridss.vcf.idx
'';
passthru.multicore = true;
}
diff --git a/tools/gridss-collectMetrics.nix b/tools/gridss-collectMetrics.nix
index c62346e..2e3b3a9 100644
--- a/tools/gridss-collectMetrics.nix
+++ b/tools/gridss-collectMetrics.nix
@@ -29,5 +29,11 @@ stage rec {
O=$out/input \
AS=true \
THRESHOLD_COVERAGE=${toString thresholdCoverage}
+
+ # Make the output deterministic by removing timestamps
+ sed -i '/^# Started on:/d' $out/input.*_metrics
+ if [ -e $out/input.insert_size_histogram.pdf ] ; then
+ sed -i 's/(D:[0-9]\+)/(D:19700101000000)/g' $out/input.insert_size_histogram.pdf
+ fi
'';
}
diff --git a/tools/platypus-callVariants.nix b/tools/platypus-callVariants.nix
index f3b3e7c..abe2a6b 100644
--- a/tools/platypus-callVariants.nix
+++ b/tools/platypus-callVariants.nix
@@ -35,6 +35,9 @@ stage {
${optionalString (flags != null) flags} \
-o $out \
--bamFiles=${concatMapStringsSep "," (p: "${filename p}.bam") inputs}
+
+ # Remove timestamps from output
+ sed -i '/^##fileDate/d' $out
'';
passthru.filetype = filetype.vcf {ref = ref;};
passthru.multicore = true;
diff --git a/tools/samtools-merge.nix b/tools/samtools-merge.nix
index 120f825..9b7d133 100644
--- a/tools/samtools-merge.nix
+++ b/tools/samtools-merge.nix
@@ -19,7 +19,12 @@ stage {
name = "samtools-merge";
buildInputs = with pkgs; [ samtools ];
buildCommand = ''
- samtools merge ${optionalString (flags != null) flags} $out ${concatStringsSep " " inputs}
+ samtools merge ${optionalString (flags != null) flags} out.bam ${concatStringsSep " " inputs}
+
+ # Merge is non-deterministic with PG lines: if the inputs have clashing PG IDs, a random
+ # suffix is appended to make them unique. Strip the @PG header records (anchored, so other
+ # records that merely contain "@PG" are kept) to make the output reproducible.
+ samtools reheader <(samtools view -H out.bam | grep -v '^@PG') out.bam > $out
'';
passthru.filetype = (builtins.elemAt inputs 0).filetype;
}
diff --git a/tools/strelka-call.nix b/tools/strelka-call.nix
index 7836bbe..bc1e202 100644
--- a/tools/strelka-call.nix
+++ b/tools/strelka-call.nix
@@ -38,7 +38,18 @@ stage {
-m local \
-j $NIX_BUILD_CORES 2>&1
- cp -r results $out
+ # Strelka writes runtime stats and timestamps;
+ # both have to be stripped to provide determinism
+ cd results/variants
+ rm *.tbi genome.vcf.gz
+ for f in *.vcf.gz; do
+ gunzip $f
+ g=$(basename $f .gz)
+ sed -i '/^##fileDate/d' $g
+ sed -i '/^##startTime/d' $g
+ done
+ mkdir $out
+ cp -r * $out
'';
passthru.multicore = true;
}
diff --git a/tools/strelka-callSomatic.nix b/tools/strelka-callSomatic.nix
index 256065b..cf4b414 100644
--- a/tools/strelka-callSomatic.nix
+++ b/tools/strelka-callSomatic.nix
@@ -40,7 +40,18 @@ stage {
-m local \
-j $NIX_BUILD_CORES
- cp -r results $out
+ # Strelka writes runtime stats and timestamps;
+ # both have to be stripped to provide determinism
+ cd results/variants
+ rm *.tbi
+ for f in *.vcf.gz; do
+ gunzip $f
+ g=$(basename $f .gz)
+ sed -i '/^##fileDate/d' $g
+ sed -i '/^##startTime/d' $g
+ done
+ mkdir $out
+ cp -r * $out
'';
passthru.multicore = true;
}
diff --git a/tools/strelka.nix b/tools/strelka.nix
index 1cf8961..d3068b5 100644
--- a/tools/strelka.nix
+++ b/tools/strelka.nix
@@ -20,9 +20,9 @@ with types;
drv: stage {
name = "strelka-call-variants";
buildCommand = ''
- ln -s ${drv}/variants/variants.vcf.gz $out
+ ln -s ${drv}/variants.vcf $out
'';
- passthru.filetype = filetype.gz (filetype.vcf {ref=ref;});
+ passthru.filetype = filetype.vcf {ref=ref;};
};
/* Extract indels from somatic results
Type: indels :: somatic results -> vcf
@@ -31,8 +31,8 @@ with types;
# result of callSomatic
drv: stage {
name = "strelka-callVariants-indels";
- buildCommand = "ln -s ${drv}/variants/somatic.indels.vcf.gz $out";
- passthru.filetype = filetype.gz (filetype.vcf {ref = ref;});
+ buildCommand = "ln -s ${drv}/somatic.indels.vcf $out";
+ passthru.filetype = filetype.vcf {ref = ref;};
};
/* Extract SNVs from somatic results
Type: snvs :: somatic results -> vcf
@@ -41,7 +41,7 @@ with types;
# result of callSomatic
drv: stage {
name = "strelka-callVariants-snvs";
- buildCommand = "ln -s ${drv}/variants/somatic.snvs.vcf.gz $out";
- passthru.filetype = filetype.gz (filetype.vcf {ref = ref;});
+ buildCommand = "ln -s ${drv}/somatic.snvs.vcf $out";
+ passthru.filetype = filetype.vcf {ref = ref;};
};
}