From e7cd661d1c5fb4135e3d436e151294e26aef9127 Mon Sep 17 00:00:00 2001
From: Justin Bedo
Date: Mon, 29 Oct 2018 15:33:53 +1100
Subject: Split gridss into constituents

Wrap each individual command for GRIDSS so that bionix executes the pipeline
rather than GRIDSS. This patch introduces a "call" function that executes the
whole pipeline in bionix on an arbitrary BAM file.

Resolves #10.
---
 tools/gridss-annotateVariants.nix      | 75 ++++++++++++++++++++++++++++++++++
 tools/gridss-assemble.nix              | 58 ++++++++++++++++++++++++++
 tools/gridss-callVariants.nix          |  6 +--
 tools/gridss-collectGridssMetrics.nix  | 32 +++++++++++++++
 tools/gridss-collectMetrics.nix        | 30 ++++++++++++++
 tools/gridss-computeSamTags.nix        | 42 +++++++++++++++++++
 tools/gridss-extractSVReads.nix        | 40 ++++++++++++++++++
 tools/gridss-identifyVariants.nix      | 72 ++++++++++++++++++++++++++++++++
 tools/gridss-softClipsToSplitReads.nix | 43 +++++++++++++++++++
 tools/gridss.nix                       | 16 +++++++-
 10 files changed, 408 insertions(+), 6 deletions(-)
 create mode 100644 tools/gridss-annotateVariants.nix
 create mode 100644 tools/gridss-assemble.nix
 create mode 100644 tools/gridss-collectGridssMetrics.nix
 create mode 100644 tools/gridss-collectMetrics.nix
 create mode 100644 tools/gridss-computeSamTags.nix
 create mode 100644 tools/gridss-extractSVReads.nix
 create mode 100644 tools/gridss-identifyVariants.nix
 create mode 100644 tools/gridss-softClipsToSplitReads.nix

(limited to 'tools')

diff --git a/tools/gridss-annotateVariants.nix b/tools/gridss-annotateVariants.nix
new file mode 100644
index 0000000..4f66c6c
--- /dev/null
+++ b/tools/gridss-annotateVariants.nix
@@ -0,0 +1,75 @@
+{ bionix
+, nixpkgs
+, bwaIndexAttrs ? {}
+, faidxAttrs ? {}
+, assemblyAttrs ? {}
+, extractSVReadsAttrs ? {}
+, collectMetricsAttrs ? {}
+, softClipsToSplitReadsAttrs ? {}
+, identifyVariantsAttrs ? {}
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.AnnotateVariants over a list of BAMs (`inputs`) and yields the annotated VCF as $out.
+with nixpkgs;
+with lib;
+with bionix.types;
+with bionix.gridss;
+
+inputs:
+
+let
+  getref = matchFiletype "gridss-annotateVariants" { bam = x: x.ref; };
+  ref = getref (head inputs); # reference of the first input; homoRef checks the rest agree
+  sorted = matchFileSorting "gridss-annotateVariants" { coord = _: true; };
+  homoRef = length (unique (map getref inputs)) == 1;
+  # Shell snippet: symlink every file produced by `f attrs input` into <basename>.gridss.working/.
+  linkInput = f: attrs: input: ''
+    BASENAME=$(basename ${input})
+    WRKDIR="''${BASENAME}.gridss.working"
+    if [[ ! -e $WRKDIR ]] ; then
+      mkdir $WRKDIR
+    fi
+    for f in ${f attrs input}/* ; do
+      ln -s $f $WRKDIR/$BASENAME.''${f#*.}
+    done
+  '';
+  # Assembly BAM: assemble -> samtools name sort -> softClipsToSplitReads -> samtools sort {}.
+  assembly = bionix.samtools.sort {} (softClipsToSplitReads softClipsToSplitReadsAttrs (bionix.samtools.sort { nameSort = true;} (bionix.gridss.assemble assemblyAttrs inputs)));
+in
+# Every input must match the `coord` sorting case, and all inputs must share a single reference.
+assert (all sorted inputs);
+assert (homoRef);
+
+stdenv.mkDerivation rec {
+  name = "gridss-annotateVariants";
+  buildInputs = [ jre ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    for f in ${bionix.bwa.index bwaIndexAttrs ref}/*; do
+      ln -s $f
+    done
+    ${concatMapStringsSep "\n" (linkInput extractSVReads extractSVReadsAttrs) inputs}
+    ${concatMapStringsSep "\n" (linkInput collectMetrics collectMetricsAttrs) inputs}
+    ${linkInput collectMetrics collectMetricsAttrs assembly}
+    ASSBASE=$(basename ${assembly})
+    ln -s ${assembly} $ASSBASE.gridss.working/$ASSBASE.sv.bam
+    ln -s ${bionix.samtools.index {} assembly} $ASSBASE.gridss.working/$ASSBASE.sv.bai
+    ln -s ${identifyVariants identifyVariantsAttrs inputs} input.vcf
+    java -Xmx4g -Dsamjdk.create_index=true \
+      -cp ${jar} gridss.AnnotateVariants \
+      REFERENCE_SEQUENCE=ref.fa \
+      ${concatMapStringsSep " " (i: "INPUT='${i}'") inputs} \
+      ASSEMBLY=${assembly} \
+      INPUT_VCF=input.vcf \
+      OUTPUT_VCF=out.vcf \
+      WORKING_DIR=$TMPDIR/ \
+      TMP_DIR=$TMPDIR/
+
+    mv out.vcf $out
+  '';
+  passthru = {
+    filetype = filetype.vcf { ref = ref; };
+    gridss.assembly = assembly; # exposes the assembly BAM used for this call
+  };
+}
diff --git a/tools/gridss-assemble.nix
b/tools/gridss-assemble.nix
new file mode 100644
index 0000000..cdda748
--- /dev/null
+++ b/tools/gridss-assemble.nix
@@ -0,0 +1,58 @@
+{ bionix
+, nixpkgs
+, bwaIndexAttrs ? {}
+, faidxAttrs ? {}
+, collectMetricsAttrs ? {}
+, extractSVReadsAttrs ? {}
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.AssembleBreakends over a list of BAMs (`inputs`); $out is the assembly BAM.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+inputs:
+
+let
+  getref = matchFiletype "gridss-assemble" { bam = x: x.ref; };
+  ref = getref (head inputs); # reference of the first input; homoRef checks the rest agree
+  sorted = matchFileSorting "gridss-assemble" { coord = _: true; };
+  homoRef = length (unique (map getref inputs)) == 1;
+  # Shell snippet: link the extractSVReads and collectMetrics outputs into <basename>.gridss.working/.
+  linkInput = input: ''
+    BASENAME=$(basename ${input})
+    WRKDIR="''${BASENAME}.gridss.working"
+    mkdir $WRKDIR
+    for f in ${bionix.gridss.extractSVReads extractSVReadsAttrs input}/* ; do
+      ln -s $f $WRKDIR/$BASENAME.''${f#*.}
+    done
+    for f in ${bionix.gridss.collectMetrics collectMetricsAttrs input}/* ; do
+      ln -s $f $WRKDIR/$BASENAME.''${f#*.}
+    done
+  '';
+in
+# Every input must match the `coord` sorting case, and all inputs must share a single reference.
+assert (all sorted inputs);
+assert (homoRef);
+
+stdenv.mkDerivation rec {
+  name = "gridss-assemble";
+  buildInputs = [ jre bwa ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    for f in ${bionix.bwa.index bwaIndexAttrs ref}/*; do
+      ln -s $f
+    done
+    ${concatMapStringsSep "\n" linkInput inputs}
+    java -Xmx31g -Dsamjdk.create_index=true \
+      -cp ${bionix.gridss.jar} gridss.AssembleBreakends \
+      REFERENCE_SEQUENCE=ref.fa \
+      ${concatMapStringsSep " " (i: "INPUT='${i}'") inputs} \
+      WORKER_THREADS=$NIX_BUILD_CORES \
+      OUTPUT=$out \
+      WORKING_DIR=$TMPDIR/ \
+      TMP_DIR=$TMPDIR/
+  '';
+  passthru.filetype = filetype.bam { ref = ref; sorting = sort.coord {}; };
+}
diff --git a/tools/gridss-callVariants.nix b/tools/gridss-callVariants.nix
index 799c930..416eb7c 100644
--- a/tools/gridss-callVariants.nix
+++ b/tools/gridss-callVariants.nix
@@ -23,10 +23,6 @@ assert (length (unique refs) == 1);
 stdenv.mkDerivation rec {
   name = "gridss-callVariants";
   buildInputs = [ jre R bwa ];
-  jar = fetchurl {
-    url = "https://github.com/PapenfussLab/gridss/releases/download/v2.0.0/gridss-2.0.0-gridss-jar-with-dependencies.jar";
-    sha256 = "01srl3qvv060whqg1y1fpxjc5cwga5wscs1bmf1v3z87dignra7k";
-  };
   buildCommand = ''
     ln -s ${ref} ref.fa
     ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
@@ -41,7 +37,7 @@ stdenv.mkDerivation rec {
       -Dsamjdk.use_async_io_write_samtools=true \
       -Dsamjdk.use_async_io_write_tribble=true \
       -Dgridss.gridss.output_to_temp_file=true \
-      -cp ${jar} gridss.CallVariants \
+      -cp ${bionix.gridss.jar} gridss.CallVariants \
       WORKER_THREADS=$NIX_BUILD_CORES \
       TMP_DIR=. \
       WORKING_DIR=. \
diff --git a/tools/gridss-collectGridssMetrics.nix b/tools/gridss-collectGridssMetrics.nix
new file mode 100644
index 0000000..bb8d2e9
--- /dev/null
+++ b/tools/gridss-collectGridssMetrics.nix
@@ -0,0 +1,32 @@
+{ bionix
+, nixpkgs
+, thresholdCoverage ? 10000
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.analysis.CollectGridssMetrics on one BAM (`input`), with O=$out.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+input:
+
+let
+  ref = matchFiletype "gridss-collectGridssMetrics" { bam = x: x.ref; } input;
+  sorted = matchFileSorting "gridss-collectGridssMetrics" { name = _: true; } input;
+in
+
+# NOTE(review): tools/gridss.nix in this patch does not expose this file via callBionix.
+stdenv.mkDerivation rec {
+  name = "gridss-collectGridssMetrics";
+  buildInputs = [ jre ];
+  buildCommand = ''
+    mkdir $out
+    ln -s ${input} input.bam
+    java -Xmx1G -cp ${bionix.gridss.jar} \
+      gridss.analysis.CollectGridssMetrics \
+      ${optionalString sorted "ASSUME_SORTED=true"} \
+      I=input.bam \
+      O=$out \
+      THRESHOLD_COVERAGE=${toString thresholdCoverage}
+  '';
+}
diff --git a/tools/gridss-collectMetrics.nix b/tools/gridss-collectMetrics.nix
new file mode 100644
index 0000000..4688808
--- /dev/null
+++ b/tools/gridss-collectMetrics.nix
@@ -0,0 +1,30 @@
+{ bionix
+, nixpkgs
+, thresholdCoverage ? 10000
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.analysis.CollectGridssMetrics (AS=true) on one BAM; outputs use the prefix $out/input.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+input:
+
+let
+  ref = matchFiletype "gridss-collectMetrics" { bam = x: x.ref; } input;
+in
+
+
+stdenv.mkDerivation rec {
+  name = "gridss-collectMetrics";
+  buildInputs = [ jre R ];
+  buildCommand = ''
+    mkdir $out
+    java -Xmx1G -cp ${bionix.gridss.jar} \
+      gridss.analysis.CollectGridssMetrics \
+      I=${input}\
+      O=$out/input \
+      AS=true \
+      THRESHOLD_COVERAGE=${toString thresholdCoverage}
+  '';
+}
diff --git a/tools/gridss-computeSamTags.nix b/tools/gridss-computeSamTags.nix
new file mode 100644
index 0000000..f75cea0
--- /dev/null
+++ b/tools/gridss-computeSamTags.nix
@@ -0,0 +1,42 @@
+{ bionix
+, nixpkgs
+, blacklist ? null # NOTE(review): accepted but never referenced below
+, bwaIndexAttrs ? {}
+, faidxAttrs ? {}
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.ComputeSamTags on one BAM (`input`); $out is a BAM carrying the input's filetype.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+input:
+
+let
+  ref = matchFiletype "gridss-computeSamTags" { bam = x: x.ref; } input;
+  sorted = matchFileSorting "gridss-computeSamTags" { name = _: true; } input;
+in
+# The input must match the `name` sorting case.
+assert(sorted);
+# Scratch paths use $TMPDIR/, the variable the sibling GRIDSS builders pass; $TMP_DIR is never set.
+stdenv.mkDerivation rec {
+  name = "gridss-computeSamTags";
+  buildInputs = [ jre ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    for f in ${bionix.bwa.index bwaIndexAttrs ref}/*; do
+      ln -s $f
+    done
+    java -Xmx1G \
+      -Dsamjdk.create_index=false \
+      -cp ${bionix.gridss.jar} gridss.ComputeSamTags \
+      REFERENCE_SEQUENCE=ref.fa \
+      WORKING_DIR=$TMPDIR/ \
+      TMP_DIR=$TMPDIR/ \
+      I=${input} \
+      O=$out \
+      AS=true
+  '';
+  passthru.filetype = input.filetype;
+}
diff --git a/tools/gridss-extractSVReads.nix b/tools/gridss-extractSVReads.nix
new file mode 100644
index 0000000..a10abf5
--- /dev/null
+++ b/tools/gridss-extractSVReads.nix
@@ -0,0 +1,40 @@
+{ bionix
+, nixpkgs
+, dictIndexAttrs ? {}
+, faidxAttrs ? {}
+, flags ? null # NOTE(review): accepted but never referenced below
+, unmappedReads ? false
+, minClipLength ? 5
+}:
+# Runs gridss.ExtractSVReads on one BAM; $out is a directory holding input.sv.bam and two metrics files.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+input:
+
+let
+  ref = matchFiletype "gridss-extractSVReads" { bam = x: x.ref; } input;
+in
+
+
+stdenv.mkDerivation rec {
+  name = "gridss-extractSVReads";
+  buildInputs = [ jre R ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    ln -s ${bionix.samtools.dict dictIndexAttrs ref} ref.fa.dict
+    ln -s ${input} input.bam
+    mkdir $out
+    java -Dsamjdk.create_index=true \
+      -cp ${bionix.gridss.jar} gridss.ExtractSVReads \
+      REFERENCE_SEQUENCE=ref.fa \
+      I=input.bam \
+      O=$out/input.sv.bam \
+      METRICS_OUTPUT=$out/input.sv_metrics \
+      INSERT_SIZE_METRICS=$out/input.insert_size_metrics \
+      UNMAPPED_READS=${if unmappedReads then "true" else "false"} \
+      MIN_CLIP_LENGTH=${toString minClipLength}
+  '';
+}
diff --git a/tools/gridss-identifyVariants.nix b/tools/gridss-identifyVariants.nix
new file mode 100644
index 0000000..f44771b
--- /dev/null
+++ b/tools/gridss-identifyVariants.nix
@@ -0,0 +1,72 @@
+{ bionix
+, nixpkgs
+, bwaIndexAttrs ? {}
+, faidxAttrs ? {}
+, assemblyAttrs ? {}
+, extractSVReadsAttrs ? {}
+, collectMetricsAttrs ? {}
+, softClipsToSplitReadsAttrs ? {}
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.IdentifyVariants over a list of BAMs (`inputs`) and yields the VCF as $out.
+with nixpkgs;
+with lib;
+with bionix.types;
+with bionix.gridss;
+
+inputs:
+
+let
+  getref = matchFiletype "gridss-identifyVariants" { bam = x: x.ref; };
+  ref = getref (head inputs); # reference of the first input; homoRef checks the rest agree
+  sorted = matchFileSorting "gridss-identifyVariants" { coord = _: true; };
+  homoRef = length (unique (map getref inputs)) == 1;
+  # Shell snippet: symlink every file produced by `f attrs input` into <basename>.gridss.working/.
+  linkInput = f: attrs: input: ''
+    BASENAME=$(basename ${input})
+    WRKDIR="''${BASENAME}.gridss.working"
+    if [[ ! -e $WRKDIR ]] ; then
+      mkdir $WRKDIR
+    fi
+    for f in ${f attrs input}/* ; do
+      ln -s $f $WRKDIR/$BASENAME.''${f#*.}
+    done
+  '';
+  # Assembly BAM: assemble -> samtools name sort -> softClipsToSplitReads -> samtools sort {}.
+  assembly = bionix.samtools.sort {} (softClipsToSplitReads softClipsToSplitReadsAttrs (bionix.samtools.sort { nameSort = true;} (bionix.gridss.assemble assemblyAttrs inputs)));
+in
+# Every input must match the `coord` sorting case, and all inputs must share a single reference.
+assert (all sorted inputs);
+assert (homoRef);
+
+stdenv.mkDerivation rec {
+  name = "gridss-identifyVariants";
+  buildInputs = [ jre ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    for f in ${bionix.bwa.index bwaIndexAttrs ref}/*; do
+      ln -s $f
+    done
+    ${concatMapStringsSep "\n" (linkInput extractSVReads extractSVReadsAttrs) inputs}
+    ${concatMapStringsSep "\n" (linkInput collectMetrics collectMetricsAttrs) inputs}
+    ${linkInput collectMetrics collectMetricsAttrs assembly}
+    ASSBASE=$(basename ${assembly})
+    ln -s ${assembly} $ASSBASE.gridss.working/$ASSBASE.sv.bam
+    ln -s ${bionix.samtools.index {} assembly} $ASSBASE.gridss.working/$ASSBASE.sv.bai
+    java -Xmx4g -Dsamjdk.create_index=true \
+      -cp ${jar} gridss.IdentifyVariants \
+      REFERENCE_SEQUENCE=ref.fa \
+      ${concatMapStringsSep " " (i: "INPUT='${i}'") inputs} \
+      ASSEMBLY=${assembly} \
+      OUTPUT_VCF=out.vcf \
+      WORKING_DIR=$TMPDIR/ \
+      TMP_DIR=$TMPDIR/
+
+    mv out.vcf $out
+  '';
+  passthru = {
+    filetype = filetype.vcf { ref = ref; };
+    gridss.assembly = assembly; # exposes the assembly BAM used for this call
+  };
+}
diff --git a/tools/gridss-softClipsToSplitReads.nix b/tools/gridss-softClipsToSplitReads.nix
new file mode 100644
index 0000000..8a7dca3
--- /dev/null
+++ b/tools/gridss-softClipsToSplitReads.nix
@@ -0,0 +1,43 @@
+{ bionix
+, nixpkgs
+, bwaIndexAttrs ? {}
+, faidxAttrs ? {}
+, alignerStreaming ? false
+, flags ? null # NOTE(review): accepted but never referenced below
+}:
+# Runs gridss.SoftClipsToSplitReads on one BAM (`input`); $out is the resulting BAM.
+with nixpkgs;
+with lib;
+with bionix.types;
+
+input:
+
+let
+  ref = matchFiletype "gridss-softClipsToSplitReads" { bam = x: x.ref; } input;
+in
+# The input must match the `name` sorting case.
+assert (matchFileSorting "gridss-softClipsToSplitReads" { name = _: true; } input);
+
+stdenv.mkDerivation rec {
+  name = "gridss-softClipsToSplitReads";
+  buildInputs = [ jre ];
+  buildCommand = ''
+    ln -s ${ref} ref.fa
+    ln -s ${bionix.samtools.faidx faidxAttrs ref} ref.fa.fai
+    for f in ${bionix.bwa.index bwaIndexAttrs ref}/*; do
+      ln -s $f
+    done
+    java -Xmx2G -Dsamjdk.create_index=false \
+      -cp ${bionix.gridss.jar} gridss.SoftClipsToSplitReads \
+      REFERENCE_SEQUENCE=ref.fa \
+      I=${input} \
+      O=$out \
+      ${optionalString alignerStreaming "ALIGNER_STREAMING=true"} \
+      WORKER_THREADS=$NIX_BUILD_CORES
+  '';
+  passthru.filetype =
+    if alignerStreaming then
+      filetype.bam { ref = ref; sorting = sort.none {}; } # attribute `sorting`, constructors under `sort`, as in gridss-assemble.nix; TODO confirm sort.none exists
+    else
+      input.filetype;
+}
diff --git a/tools/gridss.nix b/tools/gridss.nix
index 7a2f217..d46a8ce 100644
--- a/tools/gridss.nix
+++ b/tools/gridss.nix
@@ -1,7 +1,21 @@
 {bionix, nixpkgs}:

+with nixpkgs;
 with bionix;

-{
+rec {
+  jar = fetchurl {
+    url = "https://github.com/PapenfussLab/gridss/releases/download/v2.0.0/gridss-2.0.0-gridss-jar-with-dependencies.jar";
+    sha256 = "01srl3qvv060whqg1y1fpxjc5cwga5wscs1bmf1v3z87dignra7k";
+  };
   callVariants = callBionix ./gridss-callVariants.nix;
+  computeSamTags = callBionix ./gridss-computeSamTags.nix;
+  softClipsToSplitReads = callBionix ./gridss-softClipsToSplitReads.nix;
+  collectMetrics = callBionix ./gridss-collectMetrics.nix;
+  extractSVReads = callBionix ./gridss-extractSVReads.nix;
+  assemble = callBionix ./gridss-assemble.nix;
+  identifyVariants = callBionix ./gridss-identifyVariants.nix;
+  annotateVariants = callBionix ./gridss-annotateVariants.nix;
+  preprocessBam = input: with samtools; markdup {} (sort {} (fixmate {mateScore = true;} (softClipsToSplitReads {} (computeSamTags {} (sort {nameSort = true;} input))))); # name sort -> computeSamTags -> softClipsToSplitReads -> fixmate -> sort -> markdup
+  call = inputs: annotateVariants {} (map preprocessBam inputs); # preprocess every BAM, then annotate with default attrs
 }
-- 
cgit v1.2.3