% Beamer slide deck: preamble, title metadata, and the opening frames.
\documentclass[aspectratio=169,UKenglish]{beamer}
\usetheme{metropolis}
\usepackage[sfdefault]{FiraSans}
\usefonttheme{professionalfonts}
\setbeamerfont{footnote}{size=\tiny}
\usepackage{unicode-math}
\usepackage{microtype}
\usepackage{tikz}
\usepackage{pgfplots}
\usepgfplotslibrary{ternary}
\usepackage{stmaryrd}

% Notation shorthands: blackboard-bold R and bold-face symbols.
\newcommand{\R}{\mathbb{R}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\U}{\mathbf{u}}
\newcommand{\Q}{\mathbf{q}}
\newcommand{\PP}{\mathbf{P}}
\DeclareMathOperator{\alr}{alr}
\DeclareMathOperator{\clr}{clr}

\usepackage[natbib=true,url=false,style=verbose-ibid]{biblatex}
\addbibresource{slides.bib}
\AtBeginBibliography{\small}

% Tikz relative positioning https://tex.stackexchange.com/questions/89588/positioning-relative-to-page-in-tikz
% (page cs:x,y) interpolates linearly between the south-west and north-east
% anchors of the `current page' node: (0,0) is the page centre, (-1,-1) the
% south-west corner and (1,1) the north-east corner.
% NOTE: this comment must stay on its own line(s); a `%' comment runs to the
% end of the physical line and would otherwise swallow the code below.
\makeatletter
\def\parsecomma#1,#2\endparsecomma{\def\page@x{#1}\def\page@y{#2}}
\tikzdeclarecoordinatesystem{page}{%
  \parsecomma#1\endparsecomma
  \pgfpointanchor{current page}{north east}%
  \pgf@xc=\pgf@x
  \pgf@yc=\pgf@y
  \pgfpointanchor{current page}{south west}%
  \pgf@xb=\pgf@x
  \pgf@yb=\pgf@y
  \pgfmathparse{(\pgf@xc-\pgf@xb)/2.*\page@x+(\pgf@xc+\pgf@xb)/2.}%
  \expandafter\pgf@x\expandafter=\pgfmathresult pt
  \pgfmathparse{(\pgf@yc-\pgf@yb)/2.*\page@y+(\pgf@yc+\pgf@yb)/2.}%
  \expandafter\pgf@y\expandafter=\pgfmathresult pt
}
\makeatother

% Acronyms, typeset in small caps.
\usepackage{acronym}
\usepackage{xspace}
\renewcommand*{\acsfont}[1]{\textsc{#1}}
\newacro{dms}{Deep Mutational Scanning}
\newacro{clr}{Centred Log-Ratio}
\newacro{alr}{Additive Log-Ratio}
\newacro{pca}{Principal Component Analysis}
% No literal space before \xspace: \xspace itself decides whether a space is
% needed, so an explicit one would produce a doubled space in running text.
\newcommand{\dms}{\ac{dms}\xspace}

% Fill colours used by the TikZ diagrams.
\definecolor{cb1}{HTML}{1b9e77}
\definecolor{cb2}{HTML}{d95f02}
\definecolor{cb3}{HTML}{7570b3}

\author{Justin Bed\H{o}}
\title{Representation learning of compositional counts: an exploration of deep mutational scanning data}
\date{December 13, 2022}

% Abstract:
% Deep mutational scanning data provides important functional information on the
% effects of protein variants. Many different aspects of proteins can be assayed,
% many different experimental designs are possible, and many different scores are
% computed leading to very heterogeneous data that is difficult to integrate.
% In this talk I will explore a representational learning approach on raw count
% data. This technique uses recent methods combining compositional data analysis
% with a generalised form of principal component analysis to infer protein
% representations without specific knowledge of the experimental design or assay
% type.

% Bio
% Dr Justin Bedő is the Stafford Fox Centenary Fellow in Bioinformatics and
% Computational Biology at the Walter and Eliza Hall Institute. He studied
% computer science followed by a PhD in machine learning at the Australian
% National University and was awarded his doctorate in 2009. He subsequently
% worked as a researcher across both academia and industry at NICTA, IBISC
% (Informatique, BioInformatique, Systèmes Complexes) CNRS, and IBM Research on
% machine learning methods development and applications to biology before joining
% the WEHI in 2016.

\begin{document}
\maketitle

\begin{frame}{Variants of Uncertain Significance \footfullcite{Liu2020}}
  \begin{center}
    \input{clinvar.tikz}
  \end{center}
\end{frame}

\begin{frame}{\dms}
  \begin{quote}
    Deep mutational scanning is a method for systematically introducing
    mutations into a gene and then analyzing the resulting protein products to
    see how the changes affect the protein's function.
\end{quote} \begin{enumerate} \item Growing resource of functional data \item MaveDB \footfullcite{Esposito2019} \unskip \footnote{\url{https://www.mavedb.org}} catalogs a number of datasets and provides easy access \end{enumerate} \end{frame} \begin{frame}{Deep Mutational Scanning: Overview \footfullcite{Fowler2014}} \begin{tikzpicture} \node at (page cs:0,0.75){\(t_0\)}; \node at (page cs:0.53,0.75){\(t_1\)}; \node(a) at (page cs:-0.75,0.5){\includegraphics[width=0.3 \textwidth]{Protein-BRCA1.png}}; \node(b) at (page cs:0,0.5){\begin{tikzpicture} \node[circle,draw,fill=cb1] at (page cs:-0.06,0){}; \node[circle,draw,fill=cb1] at (page cs:0,0){}; \node[circle,draw,fill=cb1] at (page cs:0.06,0){}; \node[circle,draw,fill=cb2] at (page cs:-0.06,0.1){}; \node[circle,draw,fill=cb2] at (page cs:0.06,0.1){}; \node[circle,draw,fill=cb2] at (page cs:0,0.1){}; \node[circle,draw,fill=cb3] at (page cs:-0.06,-0.1){}; \node[circle,draw,fill=cb3] at (page cs:0,-0.1){}; \node[circle,draw,fill=cb3] at (page cs:0.06,-0.1){}; \end{tikzpicture}}; \node(c) at (page cs:0.5,0.5){\begin{tikzpicture} \node[circle,draw,fill=cb1] at (page cs:0.5,0){}; \node[circle,draw,fill=cb1] at (page cs:0.56,0){}; \node[circle,draw,fill=cb1] at (page cs:0.62,0){}; \node[circle,draw,fill=cb1] at (page cs:0.68,0){}; \node[circle,draw,fill=cb1] at (page cs:0.74,0){}; \node[circle,draw,fill=cb2] at (page cs:0.5,0.1){}; \node[circle,draw,fill=cb3] at (page cs:0.5,-0.1){}; \node[circle,draw,fill=cb3] at (page cs:0.56,-0.1){}; \end{tikzpicture}}; \node(d) at (page cs:0.2,-0.25){\includegraphics[width=0.3 \textwidth]{nextseq500.jpg}}; \draw[->] (a) -- (b) node[midway,above]{mutagenesis}; \draw[->] (b) -- (c) node[midway,above]{selection}; \draw[->] (b) -- (d); \draw[->] (c) -- (d); \end{tikzpicture} \end{frame} \begin{frame}{Deep Mutational Scanning: Integration issues} \begin{enumerate} \item Assays can measure different properties \item Numerous different experimental designs \item Scores calculated in a variety
of ways, e.g., Rubin et al. \footfullcite{Rubin2017}: \[L_{v,t}=\log\left(\frac{(c_{v,t}+\frac12)(c_{wt,0}+\frac12)}{(c_{v,0}+\frac12)(c_{wt,t}+\frac12)}\right) \] \end{enumerate} \end{frame} \begin{frame}{Representational learning on \ac{dms} data} For a given protein: \begin{itemize} \item Learn a representation of the available \ac{dms} data \item unsupervised to deal with varying designs \item work on counts not scores \end{itemize} \end{frame} \begin{frame}{Compositional simplex} \begin{columns}[T] \begin{column}{.63 \textwidth} \begin{definition}[Compositional data] Data \(\X \in \R_{\geq 0}^{n \times d}\) is compositional if rows \(\bx_i\) are in the simplex \[S^d=\{\,\bx \in \R^d_{\geq 0} : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\} \] for constant \(\kappa > 0\). \end{definition} \end{column} \hfill \begin{column}{.26 \textwidth} \begin{tikzpicture}[scale=0.5] \begin{ternaryaxis} \addplot3 coordinates{(0.25,0.5,0.25)}; \path (0.25,0.5,0.25) coordinate (M) (1,0,0) coordinate (C) (0,1,0) coordinate (A) (0,0,1) coordinate (B); \end{ternaryaxis} \end{tikzpicture} \end{column} \end{columns} \vspace{10pt} \(\Rightarrow\) Information is given only by the ratios of components and any composition can be normalised to the standard simplex where \(\kappa = 1\) (cf.\ dividing by library size).
\end{frame} \begin{frame}{Isomorphisms to Euclidean vector spaces} The simplex forms a \(d-1\) dimensional Euclidean vector space \footfullcite{Aitchison1982}: \begin{definition}[\ac{alr}] \[\alr_i(\bx) = \log \frac{x_i}{x_0} \] \end{definition} \begin{definition}[\ac{clr}] \[\clr_i(\bx) = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}} \] \end{definition} \end{frame} \begin{frame}{\textsc{Pca} on \ac{dms} data} \begin{block}{Transformation approach} \begin{enumerate} \item Map \dms data to Euclidean space via \ac{alr} / \ac{clr} \item Apply standard \ac{pca} \end{enumerate} \end{block} \pause \begin{block}{Problems} \begin{itemize} \item Zeros: \begin{enumerate} \item \(\log(0)\) undefined \(\Rightarrow\) can't handle unobserved components \item geometric mean is \(0\) \(\Rightarrow\) \ac{clr} is undefined \end{enumerate} \end{itemize} \end{block} \end{frame} \begin{frame}{Traditional \ac{pca}} Given \(\X\in \R^{n\times d}\) minimise loss \[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}} \] s.t. \(\V \in \R^{n \times k}\), \(\A \in \R^{k \times d}\), and \(\V^\intercal \V = \I\). \pause Has been generalised to exponential families \footfullcite{collins2001generalization} via Bregman divergences \footfullcite{Amari2016-ua}. \end{frame} \begin{frame}{Exponential family \ac{pca}} \begin{definition}[Bregman Divergence] Let \(\varphi \colon \R^d \to \R\) be a differentiable convex function. The Bregman divergence \(D_\varphi\) with generator \(\varphi\) is \[ D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle. \] \end{definition} \pause Denote the convex conjugate of \(\varphi\) as \(\varphi^*(\bu) \triangleq \sup_\bv\left\{\langle \bu,\bv\rangle-\varphi(\bv)\right\}\). 
The exponential family \ac{pca} is then given by minimising loss \[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right) \] under the same constraints as previously, approximating \(\X \sim \nabla\varphi^*\left(\V\A\right)\). \end{frame} \begin{frame}{Aitchison's simplex and exponential \ac{pca}} Aitchison's log-transformation is a dual affine coordinate space made explicit with \[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z, \] but what about normalisation? \pause Consider \ac{alr}: \[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}} \] \end{frame} \begin{frame}{Scaled Bregman} \begin{theorem}[Scaled Bregman \footfullcite{nock2016scaled}] Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable. Then \[D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right) = g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) \] where \[\breve{\varphi}(\bx) \triangleq g(\bx) \cdot \varphi\left(\frac{\bx}{g(\bx)}\right) \] \end{theorem} Avalos et al. \footfullcite{avalos2018representation} \ considered a relaxed form for \ac{clr} recently. \end{frame} \begin{frame}{\textsc{Clr} undefined if any component is unobserved} \begin{itemize} \item Zeros still a problem for \ac{clr} as geometric mean is \(0\). \item[\(\Rightarrow\)] use median as gauge function.
\end{itemize} \end{frame} \section{Experiments} \begin{frame}{Activation-Induced Deaminase \footfullcite{Gajula2014}} \begin{tikzpicture}[remember picture,overlay] \node[scale=0.85] at (page cs:0,0.08){\input{106-samples.tikz}}; \end{tikzpicture} \end{frame} \begin{frame}{Activation-Induced Deaminase} \begin{tikzpicture} \node at (page cs:-0.5,0.08){\input{106-Leu113.tikz}}; \node at (page cs:0.5,0.5){\includegraphics{gku689fig3-a.pdf}}; \node at (page cs:0.5,-0.25){\includegraphics{gku689fig3-key.pdf}}; \end{tikzpicture} \end{frame} \begin{frame}{Activation-Induced Deaminase} \begin{tikzpicture} \node at (page cs:-0.5,0.08){\input{106-Phe115.tikz}}; \node at (page cs:0.5,0.5){\includegraphics{gku689fig3-b.pdf}}; \node at (page cs:0.5,-0.25){\includegraphics{gku689fig3-key.pdf}}; \end{tikzpicture} \end{frame} \begin{frame}{Activation-Induced Deaminase} \begin{tikzpicture} \node at (page cs:-0.5,0.08){\input{106-Glu117.tikz}}; \node at (page cs:0.5,0.5){\includegraphics{gku689fig3-c.pdf}}; \node at (page cs:0.5,-0.25){\includegraphics{gku689fig3-key.pdf}}; \end{tikzpicture} \end{frame} \begin{frame}{Activation-Induced Deaminase} \begin{tikzpicture} \node at (page cs:-0.7,0.9){\textbf{Bregman}}; \node at (page cs:0.3,0.9){\textbf{+1-log \ac{pca}}}; \node[scale=0.9] at (page cs:-0.5,0.08){\input{106-samples.tikz}}; \node[scale=0.9] at (page cs:0.5,0.08){\input{106-samples-log.tikz}}; \end{tikzpicture} \end{frame} \begin{frame}{\textsc{Erbb2} \footfullcite{Elazar2016}} \begin{tikzpicture} \node[scale=0.8] at (page cs: -0.5,0){\input{helix-erbb2.tikz}}; \node at (page cs: 0.5,0.07){\includegraphics[width=0.4 \textwidth]{helix-erbb2-pub.jpg}}; \end{tikzpicture} \end{frame} \begin{frame}{\textsc{Brca1} \footfullcite{Findlay2018}} \begin{tikzpicture}[remember picture,overlay] \node[inner sep=0pt] at (5,0.5){\input{brca1-density.tikz}}; \node[inner sep=0pt] at (11,1.25){\includegraphics{brca1-hist-pub.jpg}}; \end{tikzpicture} \end{frame} \begin{frame}{\textsc{Brca1}: 
Positional effects} \begin{columns}[T] \begin{column}{.4 \textwidth} \vspace{1cm} \[\V\A+\U^\intercal\Q\PP \] where \(\U \in \R^n\), \(\Q \in \R^l\), \(\PP \in \mathbb{2}^{l\times d}\) \end{column} \hfill \begin{column}{.58 \textwidth} \begin{tikzpicture} \node[scale=.45]{\input{position.tikz}}; \end{tikzpicture} \end{column} \end{columns} \end{frame} \begin{frame}{Acknowledgements} \begin{columns}[T] \begin{column}{.4 \textwidth} \textbf{Papenfuss lab} \begin{itemize} \item Tony Papenfuss \item \textit{Alan Rubin} \item \textit{Matthew Wakefield} \end{itemize} \end{column} \hfill \begin{column}{.4 \textwidth} \textbf{Stafford Fox Medical Research Foundation} \end{column} \end{columns} \end{frame} \end{document}