From e72ee50421716fde6646afd3a444b993413d3440 Mon Sep 17 00:00:00 2001 From: Justin Bedo Date: Fri, 9 Dec 2022 10:46:11 +1100 Subject: add illustrative figures and abtract --- slides.tex | 99 +++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 23 deletions(-) (limited to 'slides.tex') diff --git a/slides.tex b/slides.tex index f8121d9..1cf5229 100644 --- a/slides.tex +++ b/slides.tex @@ -5,12 +5,13 @@ \usefonttheme{professionalfonts} \setbeamerfont{footnote}{size= \tiny} + \usepackage{unicode-math} \usepackage{microtype} \usepackage{tikz} -\usetikzlibrary{shapes} -\usetikzlibrary{bayesnet} +\usepackage{pgfplots} +\usepgfplotslibrary{ternary} \usepackage{stmaryrd} \newcommand{\R}{\mathbb{R}} @@ -22,9 +23,13 @@ \newcommand{\V}{\mathbf{V}} \newcommand{\A}{\mathbf{A}} \newcommand{\I}{\mathbf{I}} +\newcommand{\U}{\mathbf{u}} +\newcommand{\Q}{\mathbf{q}} +\newcommand{\PP}{\mathbf{P}} \DeclareMathOperator{\alr}{alr} \DeclareMathOperator{\clr}{clr} + \usepackage[natbib=true,url=false,style=verbose-ibid]{biblatex} \addbibresource{slides.bib} \AtBeginBibliography{\small} @@ -84,9 +89,19 @@ \definecolor{cb3}{HTML}{7570b3} \author{Justin Bed\H{o}} -\title{Representation learning of compositional counts: exploration of deep mutational scanning data} +\title{Representation learning of compositional counts: an exploration of deep mutational scanning data} \date{December 13, 2022} +% Abstract: + +% Deep mutational scanning data provides important functional information on the % effects of protein variants. Many different aspects of proteins can be assayed, % many different experimental designs are possible, and many different scores are % computed leading to very heterogeneous data that is difficult to integrate. + +% In this talk I will explore a representational learning approach on raw count % data. 
This technique uses recent methods combining compositional data analysis % with a generalised form of principal component analysis to infer protein % representations without specific knowledge of the experimental design or assay % type. + +% Bio + +% Dr Justin Bedő is the Stafford Fox Centenary Fellow in Bioinformatics and % Computational Biology at the Walter and Eliza Hall Institute. He studied % computer science followed by a PhD in machine learning at the Australian % National University and was awarded his doctorate in 2009. He subsequently % worked as a researcher across both academia and industry at NICTA, IBISC % (Informatique, BioInformatique, Systèmes Complexes) CNRS, and IBM Research on % machine learning methods development and applications to biology before joining % the WEHI in 2016. + \begin{document} \maketitle @@ -105,6 +120,7 @@ \item Growing resource of functional data \item MaveDB \footfullcite{Esposito2019} + \unskip \footnote{\url{https://www.mavedb.org}} catalogs a number of datasets and provides easy access \end{enumerate} \end{frame} @@ -153,24 +169,42 @@ \begin{enumerate} \item Scores calculated a variety of ways, e.g., Rubin et al. \footfullcite{Rubin2017}: - \[L_{v,t}=\log\left(\frac{(c_{v,t}+\frac12)(c_{wt,0}+\frac12)}{(c_{v,0}+\frac12)(c_{wt,t}+\frac12)}\right) \] + \[L_{v,t}=\log\left(\frac{(c_{v,t}+\frac12)(c_{wt,0}+\frac12)}{(c_{v,0}+\frac12)(c_{wt,t}+\frac12)}\right) \] + \item Assays can measure different properties + \item Numerous different experimental designs \end{enumerate} \end{frame} - \begin{frame}{Basics} - \begin{definition}[Compositional data] Data \(X \in \R^{n \times d}\) is compositional if rows \(\bx_i\) are in the simplex - \[S^d=\{\,\bx \in \R^d : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\} \] - for constant \(\kappa > 0\). 
- \end{definition} Information is therefore given only by the ratios of components and any composition can be normalised to the standard simplex where \(\kappa = 1\) (c.f., dividing by library size). + \begin{frame}{Compositional simplex} + \begin{columns}[T] + \begin{column}{.63 + \textwidth} + \begin{definition}[Compositional data] Data \(X \in \R^{n \times d}\) is compositional if rows \(\bx_i\) are in the simplex + \[S^d=\{\,\bx \in \R^d : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\} \] + for constant \(\kappa > 0\). + \end{definition} + \end{column} + \hfill + \begin{column}{.26 + \textwidth} + \begin{tikzpicture}[scale=0.5] + \begin{ternaryaxis} + \addplot3 coordinates{(0.25,0.5,0.25)}; + \path (0.25,0.5,0.25) coordinate (M) (1,0,0) coordinate (C) (0,1,0) coordinate (A) (0,0,1) coordinate (B); + \end{ternaryaxis} + \end{tikzpicture} + \end{column} + \end{columns} + \vspace{10pt} \(\Rightarrow\) Information is given only by the ratios of components and any composition can be normalised to the standard simplex where \(\kappa = 1\) (c.f., dividing by library size). 
\end{frame} \begin{frame}{Isomorphisms to Euclidean vector spaces} The simplex forms a \(d-1\) dimensional Euclidean vector space \footfullcite{Aitchison1982}: \begin{definition}[\ac{alr}] - \[\alr(\bx)_i = \log \frac{x_i}{x_0} \] + \[\alr_i(\bx) = \log \frac{x_i}{x_0} \] \end{definition} \begin{definition}[\ac{clr}] - \[\clr(\bx)_i = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}} \] + \[\clr_i(\bx) = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}} \] \end{definition} \end{frame} @@ -190,6 +224,7 @@ \begin{itemize} \item Zeros: \begin{enumerate} + \item $\log(0)$ undefined \item geometric mean is \(0\) \(\Rightarrow\) \ac{clr} is undefined \item @@ -198,7 +233,7 @@ \item Interpretation: \begin{enumerate} \item - \ac{alr} is not isometry + \ac{alr} is not an isometry \item \ac{clr} is degenerate \end{enumerate} @@ -208,7 +243,7 @@ \begin{frame}{Traditional \ac{pca}} Given \(\X\in \R^{n\times d}\) minimise loss - \[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}} \] + \[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}}\] s.t. \(\V \in \R^{n \times k}\), \(\A \in \R^{k \times d}\), and \(\V^\intercal \V = \I\). @@ -219,36 +254,35 @@ \begin{frame}{Exponential family \ac{pca}} - \begin{definition}{Bregman Divergence} Let \(\varphi \colon \R^d \to \R\) be a smooth ($C^1$) convex function on convex set \(\Omega\). + \begin{definition}[Bregman Divergence] Let \(\varphi \colon \R^d \to \R\) be a smooth ($C^1$) convex function on convex set \(\Omega\). The Bregman divergence \(D_\varphi\) with generator \(\varphi\) is - \[ D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle. \] + \[ D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle. 
\] \end{definition} - Denote the convex conjugate of \(\varphi\) as \(\varphi^*(\bu) \triangleq \sup_\bv\left\{\langle \bu,\bv\rangle-\varphi(\bv)\right\}\). The exponential family \ac{pca} is then given by minimising loss - \[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right) \] + \[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right) \] under the same constraints as previously, approximating \(\X \sim \nabla\varphi^*\left(\V\A\right)\). \end{frame} \begin{frame}{Aitchison's simplex and exponential \ac{pca}} Aitchison's log-transformation is a dual affine coordinate space made explicit with - \[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z, \] + \[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z,\] but what about normalisation? Consider \ac{alr}: - \[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}} \] + \[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}} \] \end{frame} \begin{frame}{Scaled Bregman} - \begin{theorem}{Scaled Bregman - \footfullcite{nock2016scaled}} Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable. + \begin{theorem}[Scaled Bregman + \footfullcite{nock2016scaled}] Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable. 
Then - \[g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) = D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right) \] + \[g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) = D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right) \] where - \[\breve{\varphi} \triangleq g(\bx) \cdot \varphi\left(\frac{x}{g(\bx)}\right) \] + \[\breve{\varphi} \triangleq g(\bx) \cdot \varphi\left(\frac{\bx}{g(\bx)}\right)\] \end{theorem} Avalos et al. @@ -257,6 +291,11 @@ \ac{clr} recently. \end{frame} + \begin{frame}{Medians instead of means} + Zeros still a problem, as geometric mean is $0$. Instead, use median as gauge + function. + \end{frame} + \begin{frame}{Activation-Induced Deaminase \footfullcite{Gajula2014}} \begin{tikzpicture}[remember picture,overlay] @@ -316,4 +355,18 @@ \end{tikzpicture} \end{frame} + \begin{frame}{\textsc{Brca1}: Positional effects} + \begin{columns}[T] + \begin{column}{.4\textwidth} + \[\V\A+\U^\intercal\Q\PP\] + where $\U \in \R^n$, $\Q \in \R^l$, $\PP \in \mathbb{2}^{l\times d}$ + \end{column}\hfill + \begin{column}{.58\textwidth} + \begin{tikzpicture} + \node[scale=.45]{\input{position.tikz}}; + \end{tikzpicture} + \end{column} + \end{columns} + \end{frame} + \end{document} -- cgit v1.2.3