summaryrefslogtreecommitdiff
path: root/slides.tex
diff options
context:
space:
mode:
authorJustin Bedo <cu@cua0.org>2022-12-09 10:46:11 +1100
committerJustin Bedo <cu@cua0.org>2022-12-12 10:04:41 +1100
commite72ee50421716fde6646afd3a444b993413d3440 (patch)
tree598a9dcc461fdd51e917f7cbe2d713ab3b99a758 /slides.tex
parenta6df2a5886383bbf1d782802bfd65fdcf4dc319f (diff)
add illustrative figures and abstract
Diffstat (limited to 'slides.tex')
-rw-r--r--slides.tex99
1 files changed, 76 insertions, 23 deletions
diff --git a/slides.tex b/slides.tex
index f8121d9..1cf5229 100644
--- a/slides.tex
+++ b/slides.tex
@@ -5,12 +5,13 @@
\usefonttheme{professionalfonts}
\setbeamerfont{footnote}{size=
\tiny}
+ \usepackage{unicode-math}
\usepackage{microtype}
\usepackage{tikz}
-\usetikzlibrary{shapes}
-\usetikzlibrary{bayesnet}
+\usepackage{pgfplots}
+\usepgfplotslibrary{ternary}
\usepackage{stmaryrd}
\newcommand{\R}{\mathbb{R}}
@@ -22,9 +23,13 @@
\newcommand{\V}{\mathbf{V}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\I}{\mathbf{I}}
+\newcommand{\U}{\mathbf{u}}
+\newcommand{\Q}{\mathbf{q}}
+\newcommand{\PP}{\mathbf{P}}
\DeclareMathOperator{\alr}{alr}
\DeclareMathOperator{\clr}{clr}
+
\usepackage[natbib=true,url=false,style=verbose-ibid]{biblatex}
\addbibresource{slides.bib}
\AtBeginBibliography{\small}
@@ -84,9 +89,19 @@
\definecolor{cb3}{HTML}{7570b3}
\author{Justin Bed\H{o}}
-\title{Representation learning of compositional counts: exploration of deep mutational scanning data}
+\title{Representation learning of compositional counts: an exploration of deep mutational scanning data}
\date{December 13, 2022}
+% Abstract:
+
+% Deep mutational scanning data provides important functional information on the effects of protein variants. Many different aspects of proteins can be assayed, many different experimental designs are possible, and many different scores are computed, leading to very heterogeneous data that is difficult to integrate.
+
+% In this talk I will explore a representation learning approach on raw count data. This technique uses recent methods combining compositional data analysis with a generalised form of principal component analysis to infer protein representations without specific knowledge of the experimental design or assay type.
+
+% Bio
+
+% Dr Justin Bedő is the Stafford Fox Centenary Fellow in Bioinformatics and Computational Biology at the Walter and Eliza Hall Institute. He studied computer science followed by a PhD in machine learning at the Australian National University and was awarded his doctorate in 2009. He subsequently worked as a researcher across both academia and industry at NICTA, IBISC (Informatique, BioInformatique, Systèmes Complexes) CNRS, and IBM Research on machine learning methods development and applications to biology before joining the WEHI in 2016.
+
\begin{document}
\maketitle
@@ -105,6 +120,7 @@
\item Growing resource of functional data
\item MaveDB
\footfullcite{Esposito2019}
+ \unskip
\footnote{\url{https://www.mavedb.org}} catalogs a number of datasets and provides easy access
\end{enumerate}
\end{frame}
@@ -153,24 +169,42 @@
\begin{enumerate}
\item Scores calculated in a variety of ways, e.g., Rubin et al.
\footfullcite{Rubin2017}:
- \[L_{v,t}=\log\left(\frac{(c_{v,t}+\frac12)(c_{wt,0}+\frac12)}{(c_{v,0}+\frac12)(c_{wt,t}+\frac12)}\right) \]
+ \[L_{v,t}=\log\left(\frac{(c_{v,t}+\frac12)(c_{wt,0}+\frac12)}{(c_{v,0}+\frac12)(c_{wt,t}+\frac12)}\right) \]
+ \item Assays can measure different properties
+ \item Numerous different experimental designs
\end{enumerate}
\end{frame}
- \begin{frame}{Basics}
- \begin{definition}[Compositional data] Data \(X \in \R^{n \times d}\) is compositional if rows \(\bx_i\) are in the simplex
- \[S^d=\{\,\bx \in \R^d : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\} \]
- for constant \(\kappa > 0\).
- \end{definition} Information is therefore given only by the ratios of components and any composition can be normalised to the standard simplex where \(\kappa = 1\) (c.f., dividing by library size).
+ \begin{frame}{Compositional simplex}
+ \begin{columns}[T]
+ \begin{column}{.63
+ \textwidth}
+ \begin{definition}[Compositional data] Data \(X \in \R^{n \times d}\) is compositional if rows \(\bx_i\) are in the simplex
+ \[S^d=\{\,\bx \in \R^d : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\} \]
+ for constant \(\kappa > 0\).
+ \end{definition}
+ \end{column}
+ \hfill
+ \begin{column}{.26
+ \textwidth}
+ \begin{tikzpicture}[scale=0.5]
+ \begin{ternaryaxis}
+ \addplot3 coordinates{(0.25,0.5,0.25)};
+ \path (0.25,0.5,0.25) coordinate (M) (1,0,0) coordinate (C) (0,1,0) coordinate (A) (0,0,1) coordinate (B);
+ \end{ternaryaxis}
+ \end{tikzpicture}
+ \end{column}
+ \end{columns}
+ \vspace{10pt} \(\Rightarrow\) Information is given only by the ratios of components and any composition can be normalised to the standard simplex where \(\kappa = 1\) (cf.\ dividing by library size).
\end{frame}
\begin{frame}{Isomorphisms to Euclidean vector spaces} The simplex forms a \(d-1\) dimensional Euclidean vector space
\footfullcite{Aitchison1982}:
\begin{definition}[\ac{alr}]
- \[\alr(\bx)_i = \log \frac{x_i}{x_0} \]
+ \[\alr_i(\bx) = \log \frac{x_i}{x_0} \]
\end{definition}
\begin{definition}[\ac{clr}]
- \[\clr(\bx)_i = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}} \]
+ \[\clr_i(\bx) = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}} \]
\end{definition}
\end{frame}
@@ -190,6 +224,7 @@
\begin{itemize}
\item Zeros:
\begin{enumerate}
+ \item $\log(0)$ undefined
\item geometric mean is \(0\) \(\Rightarrow\)
\ac{clr} is undefined
\item
@@ -198,7 +233,7 @@
\item Interpretation:
\begin{enumerate}
\item
- \ac{alr} is not isometry
+ \ac{alr} is not an isometry
\item
\ac{clr} is degenerate
\end{enumerate}
@@ -208,7 +243,7 @@
\begin{frame}{Traditional
\ac{pca}} Given \(\X\in \R^{n\times d}\) minimise loss
- \[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}} \]
+ \[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}}\]
s.t.
\(\V \in \R^{n \times k}\), \(\A \in \R^{k \times d}\), and \(\V^\intercal \V = \I\).
@@ -219,36 +254,35 @@
\begin{frame}{Exponential family
\ac{pca}}
- \begin{definition}{Bregman Divergence} Let \(\varphi \colon \R^d \to \R\) be a smooth ($C^1$) convex function on convex set \(\Omega\).
+ \begin{definition}[Bregman Divergence] Let \(\varphi \colon \R^d \to \R\) be a smooth ($C^1$) convex function on convex set \(\Omega\).
The Bregman divergence \(D_\varphi\) with generator \(\varphi\) is
- \[ D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle. \]
+ \[ D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle. \]
\end{definition}
-
Denote the convex conjugate of \(\varphi\) as \(\varphi^*(\bu) \triangleq \sup_\bv\left\{\langle \bu,\bv\rangle-\varphi(\bv)\right\}\).
The exponential family
\ac{pca} is then given by minimising loss
- \[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right) \]
+ \[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right) \]
under the same constraints as previously, approximating \(\X \sim \nabla\varphi^*\left(\V\A\right)\).
\end{frame}
\begin{frame}{Aitchison's simplex and exponential
\ac{pca}} Aitchison's log-transformation is a dual affine coordinate space made explicit with
- \[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z, \]
+ \[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z,\]
but what about normalisation?
Consider
\ac{alr}:
- \[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}} \]
+ \[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}} \]
\end{frame}
\begin{frame}{Scaled Bregman}
- \begin{theorem}{Scaled Bregman
- \footfullcite{nock2016scaled}} Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable.
+ \begin{theorem}[Scaled Bregman
+ \footfullcite{nock2016scaled}] Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable.
Then
- \[g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) = D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right) \]
+ \[g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) = D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right) \]
where
- \[\breve{\varphi} \triangleq g(\bx) \cdot \varphi\left(\frac{x}{g(\bx)}\right) \]
+ \[\breve{\varphi}(\bx) \triangleq g(\bx) \cdot \varphi\left(\frac{\bx}{g(\bx)}\right)\]
\end{theorem}
Avalos et al.
@@ -257,6 +291,11 @@
\ac{clr} recently.
\end{frame}
+ \begin{frame}{Medians instead of means}
+ Zeros are still a problem, as the geometric mean is $0$. Instead, use the median as the gauge
+ function.
+ \end{frame}
+
\begin{frame}{Activation-Induced Deaminase
\footfullcite{Gajula2014}}
\begin{tikzpicture}[remember picture,overlay]
@@ -316,4 +355,18 @@
\end{tikzpicture}
\end{frame}
+ \begin{frame}{\textsc{Brca1}: Positional effects}
+ \begin{columns}[T]
+ \begin{column}{.4\textwidth}
+ \[\V\A+\U^\intercal\Q\PP\]
+ where $\U \in \R^n$, $\Q \in \R^l$, $\PP \in \mathbb{2}^{l\times d}$
+ \end{column}\hfill
+ \begin{column}{.58\textwidth}
+ \begin{tikzpicture}
+ \node[scale=.45]{\input{position.tikz}};
+ \end{tikzpicture}
+ \end{column}
+ \end{columns}
+ \end{frame}
+
\end{document}