\author{Justin Bed\H{o}}
\title{Exploration of deep mutational scanning data with unsupervised methods}
\date{December 13, 2022}

\section{Deep mutational scanning data}

\begin{frame}{\dms data} \end{frame}

\section{Compositional data}

\begin{frame}{Basics}

\begin{definition}[Compositional data]
Data \(\X \in \R^{n \times d}\) is compositional if its rows \(\bx_i\) are in the simplex
\[S^d=\{\,\bx \in \R^d : \forall j,x_j > 0 ; \sum_{j=1}^d x_j = \kappa\,\}
\]
for constant \(\kappa > 0\).
\end{definition}

Information is therefore given only by the ratios of components, and any composition can be normalised to the standard simplex where \(\kappa = 1\) (cf.\ dividing by library size).

\end{frame}

\begin{frame}{Isomorphisms to Euclidean vector spaces}

The simplex forms a \((d-1)\)-dimensional Euclidean vector space\footfullcite{Aitchison1982}:

\begin{definition}[\ac{alr}]
\[\alr(\bx)_i = \log \frac{x_i}{x_0}
\]
\end{definition}

\begin{definition}[\ac{clr}]
\[\clr(\bx)_i = \log \frac{x_i}{\left(\prod_{j=1}^d x_j\right)^{\frac 1 d}}
\]
\end{definition}

\end{frame}

\begin{frame}{\textsc{Pca} on \ac{dms} data}

\begin{block}{Transformation approach}
\begin{enumerate}
\item Map \dms data to Euclidean space via \ac{alr} / \ac{clr}
\item Apply standard \ac{pca}
\end{enumerate}
\end{block}

\begin{block}{Problems}
\begin{itemize}
\item Zeros:
\begin{enumerate}
\item geometric mean is \(0\) \(\Rightarrow\) \ac{clr} is undefined
\item \ac{alr} is undefined for unobserved components in the ref.
\end{enumerate}
\item Interpretation:
\begin{enumerate}
\item \ac{alr} is not an isometry
\item \ac{clr} is degenerate
\end{enumerate}
\end{itemize}
\end{block}

\end{frame}

\begin{frame}{Traditional \ac{pca}}

Given \(\X\in \R^{n\times d}\) minimise loss
\[\ell_{\textsc{pca}} \triangleq {\lVert \X - \V\A \rVert}^2_{\textrm{F}}
\]
s.t.\ \(\V \in \R^{n \times k}\), \(\A \in \R^{k \times d}\), and \(\V^\intercal \V = \I\).
This has been generalised to exponential families\footfullcite{collins2001generalization} via Bregman divergences\footfullcite{Amari2016-ua}.
\end{frame}

\begin{frame}{Exponential family \ac{pca}}
\begin{definition}[Bregman divergence]
Let \(\varphi \colon \Omega \to \R\) be a smooth (\(C^1\)) convex function on a convex set \(\Omega \subseteq \R^d\).
The Bregman divergence \(D_\varphi\) with generator \(\varphi\) is
\[D_\varphi\left(\bu\,\Vert\,\bv\right) \triangleq \varphi(\bu)-\varphi(\bv)-\langle \nabla\varphi(\bv),\bu-\bv\rangle.
\]
\end{definition}
Denote the convex conjugate of \(\varphi\) as \(\varphi^*(\bu) \triangleq \sup_\bv\left\{\langle \bu,\bv\rangle-\varphi(\bv)\right\}\).
The exponential family \ac{pca} is then given by minimising the loss
\[\ell_{\varphi} \triangleq D_\varphi\left(\X\,\Vert\,\nabla\varphi^*\left(\V\A\right)\right)
\]
under the same constraints as before, approximating \(\X \sim \nabla\varphi^*\left(\V\A\right)\).
\end{frame}

\begin{frame}{Aitchison's simplex and exponential \ac{pca}}
Aitchison's log-transformation is a dual affine coordinate space made explicit with
\[\varphi(z) = z\log(z) - z \Leftrightarrow \varphi^*(z) = e^z,
\]
but what about normalisation?
Consider \ac{alr}:
\[\alr(\bx) \triangleq x_0 \sum_{i=1}^d\varphi\left(\frac{x_i}{x_0}\right) \Leftrightarrow \alr^*(\bx) = x_0\sum_{i=1}^d e^{\frac{x_i}{x_0}}
\]
\end{frame}

\begin{frame}
\begin{theorem}[Scaled Bregman]
Let \(\varphi \colon \mathcal{X} \to \R\) be convex differentiable and \(g \colon \mathcal{X} \to \R\) be differentiable.\footfullcite{nock2016scaled}
Then
\[g(\bx)\cdot D_\varphi\left(\frac{\bx}{g(\bx)}\,\middle\Vert\,\frac{\by}{g(\by)}\right) = D_{\breve{\varphi}}\left(\bx\,\middle\Vert\,\by\right)
\]
where
\[\breve{\varphi}(\bx) \triangleq g(\bx) \cdot \varphi\left(\frac{\bx}{g(\bx)}\right).
\]
\end{theorem}
Avalos et al.\footfullcite{avalos2018representation} recently considered a relaxed form for \ac{clr}.
\end{frame}

\end{document}