% Official example file for the beamer template from tubslatex, version 0.3beta2
\documentclass[fleqn,11pt,aspectratio=43]{beamer}

\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{svg}
\usepackage[
  backend=biber,
  %style=alphabetic,
  %sorting=ynt
]{biblatex}
\addbibresource{../Presentation/references.bib}

\usetheme[%
  %nexus,% use Nexus fonts
  %lnum,% use lining figures
  %cmyk,% select the color model
  blue,% select the secondary color scheme
  dark,% select the brightness
  %colorhead,% colored header bar
  %colorfoot,% colored footer bar on the title page
  colorblocks,% colored block backgrounds
  %nopagenum,% no page number in the footer
  %nodate,% no date in the footer
  tocinheader,% table of contents in the header
  %tinytocinheader,% small table of contents in the header
  %widetoc,% wide table of contents in the header
  %narrowtoc,% narrow table of contents in the header
  %nosubsectionsinheader,% no subsections in the header table of contents
  %nologoinfoot,% no logo in the footer area
]{tubs}

% Title page
\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
%\subtitle{The corporate design in \LaTeX}
\author{Leonard Kugis}

% Title graphic, automatically cropped. Further options:
%\titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}

% Logo, shown at the top right of title pages and at the bottom right of
% content pages. It is scaled automatically in both cases.
%\logo{\includegraphics{dummy_institut.pdf}}
%\logo{Institut für Unkreativität\\und Schreibschwäche}

\begin{document}

\nocite{*}
\renewcommand*{\bibfont}{\scriptsize}

\begin{frame}[plain]
  \titlepage
\end{frame}

\begin{frame}{Table of contents}
  \tableofcontents
\end{frame}

\section{Deep Neural Networks}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Deep Neural Networks}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn}
    \caption{Deep Neural Network \cite{726791}}
  \end{figure}
\end{frame}

\begin{frame}{Deep Neural Networks}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
    \caption{Fully connected layer}
  \end{figure}
\end{frame}

\begin{frame}{Deep Neural Networks}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
    \caption{Fully connected layer}
  \end{figure}
  \begin{itemize}
    \item $b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j)$
    \item Multiply-accumulate (MAC) operations
    \item Matrix $W$ can be sparse
  \end{itemize}
\end{frame}
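% Illustrative code sketch (added for clarity; not part of the original paper)
\begin{frame}[fragile]{Deep Neural Networks}
  A minimal NumPy sketch of the per-neuron MAC loop above (illustrative
  only; the names and shapes are assumptions, not code from the paper):
\begin{verbatim}
import numpy as np

def fc_layer(W, a, f=lambda x: np.maximum(x, 0)):
    """b_i = f(sum_j W_ij * a_j), f = ReLU by default."""
    b = np.zeros(W.shape[0])
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            b[i] += W[i, j] * a[j]  # one MAC operation
    return f(b)

W = np.array([[0.0, 1.5],
              [2.0, 0.0]])  # sparse: half the entries are zero
print(fc_layer(W, np.array([1.0, -2.0])))  # -> [0. 2.]
\end{verbatim}
\end{frame}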
\section{Motivation}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Inference metrics}
  \begin{itemize}
    \item Throughput \\
          \textcolor{gray}{Amount of data processed per unit of time}
    \item Latency \\
          \textcolor{gray}{Amount of time it takes to process a single workload}
    \item Model size \\
          \textcolor{gray}{Amount of storage needed for the model (e.g. weights)}
    \item Energy use \\
          \textcolor{gray}{Energy consumed to process a specific amount of data}
  \end{itemize}
\end{frame}

\begin{frame}{Memory access}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency}
    \caption{Memory hierarchy and energy cost of its levels \cite{DBLP:journals/corr/SzeCESZ16}}
  \end{figure}
\end{frame}

\begin{frame}{Memory access}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.6\textwidth, keepaspectratio]{resources/dnn_dataflows_png}
    \caption{Common dataflow models in inference architectures \cite{DBLP:journals/corr/SzeCESZ16}}
  \end{figure}
\end{frame}

\begin{frame}{Memory access}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.6\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png}
    \caption{Common dataflow models in inference architectures (based on \cite{DBLP:journals/corr/SzeCESZ16})}
  \end{figure}
\end{frame}

\section{Compression}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Compressible data}
  \begin{itemize}
    \item Dynamic
      \begin{itemize}
        \item Input data
      \end{itemize}
    \item Static (parameters)
      \begin{itemize}
        \item Weights
        \item Parameters of activation functions
      \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}{AlexNet}
  \begin{itemize}
    \item 5 convolutional layers
    \item 3 fully connected layers
    \item $\sim 62$ million parameters
    \item $\sim 240$ MB in 32-bit float representation
  \end{itemize}
\end{frame}

\begin{frame}{Basis projection}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png}
    \caption{Basis projection and resulting weight distribution \cite{DBLP:journals/corr/SuleimanZS16}}
  \end{figure}
\end{frame}

\begin{frame}{Pruning}
  \begin{itemize}
    \item Idea: remove unimportant weights with low impact on accuracy
  \end{itemize}
  \begin{figure}[h]
    \centering
    \vspace{0.5cm}
    \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning}
    \caption{3-step pruning working principle (based on \cite{NIPS2015_ae0eb3ee})}
  \end{figure}
\end{frame}

\begin{frame}{Pruning}
  \begin{minipage}{0.24\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.74\linewidth}
    \begin{itemize}
      \item Magnitude-threshold-based pruning
        \begin{itemize}
          \item Remove a weight if its magnitude is below a specific threshold
        \end{itemize}
      \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
        \begin{itemize}
          \item Remove weights based on the sensitivity of the objective function to them
          \item Consider first-order $(a)$ and second-order $(b)$ derivatives to measure sensitivity
          \item Remove the least sensitive weights first
        \end{itemize}
      \item Biased weight decay
        \begin{itemize}
          \item Pruning at weight-update time
          \item Adjust the weight-update term so that large weights persist and small weights decay to zero
        \end{itemize}
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Weight quantization}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering}
    \caption{Weight quantization \cite{Han2015DeepCC}}
  \end{figure}
  \begin{itemize}
    \item Group similar weights into clusters
    \item Fine-tune the cluster centroids with the grouped gradients during the update
  \end{itemize}
\end{frame}

\begin{frame}{Weight quantization}
  \begin{itemize}
    \item Minimize the Within-Cluster Sum of Squares (WCSS):
          $\text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2$
    \item Perform k-means clustering:
      \begin{enumerate}
        \item Initialize $k$ cluster centroids
        \item Assign each weight to the cluster with the nearest centroid
        \item Recalculate the cluster centroids
        \item Repeat steps 2--3 until the centroids converge
      \end{enumerate}
  \end{itemize}
\end{frame}
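% Illustrative code sketch (added for clarity; not from the paper)
\begin{frame}[fragile]{Weight quantization}
  A minimal sketch of k-means weight sharing, assuming linear centroid
  initialization (illustrative only; not the paper's implementation):
\begin{verbatim}
import numpy as np

def quantize(weights, k=4, iters=20):
    w = weights.ravel()
    c = np.linspace(w.min(), w.max(), k)  # linear initialization
    for _ in range(iters):
        # assign each weight to the nearest centroid
        idx = np.argmin(np.abs(w[:, None] - c[None, :]), axis=1)
        for i in range(k):                # recalculate centroids
            if np.any(idx == i):
                c[i] = w[idx == i].mean()
    # store only the log2(k)-bit indices plus the k centroids
    return c[idx].reshape(weights.shape), idx, c

Wq, idx, c = quantize(np.random.randn(4, 4))
\end{verbatim}
\end{frame}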
\begin{frame}{Weight quantization}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization}
    \caption{Different centroid initialization methods \cite{Han2015DeepCC}}
  \end{figure}
\end{frame}

\begin{frame}{Huffman encoding}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman}
    \caption{Huffman encoding example}
  \end{figure}
\end{frame}

\begin{frame}{HashNets}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets}
    \caption{HashNets encoding (based on \cite{10.5555/3045118.3045361})}
  \end{figure}
\end{frame}

\begin{frame}{HashNets}
  \begin{minipage}{0.49\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.49\linewidth}
    \begin{itemize}
      \item Virtual weight matrix $\textbf{V}^{\ell}$
      \item One-way hash function $h^{\ell}(i, j)$
      \item Weight array $w^{\ell}$
      \item The hash function returns an index into the weight array
      \item $\textbf{V}^{\ell}_{ij} = w^{\ell}_{h^{\ell}(i, j)}$
    \end{itemize}
  \end{minipage}
\end{frame}
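% Illustrative code sketch (added for clarity; not from the paper)
\begin{frame}[fragile]{HashNets}
  A minimal sketch of the HashNets lookup, using CRC32 as a stand-in
  one-way hash (the array size K and the weight values are assumptions):
\begin{verbatim}
import zlib

K = 8                              # number of real weights w^l
w = [0.1 * k for k in range(K)]    # shared weight array

def h(i, j):
    """One-way hash h^l(i, j) -> index into w."""
    return zlib.crc32(f"{i},{j}".encode()) % K

def V(i, j):
    """Virtual weight V^l_ij = w^l_{h^l(i, j)}."""
    return w[h(i, j)]

# the virtual matrix can be arbitrarily large,
# but only K real weights are ever stored
print(V(2, 3), V(100, 7))
\end{verbatim}
\end{frame}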
\begin{frame}{Storage format}
  \begin{itemize}
    \item Compressed sparse column (CSC) / compressed sparse row (CSR) representation
    \item Encode each column $W_j$ as vectors $v$ and $z$
      \begin{itemize}
        \item $v$: non-zero weights
        \item $z$: number of zeros before the corresponding element in $v$
        \item $z$ is stored with 4 bits: after a run of 15 zeros, a padding zero entry is inserted into $v$
      \end{itemize}
  \end{itemize}
  \begin{itemize}
    \item E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$
    \item The $v$'s and $z$'s of all columns are stored in a single pair of arrays
    \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$
  \end{itemize}
\end{frame}

\section{EIE implementation}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{EIE implementation}
  \begin{itemize}
    \item Optimizes the per-activation formula:
    \begin{align}
      b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j)
    \end{align}
    \item $X_i$: set of columns $j$ with $W_{ij} \neq 0$
    \item $Y$: set of indices $j$ for which $a_j \neq 0$
    \item $I_{ij}$: 4-bit index into the shared lookup table
    \item $S$: shared lookup table of weight values
  \end{itemize}
\end{frame}

\begin{frame}{Weight matrix segmentation}
  \begin{figure}[h]
    \centering
    \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix}\\
    \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout}
    \caption{Weight matrix segmentation and memory layout \cite{10.1109/ISCA.2016.30}}
  \end{figure}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw}
    \caption{Hardware architecture \cite{10.1109/ISCA.2016.30}}
  \end{figure}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero}
      \caption{Non-zero detection node (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Filters out zero elements of the input vector $a$
      \item Broadcasts the non-zero elements $a_j$ and their indices $j$ to all PEs
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer}
      \caption{Pointer read unit (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Pointers to the columns $W_j$ are stored alternately in separate SRAM banks
      \item Start and end pointers are read out simultaneously
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix}
      \caption{Sparse matrix read unit (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Reads the weight values $v$ and zero counts $z$ for the current operation, based on the pointers
      \item SRAM word length: 64 bit; entries of $v$ and $z$ are 4 bit each \\
            $\rightarrow$ 8 $(v, z)$ entries per word
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu}
      \caption{Arithmetic unit (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Receives the weight value $v$, the destination accumulator register index $x$ and the activation value $a_j$
      \item Calculates $b_x = b_x + v \cdot a_j$
      \item Accumulates the relative indices to obtain the absolute target address (see the sketch on the next slide)
    \end{itemize}
  \end{minipage}
\end{frame}
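% Illustrative code sketch (added for clarity; not RTL from the paper)
\begin{frame}[fragile]{Hardware implementation}
  A software analogue of one PE processing a CSC column for a non-zero
  activation $a_j$ (illustrative sketch; the names are assumptions):
\begin{verbatim}
def process_column(v, z, p, j, a_j, b):
    x = 0                            # address accumulator
    for k in range(p[j], p[j + 1]):  # pointer read: column bounds
        x += z[k]                    # skip z[k] zeros
        b[x] += v[k] * a_j           # arithmetic: b_x += v * a_j
        x += 1
    return b

v = [1, 2, 0, 3]; z = [2, 0, 15, 2]; p = [0, 4]  # column from before
b = [0.0] * 23
process_column(v, z, p, 0, 2.0, b)  # b[2]=2.0, b[3]=4.0, b[22]=6.0
\end{verbatim}
\end{frame}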
\begin{frame}{Hardware implementation}
  \begin{minipage}{0.29\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw}
      \caption{Read/write unit (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.69\linewidth}
    \begin{itemize}
      \item Maintains the source ($a$) and destination ($b$) activation values
      \item Feed-forward network: the destination values of one layer are the source values of the next layer
      \item The register banks exchange roles at each layer
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.24\linewidth}
    \begin{figure}[h]
      \centering
      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd}
      \caption{ReLU \& leading non-zero detection unit (based on \cite{10.1109/ISCA.2016.30})}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.74\linewidth}
    \begin{itemize}
      \item Applies the ReLU function to the destination values
      \item Detects the first PE-local non-zero destination value
      \item Sends it to the group-level leading non-zero detection unit, which distributes it to the PEs for the next cycle
    \end{itemize}
  \end{minipage}
\end{frame}

\section{Evaluation}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Speed and energy}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\
    \vspace{0.5cm}
    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
    \caption{Speedup and energy efficiency comparison \cite{10.1109/ISCA.2016.30}}
  \end{figure}
  \begin{itemize}
    \item Throughput: 102 GOP/s on the compressed network, corresponding to 3 TOP/s on the uncompressed equivalent
  \end{itemize}
\end{frame}

\begin{frame}{Accelerator comparison}
  \begin{figure}[h]
    \centering
    \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table}
    \caption{EIE compared with different DNN hardware accelerators \cite{10.1109/ISCA.2016.30}}
  \end{figure}
\end{frame}

\section{Future work}

\begin{frame}{Table of contents}
  \tableofcontents[currentsection]
\end{frame}

\begin{frame}{Inference accelerators}
  \begin{itemize}
    \item Exploitation of further compression methods
      \begin{itemize}
        \item Huffman encoding (35--49x compression)
        \item HashNets (variable compression, up to 64x)
      \end{itemize}
    \item Combination with other optimization methods
      \begin{itemize}
        \item In-memory computation
        \item Approximate circuits
      \end{itemize}
    \item Optimization of the hardware itself
      \begin{itemize}
        \item Different storage technologies (e.g. memristors)
      \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}[allowframebreaks]{References}
  \printbibliography
\end{frame}

\begin{frame}[highlight]{End}
  End
\end{frame}

\end{document}