% Slides: "EIE: Efficient Inference Engine on Compressed Deep Neural Network".
% Based on the official example file for the beamer template shipped with
% tubslatex version 0.3beta2.
\documentclass[fleqn,11pt,aspectratio=43]{beamer}

\usepackage[english]{babel}
% Standard UTF-8 input encoding; the former utf8x option relies on the
% obsolete ucs package.
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{svg}

% Options of the tubs theme. Lines starting with % are disabled options.
\usetheme[%
  %nexus,%                 use the Nexus fonts
  %lnum,%                  use lining figures
  %cmyk,%                  selection of the colour model
  blue,%                   selection of the secondary colour scheme
  dark,%                   selection of the brightness
  %colorhead,%             coloured header bar
  %colorfoot,%             coloured footer bar on the title page
  colorblocks,%            coloured block backgrounds
  %nopagenum,%             no page number in the footer
  %nodate,%                no date in the footer
  tocinheader,%            table of contents in the header bar
  %tinytocinheader,%       small table of contents in the header bar
  %widetoc,%               wide table of contents in the header bar
  %narrowtoc,%             narrow table of contents in the header bar
  %nosubsectionsinheader,% no subsections in the header table of contents
  %nologoinfoot,%          do not show a logo in the footer
]{tubs}

% Upright math operators with operator spacing. \DeclareMathOperator is an
% amsmath command; amsmath is already in use here (\text, align, \overset).
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator{\relu}{ReLU}

% Every \section is followed by a table-of-contents frame that highlights the
% section just started (replaces one hand-written copy per section).
\AtBeginSection[]{%
  \begin{frame}{Table of contents}
    \tableofcontents[currentsection]
  \end{frame}%
}

% Title page
\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
%\subtitle{Das Corporate Design in \LaTeX}
\author{Leonard Kugis}

% Title graphic, cropped automatically; further options are available:
% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}

% Logo shown at the top right of title pages and at the bottom right of
% content pages. It is scaled automatically in both places.
%\logo{\includegraphics{dummy_institut.pdf}}
%\logo{Institut für Unkreativität\\und Schreibschwäche}

\begin{document}

\begin{frame}[plain]
  \titlepage
\end{frame}

\begin{frame}{Table of contents}
  \tableofcontents
\end{frame}

\section{Deep Neural Networks}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/cnn}
    \caption{Deep Neural Network}
  \end{figure}
\end{frame}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/fcn}
    \caption{Fully connected layer}
  \end{figure}
\end{frame}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/fcn}
    \caption{Fully connected layer}
  \end{figure}
  \begin{itemize}
    % Upper limit n-1 (index starts at 0), consistent with the formula on the
    % "EIE implementation" slide.
    \item $b_i = f\Bigl(\sum\limits_{j=0}^{n-1} W_{ij} a_j\Bigr)$
    \item Multiply-Accumulate (MAC) operations
    \item Matrix $W$ can be sparse
  \end{itemize}
\end{frame}

\section{Motivation}

\begin{frame}{Inference metrics}
  \begin{itemize}
    \item Throughput \\
      \textcolor{gray}{Amount of data processed during one unit of time}
    \item Latency \\
      \textcolor{gray}{Amount of time it takes to process a single workload}
    \item Model size \\
      \textcolor{gray}{Storage amount to store the model (e.g.\ weights)}
    \item Energy use \\
      \textcolor{gray}{Energy consumption for processing a specific amount of data}
  \end{itemize}
\end{frame}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=0.65\textwidth]{resources/dnn_dataflows_png}
    \caption{Common dataflow models in inference architectures}
  \end{figure}
\end{frame}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=0.65\textwidth]{resources/dnn_dataflows_access_png}
    \caption{Common dataflow models in inference architectures}
  \end{figure}
\end{frame}

\begin{frame}
  \begin{figure}
    \centering
    \includegraphics[width=0.9\textwidth]{resources/memory_latency_png}
    \caption{Memory hierarchy and energy cost of hierarchy levels}
  \end{figure}
\end{frame}

\section{Compression}

\begin{frame}
  \begin{itemize}
    \item Dynamic
      \begin{itemize}
        \item Input data
      \end{itemize}
    \item Static (parameters)
      \begin{itemize}
        \item Weights
        \item Parameters of activation functions
      \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}{AlexNet}
  \begin{itemize}
    \item 5 convolutional layers
    \item 3 fully connected layers
    \item $\sim 62$ million parameters
    \item $\sim 240$ MB with 32-bit float representation
  \end{itemize}
\end{frame}

\begin{frame}{Basis projection}
  \begin{figure}
    \centering
    \includegraphics[width=0.75\textwidth]{resources/basis_projection_png}
    \caption{Basis projection and resulting weight distribution}
  \end{figure}
\end{frame}

\begin{frame}{Pruning}
  \begin{itemize}
    \item Idea: Remove unimportant weights with low impact on accuracy
  \end{itemize}
  \begin{figure}
    \centering
    \vspace{0.5cm}
    \includegraphics[width=0.3\textwidth]{resources/pruning}
    \caption{3-step pruning working principle}
  \end{figure}
\end{frame}

\begin{frame}{Pruning}
  \begin{minipage}{0.24\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/pruning}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.74\linewidth}
    \begin{itemize}
      \item Magnitude threshold based pruning
        \begin{itemize}
          \item Remove a weight if its value is below a specific threshold
        \end{itemize}
      \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
        \begin{itemize}
          \item Removes weights based on the sensitivity of the objective function
          \item Considers first $(a)$ and second order derivatives $(b)$ to measure sensitivity
          \item Remove least sensitive weights first
        \end{itemize}
      \item Biased weight decay
        \begin{itemize}
          \item Update-Time pruning
          \item Adjust weight update term so that large weights persist and small weights converge to zero
        \end{itemize}
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Weight quantization}
  \begin{figure}
    \centering
    \includegraphics[width=0.5\textwidth]{resources/clustering}
    \caption{Weight quantization}
  \end{figure}
  \begin{itemize}
    \item Group similar weights into clusters
    \item Fine tune clusters with gradient matrix during update
  \end{itemize}
\end{frame}

\begin{frame}{Weight quantization}
  \begin{itemize}
    \item Minimize Within-Cluster Sum of Squares (WCSS):
      $\argmin_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} \lvert \omega - c_i \rvert^2$
    \item Perform k-means clustering:
      \begin{enumerate}
        \item Initialize $k$ cluster centroids
        \item Assign each data point to the cluster with the nearest centroid
        \item Recalculate cluster centroids
      \end{enumerate}
  \end{itemize}
\end{frame}

\begin{frame}{Weight quantization}
  \begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth]{resources/centroid_initialization}
    \caption{Different centroid initialization methods}
  \end{figure}
\end{frame}

\begin{frame}{Huffman encoding}
  \begin{figure}
    \centering
    \includegraphics[width=0.7\textwidth]{resources/huffman}
    \caption{Huffman encoding example}
  \end{figure}
\end{frame}

\begin{frame}{HashNets}
  \begin{figure}
    \centering
    \includegraphics[width=0.7\textwidth]{resources/hashnets}
    \caption{HashNets encoding}
  \end{figure}
\end{frame}

\begin{frame}{HashNets}
  \begin{minipage}{0.49\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/hashnets}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.49\linewidth}
    \begin{itemize}
      \item Virtual weight matrix $\textbf{V}^{\ell}$
      \item One-way hash function $h^{\ell}(i, j)$
      \item Weight array $w^{\ell}$
      \item Hash function returns index for weight array
      \item $w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}$
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Storage format}
  \begin{itemize}
    \item Compressed sparse column (CSC) / Compressed sparse row (CSR) representation
    \item Encode each column $W_j$ as vectors $v$ and $z$
      \begin{itemize}
        \item $v$: Non-zero weights
        \item $z$: Number of zeros before corresponding element in $v$
      \end{itemize}
  \end{itemize}
  \begin{itemize}
    % 18 zeros separate the 2 and the 3; the example encodes them as a run of
    % 15, one stored zero in v, and a run of 2.
    \item E.g.\ column
      $[0, 0, 1, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        3]$
      becomes $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$
    \item $v$'s and $z$'s for all columns are stored in a single pair of arrays
    \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$
  \end{itemize}
\end{frame}

\section{EIE implementation}

\begin{frame}{EIE implementation}
  \begin{itemize}
    \item Optimizes per-activation formula:
      \begin{align}
        b_i = \relu\biggl(\sum\limits_{j=0}^{n-1} W_{ij} a_j\biggr)
          \overset{!}{=}
          \relu\biggl(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j\biggr)
      \end{align}
    \item $X_i$: Set of columns with $W_{ij} \neq 0$
    \item $Y$: Set of indices in $a$ for which $a_j \neq 0$
    \item $I_{ij}$: 4-bit index
    \item $S$: Shared lookup table
  \end{itemize}
\end{frame}

\begin{frame}{Weight matrix segmentation}
  \begin{figure}
    \centering
    % NOTE(review): the two widths add up to more than \textwidth, so the
    % images cannot share a line -- presumably they are meant to stack;
    % confirm against the rendered slide.
    \includegraphics[width=0.52\textwidth]{resources/eie_matrix}
    \includegraphics[width=0.52\textwidth]{resources/eie_layout}
    \caption{Weight matrix segmentation and memory layout}
  \end{figure}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/eie_hw}
    \caption{Hardware architecture}
  \end{figure}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_zero}
      \caption{Non-Zero detection node}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Filter zero elements in input vector $a$
      \item Broadcast non-zero elements $a_j$ and corresponding indices $j$ to all PEs
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_pointer}
      \caption{Pointer read unit}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Pointers to columns $W_j$ are stored in separate SRAM banks (start/end)
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_matrix}
      \caption{Sparse matrix read unit}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Reads weight values $v$ and zeroes $z$ for current operation based on pointers
      \item SRAM word length: 64 bit, entries of $v$ and $z$ are 4 bit each \\
        $\rightarrow$ 8 entries per word
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.39\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_alu}
      \caption{Arithmetic unit}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.59\linewidth}
    \begin{itemize}
      \item Receives column vector $v$, destination accumulator register index $x$ and activation value $a_j$
      \item Calculates $b_x = b_x + v \cdot a_j$
      \item Accumulates indices $x$ and forwards real target address
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.29\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_rw}
      \caption{Read/Write unit}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.69\linewidth}
    \begin{itemize}
      \item Maintains source ($a$) and destination ($b$) activation values
      \item Feed-Forward-Network: destination values of one layer are activation values of next layer
      \item Register banks exchange roles on each layer
    \end{itemize}
  \end{minipage}
\end{frame}

\begin{frame}{Hardware implementation}
  \begin{minipage}{0.24\linewidth}
    \begin{figure}
      \centering
      \includegraphics[width=\textwidth]{resources/eie_hw_lnzd}
      \caption{ReLU \& Leading non-zero detection unit}
    \end{figure}
  \end{minipage}
  \hfill
  \begin{minipage}{0.74\linewidth}
    \begin{itemize}
      \item Performs ReLU function on destination values
      \item Detects first PE-local non-zero destination value
      \item Sends it to group Leading non-zero detection unit to distribute it to PEs for next cycle
    \end{itemize}
  \end{minipage}
\end{frame}

\section{Evaluation}

\begin{frame}{Speed and energy}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/eval_speed_png}\\
    \vspace{0.5cm}
    \includegraphics[width=\textwidth]{resources/eval_energy_png}
    \caption{Speedup and energy efficiency comparison}
  \end{figure}
  \begin{itemize}
    \item Throughput: 102 GOP/s compressed $\rightarrow$ 3 TOP/s uncompressed
  \end{itemize}
\end{frame}

\begin{frame}{Accelerator comparison}
  \begin{figure}
    \centering
    \includegraphics[width=\textwidth]{resources/accelerators_table}
    \caption{EIE compared with different DNN hardware accelerators}
  \end{figure}
\end{frame}

\section{Future work}

\begin{frame}{Inference accelerators}
  \begin{itemize}
    \item Exploitation of different compression methods
      \begin{itemize}
        \item Huffman encoding (35--49$\times$ compression)
        \item HashNets (variable compression up to 64$\times$)
      \end{itemize}
    \item Combination with other optimization methods
      \begin{itemize}
        \item In-Memory calculation
        \item Approximating circuits
      \end{itemize}
    \item Optimize hardware itself
      \begin{itemize}
        \item Different storage technologies (e.g.\ ReRAM)
      \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}[highlight]
  Ende
\end{frame}

\end{document}