From 036b0c74c8f712e9fbf55ef41b8d2ae13feb2baf Mon Sep 17 00:00:00 2001 From: Leonard Kugis Date: Sat, 7 Jan 2023 14:54:34 +0100 Subject: Finished presentation slides --- Presentation/presentation.tex | 488 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 Presentation/presentation.tex (limited to 'Presentation/presentation.tex') diff --git a/Presentation/presentation.tex b/Presentation/presentation.tex new file mode 100644 index 0000000..18be3aa --- /dev/null +++ b/Presentation/presentation.tex @@ -0,0 +1,488 @@ +% Offizielle Beispieldatei für beamer-Vorlage aus tubslatex Version 0.3beta2 +\documentclass[fleqn,11pt,aspectratio=43]{beamer} + +\usepackage[english]{babel} +\usepackage[utf8]{inputenc} +\usepackage{graphicx} +\usepackage{svg} +\usetheme[% + %nexus,% Nexus Fonts benutzen + %lnum,% Versalziffern verwenden + %cmyk,% Auswahl des Farbmodells + blue,% Auswahl des Sekundärfarbklangs + dark,% Auswahl der Helligkeit + %colorhead,% Farbig hinterlegte Kopfleiste + %colorfoot,% Farbig hinterlegt Fußleiste auf Titelseite + colorblocks,% Blöcke Farbig hinterlegen + %nopagenum,% Keine Seitennummer in Fußzeile + %nodate,% Kein Datum in Fußleiste + tocinheader,% Inhaltsverzeichnis in Kopfleiste + %tinytocinheader,% kleines Kopfleisten-Inhaltsverzeichnis + %widetoc,% breites Kopfleisten-Inhaltsverzeichnis + %narrowtoc,% schmales Kopfleisten-Inhaltsverzeichnis + %nosubsectionsinheader,% Keine subsections im Kopfleisten-Inhaltsverzeichnis + %nologoinfoot,% Kein Logo im Fußbereich darstellen + ]{tubs} + +% Titelseite +\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network} +%\subtitle{Das Corporate Design in \LaTeX} +\author{Leonard Kugis} +% Titelgrafik, automatisch beschnitten, Weitere Optionen: +% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}} +%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}} + +% Logo, das auf Titelseiten oben rechts und auf Inhaltsseiten unten rechts +% dargestellt 
wird. Es wird jeweils automatisch skliert +%\logo{\includegraphics{dummy_institut.pdf}} +%\logo{Institut für Unkreativität\\und Schreibschwäche} + +\begin{document} + +\begin{frame}[plain] +\titlepage +\end{frame} + +\begin{frame}{Table of contents} + \tableofcontents +\end{frame} + +\section{Deep Neural Networks} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn} + \caption{Deep Neural Network} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn} + \caption{Fully connected layer} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn} + \caption{Fully connected layer} + \end{figure} + \begin{itemize} + \item $b_i = f(\sum\limits_{j=0}^{n} W_{ij} a_j)$ + \item Multiply-Accumulate (MAC) operations + \item Matrix $W$ can be sparse + \end{itemize} +\end{frame} + +\section{Motivation} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{Inference metrics} + \begin{itemize} + \item Throughput \\ + \textcolor{gray}{Amount of data processed during one unit of time} + \item Latency \\ + \textcolor{gray}{Amount of time it takes to process a single workload} + \item Model size \\ + \textcolor{gray}{Storage amount to store the model (e.g. 
weights)} + \item Energy use \\ + \textcolor{gray}{Energy consumption for processing a specific amount of data} + \end{itemize} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_png} + \caption{Common dataflow models in inference architectures} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png} + \caption{Common dataflow models in inference architectures} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency_png} + \caption{Memory hierarchy and energy cost of hierarchy levels} + \end{figure} +\end{frame} + +\section{Compression} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame} + \begin{itemize} + \item Dynamic + \begin{itemize} + \item Input data + \end{itemize} + \item Static (parameters) + \begin{itemize} + \item Weights + \item Parameters of activation functions + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}{AlexNet} + \begin{itemize} + \item 5 convolutional layers + \item 3 fully connected layers + \item $\sim 62$ million parameters + \item $\sim 240$ MB with 32-bit float representation + \end{itemize} +\end{frame} + +\begin{frame}{Basis projection} + \begin{figure}[h] + \centering + \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png} + \caption{Basis projection and resulting weight distribution} + \end{figure} +\end{frame} + +\begin{frame}{Pruning} + \begin{itemize} + \item Idea: Remove unimportant weights with low impact on accuracy + \end{itemize} + \begin{figure}[h] + \centering + \vspace{0.5cm} + \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning} + \caption{3-step pruning working principle} + \end{figure} 
+\end{frame}
+
+\begin{frame}{Pruning}
+  \begin{minipage}{0.24\linewidth}
+    \begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.74\linewidth}
+    \begin{itemize}
+      \item Magnitude threshold based pruning
+      \begin{itemize}
+        \item Remove a weight, if value is below specific threshold
+      \end{itemize}
+      \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
+      \begin{itemize}
+        \item Removes weights based on sensitivity on objective function
+        \item Considers first $(a)$ and second order derivatives $(b)$ to measure sensitivity
+        \item Remove least sensitive weights first
+      \end{itemize}
+      \item Biased weight decay
+      \begin{itemize}
+        \item Update-Time pruning
+        \item Adjust weight update term so that large weights persist and small weights converge to zero
+      \end{itemize}
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}{Weight quantization}
+  \begin{figure}[h]
+    \centering
+    \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering}
+    \caption{Weight quantization}
+  \end{figure}
+  \begin{itemize}
+    \item Group similar weights into clusters
+    \item Fine tune clusters with gradient matrix during update
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight quantization}
+  \begin{itemize}
+    \item Minimize Within-Cluster Sum of Squares (WCSS): $\text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2$
+    \item Perform k-means clustering:
+    \begin{enumerate}
+      \item Initialize $k$ cluster centroids
+      \item Assign each data point to the cluster with nearest centroid
+      \item Recalculate cluster centroids
+    \end{enumerate}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight quantization}
+  \begin{figure}[h]
+    \centering
+    \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization}
+    \caption{Different centroid initialization methods}
+  \end{figure}
+\end{frame}
+ 
+\begin{frame}{Huffman encoding} + \begin{figure}[h] + \centering + \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman} + \caption{Huffman encoding example} + \end{figure} +\end{frame} + +\begin{frame}{HashNets} + \begin{figure}[h] + \centering + \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets} + \caption{HashNets encoding} + \end{figure} +\end{frame} + +\begin{frame}{HashNets} + \begin{minipage}{0.49\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.49\linewidth} + \begin{itemize} + \item Virtual weight matrix $\textbf{V}^{\ell}$ + \item One-way hash function $h^{\ell}(i, j)$ + \item Weight array $w^{\ell}$ + \item Hash function returns index for weight array + \item $w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}$ + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Storage format} + \begin{itemize} + \item Compressed sparse column (CSC) / + Compressed sparse row (CSR) representation + \item Encode each column $W_j$ as vectors $v$ and $z$ + \begin{itemize} + \item $v$: Non-zero weights + \item $z$: Number of zeros before corresponding element in $v$ + \end{itemize} + \end{itemize} + \begin{itemize} + \item E.g. 
column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes + $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$ + \item $v$'s and $z$'s for all columns are stored in a single pair of arrays + \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$ + \end{itemize} +\end{frame} + +\section{EIE implementation} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{EIE implementation} + \begin{itemize} + \item Optimizes per-activation formula: + \begin{align} + b_i = \text{ReLU}(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j) + \end{align} + \item $X_i$: Set of columns with $W_{ij} \neq 0$ + \item $Y$: Set of indices in $a$ for which $a_j \neq 0$ + \item $I_{ij}$: 4-bit index + \item $S$: Shared lookup table + \end{itemize} +\end{frame} + +\begin{frame}{Weight matrix segmentation} + \begin{figure}[h] + \centering + \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix} + \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout} + \caption{Weight matrix segmentation and memory layout} + \end{figure} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw} + \caption{Hardware architecture} + \end{figure} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero} + \caption{Non-Zero detection node} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.59\linewidth} + \begin{itemize} + \item Filter zero elements in input vector $a$ + \item Broadcast non-zero elements $a_j$ and corresponding indices $j$ to all PEs + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + 
\begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer}
+      \caption{Pointer read unit}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.59\linewidth}
+    \begin{itemize}
+      \item Pointers to columns $W_j$ are stored in separate SRAM banks (start/end)
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+  \begin{minipage}{0.39\linewidth}
+    \begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix}
+      \caption{Sparse matrix read unit}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.59\linewidth}
+    \begin{itemize}
+      \item Reads weight values $v$ and zeroes $z$ for current operation based on pointers
+      \item SRAM word length: 64 bit, entries of $v$ and $z$ are 4 bit each \\
+      $\rightarrow$ 8 entries per word
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+  \begin{minipage}{0.39\linewidth}
+    \begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu}
+      \caption{Arithmetic unit}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.59\linewidth}
+    \begin{itemize}
+      \item Receives column vector $v$, destination accumulator register index $x$ and activation value $a_j$
+      \item Calculates $b_x = b_x + v \cdot a_j$
+      \item Accumulates indices $x$ and forwards real target address
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+  \begin{minipage}{0.29\linewidth}
+    \begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw}
+      \caption{Read/Write unit}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.69\linewidth}
+    \begin{itemize}
+      \item Maintains source ($a$) and destination ($b$) activation values
+      \item Feed-Forward-Network: destination values of one layer are activation values of next layer
+      \item Register banks 
exchange roles on each layer
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+  \begin{minipage}{0.24\linewidth}
+    \begin{figure}[h]
+      \centering
+      \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd}
+      \caption{ReLU \& Leading non-zero detection unit}
+    \end{figure}
+  \end{minipage}
+  \hfill
+  \begin{minipage}{0.74\linewidth}
+    \begin{itemize}
+      \item Performs ReLU function on destination values
+      \item Detects first PE-local non-zero destination value
+      \item Sends it to group Leading non-zero detection unit to distribute it to PEs for next cycle
+    \end{itemize}
+  \end{minipage}
+\end{frame}
+
+\section{Evaluation}
+
+\begin{frame}{Table of contents}
+  \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Speed and energy}
+  \begin{figure}[h]
+    \centering
+    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\
+    \vspace{0.5cm}
+    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+    \caption{Speedup and energy efficiency comparison}
+  \end{figure}
+  \begin{itemize}
+    \item Throughput: 102 GOP/s compressed $\rightarrow$ 3 TOP/s uncompressed
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Accelerator comparison}
+  \begin{figure}[h]
+    \centering
+    \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table}
+    \caption{EIE compared with different DNN hardware accelerators}
+  \end{figure}
+\end{frame}
+
+\section{Future work}
+
+\begin{frame}{Table of contents}
+  \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference accelerators}
+  \begin{itemize}
+    \item Exploitation of different compression methods
+    \begin{itemize}
+      \item Huffman encoding (35-49x compression)
+      \item HashNets (variable compression up to 64x)
+    \end{itemize}
+    \item Combination with other optimization methods
+    \begin{itemize}
+      \item In-Memory calculation
+      \item Approximating circuits
+    \end{itemize}
+    \item Optimize hardware itself
+    
\begin{itemize} + \item Different storage technologies (e.g. ReRAM) + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[highlight] + Ende +\end{frame} + +\end{document} \ No newline at end of file -- cgit v1.2.1