author    Leonard Kugis <leonard@kug.is>    2023-01-07 14:54:34 +0100
committer Leonard Kugis <leonard@kug.is>    2023-01-07 14:54:34 +0100
commit    036b0c74c8f712e9fbf55ef41b8d2ae13feb2baf (patch)
tree      c33a3de067e1ac8ef756f05521a6534bafdfa4fb /Presentation/presentation.tex
parent    5ef7ef8d615ab3098e0b90f18af939908d4f4dfa (diff)
Finished presentation slides
Diffstat (limited to 'Presentation/presentation.tex')
-rw-r--r--    Presentation/presentation.tex    488
1 files changed, 488 insertions, 0 deletions
diff --git a/Presentation/presentation.tex b/Presentation/presentation.tex
new file mode 100644
index 0000000..18be3aa
--- /dev/null
+++ b/Presentation/presentation.tex
@@ -0,0 +1,488 @@
+% Official example file for the beamer template from tubslatex, version 0.3beta2
+\documentclass[fleqn,11pt,aspectratio=43]{beamer}
+
+\usepackage[english]{babel}
+\usepackage[utf8x]{inputenc}
+\usepackage{graphicx}
+\usepackage{svg}
+\usetheme[%
+ %nexus,% use Nexus fonts
+ %lnum,% use lining figures
+ %cmyk,%<rgbprint>, choice of color model
+ blue,%<orange/green/violet> choice of secondary color scheme
+ dark,%<light,medium> choice of brightness
+ %colorhead,% colored header bar
+ %colorfoot,% colored footer bar on the title page
+ colorblocks,% colored block backgrounds
+ %nopagenum,% no page number in the footer
+ %nodate,% no date in the footer
+ tocinheader,% table of contents in the header bar
+ %tinytocinheader,% small table of contents in the header bar
+ %widetoc,% wide table of contents in the header bar
+ %narrowtoc,% narrow table of contents in the header bar
+ %nosubsectionsinheader,% no subsections in the header-bar table of contents
+ %nologoinfoot,% do not show the logo in the footer
+ ]{tubs}
+
+% Title page
+\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
+%\subtitle{The corporate design in \LaTeX}
+\author{Leonard Kugis}
+% Title graphic, automatically cropped. Further options: <scaled/cropx/cropy>
+% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
+%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}
+
+% Logo shown at the top right of title pages and at the bottom right of
+% content pages. It is scaled automatically in both cases.
+%\logo{\includegraphics{dummy_institut.pdf}}
+%\logo{Institute for Uncreativity\\and Writing Deficiency}
+
+\begin{document}
+
+\begin{frame}[plain]
+\titlepage
+\end{frame}
+
+\begin{frame}{Table of contents}
+ \tableofcontents
+\end{frame}
+
+\section{Deep Neural Networks}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn}
+ \caption{Deep Neural Network}
+ \end{figure}
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+ \caption{Fully connected layer}
+ \end{figure}
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+ \caption{Fully connected layer}
+ \end{figure}
+ \begin{itemize}
+  \item $b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j)$
+ \item Multiply-Accumulate (MAC) operations
+ \item Matrix $W$ can be sparse
+ \end{itemize}
+\end{frame}
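+
+% Added for illustration (not from the original deck): a minimal toy example
+% of the per-output MAC computation; the values are made up.
+\begin{frame}{Fully connected layer: toy example}
+  \begin{align*}
+    W = \begin{pmatrix} 0 & 2 & 0 \\ 1 & 0 & 3 \end{pmatrix}, \quad
+    a = \begin{pmatrix} 4 \\ 0 \\ 1 \end{pmatrix}, \quad
+    b = f(W a) = f\begin{pmatrix} 0 \\ 7 \end{pmatrix}
+  \end{align*}
+  \begin{itemize}
+    \item Only $3$ of $6$ weights and $2$ of $3$ activations are non-zero
+    \item A sparsity-aware engine needs $2$ MAC operations instead of $6$
+  \end{itemize}
+\end{frame}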
+
+\section{Motivation}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference metrics}
+ \begin{itemize}
+ \item Throughput \\
+    \textcolor{gray}{Amount of data processed per unit of time}
+ \item Latency \\
+ \textcolor{gray}{Amount of time it takes to process a single workload}
+ \item Model size \\
+    \textcolor{gray}{Storage required to hold the model (e.g. weights)}
+ \item Energy use \\
+ \textcolor{gray}{Energy consumption for processing a specific amount of data}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_png}
+ \caption{Common dataflow models in inference architectures}
+ \end{figure}
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png}
+    \caption{Memory accesses of common dataflow models in inference architectures}
+ \end{figure}
+\end{frame}
+
+\begin{frame}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency_png}
+ \caption{Memory hierarchy and energy cost of hierarchy levels}
+ \end{figure}
+\end{frame}
+
+\section{Compression}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}
+ \begin{itemize}
+ \item Dynamic
+ \begin{itemize}
+ \item Input data
+ \end{itemize}
+ \item Static (parameters)
+ \begin{itemize}
+ \item Weights
+ \item Parameters of activation functions
+ \end{itemize}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{AlexNet}
+ \begin{itemize}
+ \item 5 convolutional layers
+ \item 3 fully connected layers
+ \item $\sim 62$ million parameters
+ \item $\sim 240$ MB with 32-bit float representation
+ \end{itemize}
+\end{frame}
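+
+% Added back-of-the-envelope check of the model size stated above.
+\begin{frame}{AlexNet: model size estimate}
+  \begin{align*}
+    62 \times 10^6 \ \text{parameters} \times 4 \ \text{B} = 248 \times 10^6 \ \text{B} \approx 240 \ \text{MB}
+  \end{align*}
+  \begin{itemize}
+    \item Far too large for on-chip SRAM, so the weights must reside in DRAM
+  \end{itemize}
+\end{frame}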
+
+\begin{frame}{Basis projection}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png}
+ \caption{Basis projection and resulting weight distribution}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning}
+ \begin{itemize}
+ \item Idea: Remove unimportant weights with low impact on accuracy
+ \end{itemize}
+ \begin{figure}[h]
+ \centering
+ \vspace{0.5cm}
+ \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning}
+    \caption{Working principle of three-step pruning}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning}
+ \begin{minipage}{0.24\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.74\linewidth}
+ \begin{itemize}
+      \item Magnitude-threshold-based pruning (see the formula on the next slide)
+      \begin{itemize}
+        \item Remove a weight if its magnitude is below a specific threshold
+      \end{itemize}
+      \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
+      \begin{itemize}
+        \item Remove weights based on the sensitivity of the objective function to them
+        \item Measure sensitivity via second-order derivatives: a diagonal Hessian approximation in $(a)$, the full Hessian in $(b)$
+        \item Remove the least sensitive weights first
+      \end{itemize}
+      \item Biased weight decay
+      \begin{itemize}
+        \item Pruning at weight-update time
+        \item Adjust the weight-update term so that large weights persist and small weights converge to zero
+      \end{itemize}
+ \end{itemize}
+ \end{itemize}
+ \end{minipage}
+\end{frame}
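+
+% Added sketch of magnitude-threshold pruning as a formula; the threshold
+% $\theta$ is a free parameter, not a value from the paper.
+\begin{frame}{Pruning: magnitude threshold}
+  \begin{align*}
+    \tilde{W}_{ij} =
+    \begin{cases}
+      W_{ij} & \text{if } |W_{ij}| \geq \theta \\
+      0 & \text{otherwise}
+    \end{cases}
+  \end{align*}
+  \begin{itemize}
+    \item After pruning, retraining lets the remaining weights compensate for the removed ones
+  \end{itemize}
+\end{frame}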
+
+\begin{frame}{Weight quantization}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering}
+ \caption{Weight quantization}
+ \end{figure}
+ \begin{itemize}
+ \item Group similar weights into clusters
+    \item Fine-tune the cluster centroids using the gradient matrix during the weight update
+ \end{itemize}
+\end{frame}
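+
+% Added: resulting compression rate, following the formula from Han et al.'s
+% Deep Compression; the example numbers are only a plausibility sketch.
+\begin{frame}{Weight quantization: compression rate}
+  \begin{align*}
+    r = \frac{n b}{n \lceil \log_2 k \rceil + k b}
+  \end{align*}
+  \begin{itemize}
+    \item $n$ weights, $b$-bit original representation, $k$ clusters
+    \item E.g. $b = 32$, $k = 16$: each weight shrinks to a 4-bit index, so $r \approx 8$ for large $n$
+  \end{itemize}
+\end{frame}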
+
+\begin{frame}{Weight quantization}
+ \begin{itemize}
+    \item Minimize the Within-Cluster Sum of Squares (WCSS): $\text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in C_i} | \omega - c_i |^2$, where $c_i$ is the centroid of cluster $C_i$
+ \item Perform k-means clustering:
+ \begin{enumerate}
+ \item Initialize $k$ cluster centroids
+      \item Assign each weight to the cluster with the nearest centroid
+ \item Recalculate cluster centroids
+ \end{enumerate}
+ \end{itemize}
+\end{frame}
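+
+% Added one-dimensional worked example of the WCSS objective; the weight
+% values are made up.
+\begin{frame}{Weight quantization: worked example}
+  \begin{itemize}
+    \item Weights $\{1.9, 2.1, -0.9, -1.1\}$, $k = 2$
+    \item Clusters $C_1 = \{1.9, 2.1\}$ and $C_2 = \{-0.9, -1.1\}$ with centroids $c_1 = 2.0$, $c_2 = -1.0$
+    \item WCSS $= 0.1^2 + 0.1^2 + 0.1^2 + 0.1^2 = 0.04$
+    \item Each weight is replaced by its centroid: $2$ values plus $4$ short indices remain
+  \end{itemize}
+\end{frame}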
+
+\begin{frame}{Weight quantization}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization}
+ \caption{Different centroid initialization methods}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Huffman encoding}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman}
+ \caption{Huffman encoding example}
+ \end{figure}
+\end{frame}
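+
+% Added small worked example of the expected code length under Huffman
+% coding; the symbol probabilities are invented for illustration.
+\begin{frame}{Huffman encoding: expected code length}
+  \begin{itemize}
+    \item Quantized weight indices with probabilities $0.5, 0.25, 0.125, 0.125$
+    \item Huffman codes: $0$, $10$, $110$, $111$
+    \item Expected length: $0.5 \cdot 1 + 0.25 \cdot 2 + 0.125 \cdot 3 + 0.125 \cdot 3 = 1.75$ bit
+    \item Versus $2$ bit fixed-width: skewed index distributions compress well
+  \end{itemize}
+\end{frame}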
+
+\begin{frame}{HashNets}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets}
+ \caption{HashNets encoding}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{HashNets}
+ \begin{minipage}{0.49\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.49\linewidth}
+ \begin{itemize}
+ \item Virtual weight matrix $\textbf{V}^{\ell}$
+ \item One-way hash function $h^{\ell}(i, j)$
+ \item Weight array $w^{\ell}$
+ \item Hash function returns index for weight array
+ \item $w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}$
+ \end{itemize}
+ \end{minipage}
+\end{frame}
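+
+% Added minimal HashNets example; the hash function $h(i, j) = (i + j) \bmod 2$
+% is a stand-in, not the one used in the paper.
+\begin{frame}{HashNets: toy example}
+  \begin{itemize}
+    \item Weight array $w = [3.2, -0.5]$, hash $h(i, j) = (i + j) \bmod 2$
+    \item Virtual $2 \times 2$ matrix: $\textbf{V} = \begin{pmatrix} 3.2 & -0.5 \\ -0.5 & 3.2 \end{pmatrix}$
+    \item $4$ virtual weights are stored as $2$ real values, giving $2\times$ compression
+  \end{itemize}
+\end{frame}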
+
+\begin{frame}{Storage format}
+ \begin{itemize}
+ \item Compressed sparse column (CSC) /
+ Compressed sparse row (CSR) representation
+ \item Encode each column $W_j$ as vectors $v$ and $z$
+ \begin{itemize}
+ \item $v$: Non-zero weights
+ \item $z$: Number of zeros before corresponding element in $v$
+ \end{itemize}
+ \end{itemize}
+ \begin{itemize}
+    \item E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes
+    $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$
+    \item The extra $0$ in $v$ is padding: $z$ entries are 4 bit wide, so at most $15$ zeros can be skipped per entry
+ \item $v$'s and $z$'s for all columns are stored in a single pair of arrays
+ \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$
+ \end{itemize}
+\end{frame}
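+
+% Added decoding rule for the $(v, z)$ representation, applied to the example
+% above; derived from the definition, not quoted from the paper.
+\begin{frame}{Storage format: decoding the example}
+  \begin{itemize}
+    \item Row index of the $k$-th entry: $\text{row}(v_k) = \sum\limits_{m=0}^{k} (z_m + 1) - 1$
+    \item Here: $\text{row}(v_0) = 2$, $\text{row}(v_1) = 3$, $\text{row}(v_2) = 19$ (padding), $\text{row}(v_3) = 22$
+    \item This matches the positions of $1$, $2$ and $3$ in the original column
+  \end{itemize}
+\end{frame}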
+
+\section{EIE implementation}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{EIE implementation}
+ \begin{itemize}
+ \item Optimizes per-activation formula:
+ \begin{align}
+ b_i = \text{ReLU}(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j)
+ \end{align}
+ \item $X_i$: Set of columns with $W_{ij} \neq 0$
+ \item $Y$: Set of indices in $a$ for which $a_j \neq 0$
+  \item $I_{ij}$: 4-bit index into the shared table $S$
+ \item $S$: Shared lookup table
+ \end{itemize}
+\end{frame}
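+
+% Added: the toy example from the beginning of the deck, expressed with the
+% index sets of the formula above.
+\begin{frame}{EIE formula on the toy example}
+  \begin{itemize}
+    \item With $a = (4, 0, 1)^T$: $Y = \{0, 2\}$, since $a_1 = 0$ is skipped
+    \item Row $0$ of the toy matrix has $X_0 = \{1\}$, so $X_0 \cap Y = \emptyset$ and no MAC is needed
+    \item Row $1$ has $X_1 = \{0, 2\}$, so $X_1 \cap Y = \{0, 2\}$: $2$ MACs instead of $3$
+  \end{itemize}
+\end{frame}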
+
+\begin{frame}{Weight matrix segmentation}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix}
+ \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout}
+ \caption{Weight matrix segmentation and memory layout}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw}
+ \caption{Hardware architecture}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero}
+ \caption{Non-Zero detection node}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+ \item Filter zero elements in input vector $a$
+ \item Broadcast non-zero elements $a_j$ and corresponding indices $j$ to all PEs
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer}
+ \caption{Pointer read unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+      \item Pointers to the columns $W_j$ are stored in separate SRAM banks (start/end)
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix}
+ \caption{Sparse matrix read unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+      \item Reads the weight values $v$ and zero counts $z$ for the current operation, based on the pointers
+ \item SRAM word length: 64 bit, entries of $v$ and $z$ are 4 bit each \\
+ $\rightarrow$ 8 entries per word
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu}
+ \caption{Arithmetic unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+      \item Receives a weight value $v$, a destination accumulator register index $x$ and an activation value $a_j$
+      \item Calculates $b_x = b_x + v \cdot a_j$
+      \item Accumulates the relative indices to obtain the real target address $x$
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.29\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw}
+ \caption{Read/Write unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.69\linewidth}
+ \begin{itemize}
+ \item Maintains source ($a$) and destination ($b$) activation values
+      \item Feed-forward network: the destination values of one layer are the source activations of the next layer
+      \item The register banks exchange roles from layer to layer
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+ \begin{minipage}{0.24\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd}
+ \caption{ReLU \& Leading non-zero detection unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.74\linewidth}
+ \begin{itemize}
+      \item Applies the ReLU function to the destination values
+      \item Detects the first PE-local non-zero destination value
+      \item Sends it to the group-wide leading non-zero detection unit, which distributes it to all PEs for the next cycle
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\section{Evaluation}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Speed and energy}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\
+ \vspace{0.5cm}
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+    \caption{Speedup and energy efficiency comparison}
+ \end{figure}
+ \begin{itemize}
+    \item Throughput: 102 GOP/s on the compressed network, equivalent to 3 TOP/s on the uncompressed network
+ \end{itemize}
+\end{frame}
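+
+% Added rough plausibility of the compressed-to-uncompressed ratio, assuming
+% roughly 10% weight density and 30% activation density for the pruned
+% networks; approximate reasoning, not a quoted result.
+\begin{frame}{Speed and energy: where the factor comes from}
+  \begin{align*}
+    \frac{1}{0.1 \cdot 0.3} \approx 33, \qquad 102 \ \text{GOP/s} \cdot 29.4 \approx 3 \ \text{TOP/s}
+  \end{align*}
+  \begin{itemize}
+    \item Skipping zero weights and zero activations multiplies the effective throughput
+  \end{itemize}
+\end{frame}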
+
+\begin{frame}{Accelerator comparison}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table}
+ \caption{EIE compared with different DNN hardware accelerators}
+ \end{figure}
+\end{frame}
+
+\section{Future work}
+
+\begin{frame}{Table of contents}
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference accelerators}
+ \begin{itemize}
+ \item Exploitation of different compression methods
+ \begin{itemize}
+ \item Huffman encoding (35-49x compression)
+ \item HashNets (variable compression up to 64x)
+ \end{itemize}
+ \item Combination with other optimization methods
+ \begin{itemize}
+      \item In-memory computing
+      \item Approximate circuits
+ \end{itemize}
+ \item Optimize hardware itself
+ \begin{itemize}
+ \item Different storage technologies (e.g. ReRAM)
+ \end{itemize}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}[highlight]
+  The End
+\end{frame}
+
+\end{document}
\ No newline at end of file