From 036b0c74c8f712e9fbf55ef41b8d2ae13feb2baf Mon Sep 17 00:00:00 2001 From: Leonard Kugis Date: Sat, 7 Jan 2023 14:54:34 +0100 Subject: Finished presentation slides --- Presentation/presentation.tex | 488 ++++++++++++++++++++++++++++++++++++++++++ Presentation/structure.md | 83 +++++++ Presentation/template.tex | 6 +- 3 files changed, 574 insertions(+), 3 deletions(-) create mode 100644 Presentation/presentation.tex create mode 100644 Presentation/structure.md (limited to 'Presentation') diff --git a/Presentation/presentation.tex b/Presentation/presentation.tex new file mode 100644 index 0000000..18be3aa --- /dev/null +++ b/Presentation/presentation.tex @@ -0,0 +1,488 @@ +% Offizielle Beispieldatei für beamer-Vorlage aus tubslatex Version 0.3beta2 +\documentclass[fleqn,11pt,aspectratio=43]{beamer} + +\usepackage[english]{babel} +\usepackage[utf8x]{inputenc} +\usepackage{graphicx} +\usepackage{svg} +\usetheme[% + %nexus,% Nexus Fonts benutzen + %lnum,% Versalziffern verwenden + %cmyk,%, Auswahl des Farbmodells + blue,% Auswahl des Sekundärfarbklangs + dark,% Auswahl der Helligkeit + %colorhead,% Farbig hinterlegte Kopfleiste + %colorfoot,% Farbig hinterlegt Fußleiste auf Titelseite + colorblocks,% Blöcke Farbig hinterlegen + %nopagenum,% Keine Seitennumer in Fußzeile + %nodate,% Kein Datum in Fußleiste + tocinheader,% Inhaltsverzeichnis in Kopfleiste + %tinytocinheader,% kleines Kopfleisten-Inhaltsverzeichnis + %widetoc,% breites Kopfleisten-Inhaltsverzeichnis + %narrowtoc,% schmales Kopfleisten-Inhaltsverzeichnis + %nosubsectionsinheader,% Keine subsections im Kopfleisten-Inhaltsverzeichnis + %nologoinfoot,% Kein Logo im Fußbereich darstellen + ]{tubs} + +% Titelseite +\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network} +%\subtitle{Das Corporate Design in \LaTeX} +\author{Leonard Kugis} +% Titelgrafik, automatisch beschnitten, Weitere Optionen: +% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}} 
+%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}} + +% Logo, dass auf Titelseiten oben rechts und auf Inthaltsseiten unten rechts +% dargestellt wird. Es wird jeweils automatisch skliert +%\logo{\includegraphics{dummy_institut.pdf}} +%\logo{Institut für Unkreativität\\und Schreibschwäche} + +\begin{document} + +\begin{frame}[plain] +\titlepage +\end{frame} + +\begin{frame}{Table of contents} + \tableofcontents +\end{frame} + +\section{Deep Neural Networks} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn} + \caption{Deep Neural Network} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn} + \caption{Fully connected layer} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn} + \caption{Fully connected layer} + \end{figure} + \begin{itemize} + \item $b_i = f(\sum\limits_{j=0}^{n} W_{ij} a_j)$ + \item Multiply-Accumulate (MAC) operations + \item Matrix $W$ can be sparse + \end{itemize} +\end{frame} + +\section{Motivation} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{Inference metrics} + \begin{itemize} + \item Throughput \\ + \textcolor{gray}{Amount of data processed during one unit of time} + \item Latency \\ + \textcolor{gray}{Amount of time it takes to process a single workload} + \item Model size \\ + \textcolor{gray}{Storage amount to store the model (e.g. 
weights)} + \item Energy use \\ + \textcolor{gray}{Energy consumption for processing a specific amount of data} + \end{itemize} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_png} + \caption{Common dataflow models in inference architectures} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png} + \caption{Common dataflow models in inference architectures} + \end{figure} +\end{frame} + +\begin{frame} + \begin{figure}[h] + \centering + \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency_png} + \caption{Memory hierarchy and energy cost of hierarchy levels} + \end{figure} +\end{frame} + +\section{Compression} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame} + \begin{itemize} + \item Dynamic + \begin{itemize} + \item Input data + \end{itemize} + \item Static (parameters) + \begin{itemize} + \item Weights + \item Parameters of activation functions + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}{AlexNet} + \begin{itemize} + \item 5 convolutional layers + \item 3 fully connected layers + \item $\sim 62$ million parameters + \item $\sim 240$ MB with 32-bit float representation + \end{itemize} +\end{frame} + +\begin{frame}{Basis projection} + \begin{figure}[h] + \centering + \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png} + \caption{Basis projection and resulting weight distribution} + \end{figure} +\end{frame} + +\begin{frame}{Pruning} + \begin{itemize} + \item Idea: Remove unimportant weights with low impact on accuracy + \end{itemize} + \begin{figure}[h] + \centering + \vspace{0.5cm} + \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning} + \caption{3-step pruning working principle} + \end{figure} 
+\end{frame} + +\begin{frame}{Pruning} + \begin{minipage}{0.24\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.74\linewidth} + \begin{itemize} + \item Magnitude threshold based pruning + \begin{itemize} + \item Remove a weight if its value is below a specific threshold + \end{itemize} + \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$ + \begin{itemize} + \item Removes weights based on sensitivity on objective function + \item Considers first $(a)$ and second order derivatives $(b)$ to measure sensitivity + \item Remove least sensitive weights first + \end{itemize} + \item Biased weight decay + \begin{itemize} + \item Update-Time pruning + \item Adjust weight update term so that large weights persist and small weights converge to zero + \end{itemize} + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Weight quantization} + \begin{figure}[h] + \centering + \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering} + \caption{Weight quantization} + \end{figure} + \begin{itemize} + \item Group similar weights into clusters + \item Fine tune clusters with gradient matrix during update + \end{itemize} +\end{frame} + +\begin{frame}{Weight quantization} + \begin{itemize} + \item Minimize Within-Cluster Sum of Squares (WCSS): $\text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2$ + \item Perform k-means clustering: + \begin{enumerate} + \item Initialize $k$ cluster centroids + \item Assign each data point to the cluster with nearest centroid + \item Recalculate cluster centroids + \end{enumerate} + \end{itemize} +\end{frame} + +\begin{frame}{Weight quantization} + \begin{figure}[h] + \centering + \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization} + \caption{Different centroid initialization methods} + \end{figure} +\end{frame} +
+\begin{frame}{Huffman encoding} + \begin{figure}[h] + \centering + \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman} + \caption{Huffman encoding example} + \end{figure} +\end{frame} + +\begin{frame}{HashNets} + \begin{figure}[h] + \centering + \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets} + \caption{HashNets encoding} + \end{figure} +\end{frame} + +\begin{frame}{HashNets} + \begin{minipage}{0.49\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.49\linewidth} + \begin{itemize} + \item Virtual weight matrix $\textbf{V}^{\ell}$ + \item One-way hash function $h^{\ell}(i, j)$ + \item Weight array $w^{\ell}$ + \item Hash function returns index for weight array + \item $w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}$ + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Storage format} + \begin{itemize} + \item Compressed sparse column (CSC) / + Compressed sparse row (CSR) representation + \item Encode each column $W_j$ as vectors $v$ and $z$ + \begin{itemize} + \item $v$: Non-zero weights + \item $z$: Number of zeros before corresponding element in $v$ + \end{itemize} + \end{itemize} + \begin{itemize} + \item E.g. 
column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes + $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$ + \item $v$'s and $z$'s for all columns are stored in a single pair of arrays + \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$ + \end{itemize} +\end{frame} + +\section{EIE implementation} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{EIE implementation} + \begin{itemize} + \item Optimizes per-activation formula: + \begin{align} + b_i = \text{ReLU}(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j) + \end{align} + \item $X_i$: Set of columns with $W_{ij} \neq 0$ + \item $Y$: Set of indices in $a$ for which $a_j \neq 0$ + \item $I_{ij}$: 4-bit index + \item $S$: Shared lookup table + \end{itemize} +\end{frame} + +\begin{frame}{Weight matrix segmentation} + \begin{figure}[h] + \centering + \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix} + \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout} + \caption{Weight matrix segmentation and memory layout} + \end{figure} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw} + \caption{Hardware architecture} + \end{figure} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero} + \caption{Non-Zero detection node} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.59\linewidth} + \begin{itemize} + \item Filter zero elements in input vector $a$ + \item Broadcast non-zero elements $a_j$ and corresponding indices $j$ to all PEs + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + 
\begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer} + \caption{Pointer read unit} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.59\linewidth} + \begin{itemize} + \item Pointers to columns $W_j$ are stored in separate SRAM banks (start/end) + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix} + \caption{Sparse matrix read unit} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.59\linewidth} + \begin{itemize} + \item Reads weight values $v$ and zeroes $z$ for current operation based on pointers + \item SRAM word length: 64 bit, entries of $v$ and $z$ are 4 bit each \\ + $\rightarrow$ 8 entries per word + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.39\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu} + \caption{Arithmetic unit} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.59\linewidth} + \begin{itemize} + \item Receives column vector $v$, destination accumulator register index $x$ and activation value $a_j$ + \item Calculates $b_x = b_x + v \cdot a_j$ + \item Accumulates indices $x$ and forwards real target address + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.29\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw} + \caption{Read/Write unit} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.69\linewidth} + \begin{itemize} + \item Maintains source ($a$) and destination ($b$) activation values + \item Feed-Forward-Network: destination values of one layer are activation values of next layer + \item Register banks
exchange roles on each layer + \end{itemize} + \end{minipage} +\end{frame} + +\begin{frame}{Hardware implementation} + \begin{minipage}{0.24\linewidth} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd} + \caption{ReLU \& Leading non-zero detection unit} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}{0.74\linewidth} + \begin{itemize} + \item Performs ReLU function on destination values + \item Detects first PE-local non-zero destination value + \item Sends it to group Leading non-zero detection unit to distribute it to PEs for next cycle + \end{itemize} + \end{minipage} +\end{frame} + +\section{Evaluation} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{Speed and energy} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\ + \vspace{0.5cm} + \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png} + \caption{Speedup and energy efficiency comparison} + \end{figure} + \begin{itemize} + \item Throughput: 102 GOP/s compressed $\rightarrow$ 3 TOP/s uncompressed + \end{itemize} +\end{frame} + +\begin{frame}{Accelerator comparison} + \begin{figure}[h] + \centering + \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table} + \caption{EIE compared with different DNN hardware accelerators} + \end{figure} +\end{frame} + +\section{Future work} + +\begin{frame}{Table of contents} + \tableofcontents[currentsection] +\end{frame} + +\begin{frame}{Inference accelerators} + \begin{itemize} + \item Exploitation of different compression methods + \begin{itemize} + \item Huffman encoding (35-49x compression) + \item HashNets (variable compression up to 64x) + \end{itemize} + \item Combination with other optimization methods + \begin{itemize} + \item In-Memory calculation + \item Approximating circuits + \end{itemize} + \item Optimize hardware itself +
\begin{itemize} + \item Different storage technologies (e.g. ReRAM) + \end{itemize} + \end{itemize} +\end{frame} + +\begin{frame}[highlight] + Ende +\end{frame} + +\end{document} \ No newline at end of file diff --git a/Presentation/structure.md b/Presentation/structure.md new file mode 100644 index 0000000..8c772b6 --- /dev/null +++ b/Presentation/structure.md @@ -0,0 +1,83 @@ +# EIE: Efficient Inference Engine on Compressed Deep Neural Network + +## Deep Neural Network + +- Convolutional layers +- Fully-connected layers +- In FC-layers: Trained weights. This only focuses on inference +- Multiply-Accumulate (MAC) on each layer +- DNN dataflows +- Convolutional layers: 5% of memory, 95% of FLOPs +- FC layers: 5% of FLOPs, 90-95% of memory + +## Motivation + +- Inference metrics: throughput, latency, model size, energy use +- Uncompressed DNN: Does not fit SRAM, memory access to/from DRAM +- Von-Neumann bottleneck +- Figure from Chen 2016 +- Additional levels of indirection because of indices (weight reusing) + +## Compression + +- In general: Encode in such a way that it reduces the number of bits per weight + +Trivial: + +- Use different kernels/filters to the input +- Apply pooling to the inputs (runtime memory) + +More complex: + +- Pruning (remove unimportant weights and retrain, 2 approaches) + - Encode with relative indexing +- Weight quantization with clustering + - Group similar weights to clusters + - Minimize WCSS + - Different methods to initialize cluster centroids, e.g.
random, linear, CDF-based + - Indirection because of shared weight table lookup +- Huffman encoding (binary tree with weights, globally) +- Fixed-Point-Quantization of activation functions (refer to CPU optimization) +- Extremely narrow weight engines (4 bit) +- Compressed sparse column (CSC) matrix representation + +## EIE implementation + +- Per-Activation-Formula +- Accelerates sparse and weight sharing networks +- Uses CSC representation + - PE quickly finds non-zero elements in column +- Explain general procedure +- Show image of the architecture +- Non-Zero filtering +- Queues for load balancing +- Two different SRAM banks for pointers (16 bit) to column borders +- Each entry: 8 bit width (4 bit reference and 4 bit activation register index) +- Table lookup / weight decoding of reference in the same cycle +- Arithmetic Unit: Performs Multiply-Accumulate +- Read/Write unit + - Source and destination register files + - Change their role on each layer + - Feed-Forward networks + +## EIE evaluation + +- Speedup: 189x, 13x, 307x faster than CPU, GPU and mGPU + - EIE latency focused: Batch size of 1 +- Throughput: 102 GOP/s compressed -> 3 TOP/s uncompressed +- Energy efficiency: 24,000x, 3,400x, 2,700x more energy efficient than CPU, GPU and mGPU + + +- Speed calculation: Measure wall clock times for different workloads +- Energy calculation: Total computation time x average measured power +- Sources of energy consumption and reasons for less energy consumption: + - SRAM access instead of DRAM + - Compression type and architecture reduces amount of memory reads + - Vector sparsity encoding in CSC representation + +## Limitations / future optimizations + +- EIE only capable of matrix-multiplication +- Other optimization methods + - In-Memory Acceleration + - \ No newline at end of file diff --git a/Presentation/template.tex b/Presentation/template.tex index 0bca3f4..5fbf1c7 100644 --- a/Presentation/template.tex +++ b/Presentation/template.tex @@ -24,9 +24,9 @@
]{tubs} % Titelseite -\title{Meine Pr\"asentation} -\subtitle{Das Corporate Design in \LaTeX} -\author{Max Mustermann} +\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network} +%\subtitle{Das Corporate Design in \LaTeX} +\author{Leonard Kugis} % Titelgrafik, automatisch beschnitten, Weitere Optionen: % \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}} %\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}} -- cgit v1.2.1