summaryrefslogtreecommitdiff
path: root/Presentation
diff options
context:
space:
mode:
authorLeonard Kugis <leonard@kug.is>2023-01-07 14:54:34 +0100
committerLeonard Kugis <leonard@kug.is>2023-01-07 14:54:34 +0100
commit036b0c74c8f712e9fbf55ef41b8d2ae13feb2baf (patch)
treec33a3de067e1ac8ef756f05521a6534bafdfa4fb /Presentation
parent5ef7ef8d615ab3098e0b90f18af939908d4f4dfa (diff)
Finished presentation slides
Diffstat (limited to 'Presentation')
-rw-r--r--Presentation/presentation.tex488
-rw-r--r--Presentation/structure.md83
-rw-r--r--Presentation/template.tex6
3 files changed, 574 insertions, 3 deletions
diff --git a/Presentation/presentation.tex b/Presentation/presentation.tex
new file mode 100644
index 0000000..18be3aa
--- /dev/null
+++ b/Presentation/presentation.tex
@@ -0,0 +1,488 @@
+% Official example file for the beamer template from tubslatex version 0.3beta2
+\documentclass[fleqn,11pt,aspectratio=43]{beamer}
+
+\usepackage[english]{babel}
+\usepackage[utf8]{inputenc} % standard UTF-8 input; utf8x relies on the unmaintained ucs package
+\usepackage{graphicx}
+\usepackage{svg}
+\usetheme[%
+ %nexus,% use the Nexus fonts
+ %lnum,% use lining figures
+ %cmyk,%<rgbprint>, choice of colour model
+ blue,%<orange/green/violet> choice of secondary colour scheme
+ dark,%<light,medium> choice of brightness
+ %colorhead,% coloured header bar
+ %colorfoot,% coloured footer bar on the title page
+ colorblocks,% coloured block backgrounds
+ %nopagenum,% no page number in the footer
+ %nodate,% no date in the footer
+ tocinheader,% table of contents in the header bar
+ %tinytocinheader,% small table of contents in the header bar
+ %widetoc,% wide table of contents in the header bar
+ %narrowtoc,% narrow table of contents in the header bar
+ %nosubsectionsinheader,% no subsections in the header table of contents
+ %nologoinfoot,% do not show a logo in the footer
+ ]{tubs}
+
+% Title page
+\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
+%\subtitle{Das Corporate Design in \LaTeX}
+\author{Leonard Kugis}
+% Title graphic, cropped automatically; further options: <scaled/cropx/cropy>
+% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
+%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}
+
+% Logo shown at the top right of title pages and at the bottom right of content
+% pages. It is scaled automatically in each case.
+%\logo{\includegraphics{dummy_institut.pdf}}
+%\logo{Institut für Unkreativität\\und Schreibschwäche}
+
+\begin{document}
+
+\begin{frame}[plain] % title slide; beamer's "plain" option drops head and foot lines
+\titlepage
+\end{frame}
+
+\begin{frame}{Table of contents} % full outline of all sections
+ \tableofcontents
+\end{frame}
+
+\section{Deep Neural Networks}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame} % figure-only slide: network overview
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn}
+ \caption{Deep Neural Network}
+ \end{figure}
+\end{frame}
+
+\begin{frame} % figure-only slide: fully connected layer
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+ \caption{Fully connected layer}
+ \end{figure}
+\end{frame}
+
+\begin{frame} % same figure again, now with the layer formula underneath
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+ \caption{Fully connected layer}
+ \end{figure}
+ \begin{itemize}
+ \item $b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j)$ % upper bound n-1, consistent with the per-activation formula in the EIE implementation section
+ \item Multiply-Accumulate (MAC) operations
+ \item Matrix $W$ can be sparse
+ \end{itemize}
+\end{frame}
+
+\section{Motivation}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference metrics} % each metric is followed by a gray one-line explanation
+ \begin{itemize}
+ \item Throughput \\
+ \textcolor{gray}{Amount of data processed during one unit of time}
+ \item Latency \\
+ \textcolor{gray}{Amount of time it takes to process a single workload}
+ \item Model size \\
+ \textcolor{gray}{Storage amount to store the model (e.g.\ weights)}
+ \item Energy use \\
+ \textcolor{gray}{Energy consumption for processing a specific amount of data}
+ \end{itemize}
+\end{frame}
+
+\begin{frame} % figure-only slide: dataflow models
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_png}
+ \caption{Common dataflow models in inference architectures}
+ \end{figure}
+\end{frame}
+
+\begin{frame} % figure-only slide: second dataflow image, same caption as the previous slide
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png}
+ \caption{Common dataflow models in inference architectures}
+ \end{figure}
+\end{frame}
+
+\begin{frame} % figure-only slide: memory hierarchy
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency_png}
+ \caption{Memory hierarchy and energy cost of hierarchy levels}
+ \end{figure}
+\end{frame}
+
+\section{Compression}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame} % untitled slide contrasting dynamic data with static parameters
+ \begin{itemize}
+ \item Dynamic
+ \begin{itemize}
+ \item Input data
+ \end{itemize}
+ \item Static (parameters)
+ \begin{itemize}
+ \item Weights
+ \item Parameters of activation functions
+ \end{itemize}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{AlexNet} % size figures for the example network
+ \begin{itemize}
+ \item 5 convolutional layers
+ \item 3 fully connected layers
+ \item $\sim 62$ million parameters
+ \item $\sim 240$ MB with 32-bit float representation % 62e6 parameters x 4 bytes is about 248 MB
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Basis projection} % figure-only slide
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png}
+ \caption{Basis projection and resulting weight distribution}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning} % one bullet with the idea, then the pruning figure below it
+ \begin{itemize}
+ \item Idea: Remove unimportant weights with low impact on accuracy
+ \end{itemize}
+ \begin{figure}[h]
+ \centering
+ \vspace{0.5cm} % fixed gap between the bullet list and the image
+ \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning}
+ \caption{3-step pruning working principle}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning} % two-column layout: pruning figure on the left, methods on the right
+ \begin{minipage}{0.24\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.74\linewidth}
+ \begin{itemize}
+ \item Magnitude threshold based pruning
+ \begin{itemize}
+ \item Remove a weight if its value is below a specific threshold
+ \end{itemize}
+ \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
+ \begin{itemize}
+ \item Removes weights based on the sensitivity of the objective function
+ \item Considers first $(a)$ and second order derivatives $(b)$ to measure sensitivity
+ \item Remove least sensitive weights first
+ \end{itemize}
+ \item Biased weight decay
+ \begin{itemize}
+ \item Update-Time pruning
+ \item Adjust weight update term so that large weights persist and small weights converge to zero
+ \end{itemize}
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Weight quantization} % clustering figure with a two-bullet summary
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering}
+ \caption{Weight quantization}
+ \end{figure}
+ \begin{itemize}
+ \item Group similar weights into clusters
+ \item Fine tune clusters with gradient matrix during update
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight quantization} % objective function and the k-means steps
+ \begin{itemize}
+ \item Minimize Within-Cluster Sum of Squares (WCSS): $\operatorname*{arg\,min}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} \lvert \omega - c_i \rvert^2$ % \operatorname* and \lvert/\rvert come from amsmath, which beamer loads
+ \item Perform k-means clustering:
+ \begin{enumerate}
+ \item Initialize $k$ cluster centroids
+ \item Assign each data point to the cluster with the nearest centroid
+ \item Recalculate cluster centroids
+ \end{enumerate}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight quantization} % figure-only slide: centroid initialization
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization}
+ \caption{Different centroid initialization methods}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Huffman encoding} % figure-only slide
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman}
+ \caption{Huffman encoding example}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{HashNets} % figure-only slide
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets}
+ \caption{HashNets encoding}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{HashNets} % two-column layout: same figure on the left, notation on the right
+ \begin{minipage}{0.49\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.49\linewidth}
+ \begin{itemize}
+ \item Virtual weight matrix $\mathbf{V}^{\ell}$ % \mathbf is the math-mode bold alphabet; \textbf is a text command
+ \item One-way hash function $h^{\ell}(i, j)$
+ \item Weight array $w^{\ell}$
+ \item Hash function returns index for weight array
+ \item $w^{\ell}_{h^{\ell}(i, j)} = \mathbf{V}^{\ell}_{ij}$
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Storage format} % sparse encoding of one weight-matrix column as the vector pair (v, z)
+ \begin{itemize}
+ \item Compressed sparse column (CSC) /
+ Compressed sparse row (CSR) representation
+ \item Encode each column $W_j$ as vectors $v$ and $z$
+ \begin{itemize}
+ \item $v$: Non-zero weights
+ \item $z$: Number of zeros before corresponding element in $v$
+ \end{itemize}
+ \end{itemize}
+ \begin{itemize} % worked example: the long zero run before the 3 is encoded as z=15, a 0 stored in v, then z=2 (presumably because z entries are only 4 bit wide, as stated on the sparse matrix read unit slide -- confirm)
+ \item E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes
+ $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$
+ \item $v$'s and $z$'s for all columns are stored in a single pair of arrays
+ \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$
+ \end{itemize}
+\end{frame}
+
+\section{EIE implementation}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{EIE implementation} % per-activation formula followed by a legend for its symbols
+ \begin{itemize}
+ \item Optimizes per-activation formula:
+ \begin{align}
+ b_i = \text{ReLU}(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j)
+ \end{align}
+ \item $X_i$: Set of columns with $W_{ij} \neq 0$
+ \item $Y$: Set of indices in $a$ for which $a_j \neq 0$
+ \item $I_{ij}$: 4-bit index
+ \item $S$: Shared lookup table
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight matrix segmentation} % two images under one shared caption
+ \begin{figure}[h]
+ \centering % NOTE(review): 2 x 0.52\textwidth exceeds the line width, so the two images cannot sit side by side -- confirm that stacking is intended
+ \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix}
+ \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout}
+ \caption{Weight matrix segmentation and memory layout}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % figure-only slide: overall architecture
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw}
+ \caption{Hardware architecture}
+ \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % two-column layout: unit figure left (0.39), description right (0.59)
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero}
+ \caption{Non-Zero detection node}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+ \item Filter zero elements in input vector $a$
+ \item Broadcast non-zero elements $a_j$ and corresponding indices $j$ to all PEs
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % same two-column layout for the pointer read unit
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer}
+ \caption{Pointer read unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+ \item Pointers to columns $W_j$ are stored in separate SRAM banks (start/end)
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % two-column layout: unit figure left (0.39), description right (0.59)
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix}
+ \caption{Sparse matrix read unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+ \item Reads weight values $v$ and zeroes $z$ for current operation based on pointers
+ \item SRAM word length: 64 bit, entries of $v$ and $z$ are 4 bit each \\
+ $\rightarrow$ 8 entries per word % one (v, z) pair takes 4 + 4 = 8 bit, so 64 / 8 = 8 pairs per word
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % same two-column layout for the arithmetic unit
+ \begin{minipage}{0.39\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu}
+ \caption{Arithmetic unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.59\linewidth}
+ \begin{itemize}
+ \item Receives column vector $v$, destination accumulator register index $x$ and activation value $a_j$
+ \item Calculates $b_x = b_x + v \cdot a_j$
+ \item Accumulates indices $x$ and forwards real target address
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % narrower figure column here (0.29 / 0.69)
+ \begin{minipage}{0.29\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw}
+ \caption{Read/Write unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.69\linewidth}
+ \begin{itemize}
+ \item Maintains source ($a$) and destination ($b$) activation values
+ \item Feed-Forward-Network: destination values of one layer are activation values of next layer
+ \item Register banks exchange roles on each layer
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation} % narrowest figure column (0.24 / 0.74)
+ \begin{minipage}{0.24\linewidth}
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd}
+ \caption{ReLU \& Leading non-zero detection unit}
+ \end{figure}
+ \end{minipage}
+ \hfill
+ \begin{minipage}{0.74\linewidth}
+ \begin{itemize}
+ \item Performs ReLU function on destination values
+ \item Detects first PE-local non-zero destination value
+ \item Sends it to group Leading non-zero detection unit to distribute it to PEs for next cycle
+ \end{itemize}
+ \end{minipage}
+\end{frame}
+
+\section{Evaluation}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Speed and energy} % two stacked charts under one caption, then the throughput figure
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\
+ \vspace{0.5cm}
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+ \caption{Speedup and energy efficiency comparison}
+ \end{figure}
+ \begin{itemize}
+ \item Throughput: 102 GOP/s compressed $\rightarrow$ 3 TOP/s uncompressed
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Accelerator comparison} % figure-only slide: comparison table as an image
+ \begin{figure}[h]
+ \centering
+ \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table}
+ \caption{EIE compared with different DNN hardware accelerators}
+ \end{figure}
+\end{frame}
+
+\section{Future work}
+
+\begin{frame}{Table of contents} % section divider: outline with the current section highlighted
+ \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference accelerators} % three directions for follow-up work, each with examples
+ \begin{itemize}
+ \item Exploitation of different compression methods
+ \begin{itemize}
+ \item Huffman encoding (35--49x compression)
+ \item HashNets (variable compression up to 64x)
+ \end{itemize}
+ \item Combination with other optimization methods
+ \begin{itemize}
+ \item In-Memory calculation
+ \item Approximating circuits
+ \end{itemize}
+ \item Optimize hardware itself
+ \begin{itemize}
+ \item Different storage technologies (e.g.\ ReRAM)
+ \end{itemize}
+ \end{itemize}
+\end{frame}
+
+\begin{frame}[highlight] % closing slide; "highlight" is presumably a frame option of the tubs theme (confirm); the slide text is German for "End"
+ Ende
+\end{frame}
+
+\end{document}
\ No newline at end of file
diff --git a/Presentation/structure.md b/Presentation/structure.md
new file mode 100644
index 0000000..8c772b6
--- /dev/null
+++ b/Presentation/structure.md
@@ -0,0 +1,83 @@
+# EIE: Efficient Inference Engine on Compressed Deep Neural Network
+
+## Deep Neural Network
+
+- Convolutional layers
+- Fully-connected layers
+- In FC-layers: Trained weights. This only focuses on inference
+- Multiply-Accumulate (MAC) on each layer
+- DNN dataflows
+- Convolutional layers: 5% of memory, 95% of FLOPs
+- FC layers: 5% of FLOPs, 90-95% of memory
+
+## Motivation
+
+- Inference metrics: throughput, latency, model size, energy use
+- Uncompressed DNN: Does not fit SRAM, memory access to/from DRAM
+- Von-Neumann bottleneck
+- Figure from Chen 2016
+- Additional levels of indirection because of indices (weight reusing)
+
+## Compression
+
+- In general: Encode the weights in such a way that the number of bits per weight is reduced
+
+Trivial:
+
+- Apply different kernels/filters to the input
+- Apply pooling to the inputs (runtime memory)
+
+More complex:
+
+- Pruning (remove unimportant weights and retrain, 2 approaches)
+ - Encode with relative indexing
+- Weight quantization with clustering
+ - Group similar weights to clusters
+ - Minimize WCSS (within-cluster sum of squares)
+ - Different methods to initialize cluster centroids, e.g. random, linear, CDF-based
+ - Indirection because of shared weight table lookup
+- Huffman encoding (binary tree with weights, globally)
+- Fixed-Point-Quantization of activation functions (refer to CPU optimization)
+- Extremely narrow weight engines (4 bit)
+- Compressed sparse column (CSC) matrix representation
+
+## EIE implementation
+
+- Per-Activation-Formula
+- Accelerates sparse and weight sharing networks
+- Uses CSC representation
+ - PE quickly finds non-zero elements in a column
+- Explain general procedure
+- Show image of the architecture
+- Non-Zero filtering
+- Queues for load balancing
+- Two different SRAM banks for pointers (16 bit) to column borders
+- Each entry: 8 bit width (4 bit reference and 4 bit activation register index)
+- Table lookup / weight decoding of reference in the same cycle
+- Arithmetic Unit: Performs Multiply-Accumulate
+- Read/Write unit
+ - Source and destination register files
+ - Change their role on each layer
+ - Feed-Forward networks
+
+## EIE evaluation
+
+- Speedup: 189x, 13x, 307x faster than CPU, GPU and mGPU
+ - EIE latency focused: Batch size of 1
+- Throughput: 102 GOP/s compressed -> 3 TOP/s uncompressed
+- Energy efficiency: 24,000x, 3,400x, 2,700x more energy efficient than CPU, GPU and mGPU
+
+
+- Speed calculation: Measure wall clock times for different workloads
+- Energy calculation: Total computation time x average measured power
+- Sources of energy consumption and reasons for less energy consumption:
+ - SRAM access instead of DRAM
+ - Compression type and architecture reduce the number of memory reads
+ - Vector sparsity encoding in CSC representation
+
+## Limitations / future optimizations
+
+- EIE only capable of matrix-multiplication
+- Other optimization methods
+ - In-Memory Acceleration
+ -
\ No newline at end of file
diff --git a/Presentation/template.tex b/Presentation/template.tex
index 0bca3f4..5fbf1c7 100644
--- a/Presentation/template.tex
+++ b/Presentation/template.tex
@@ -24,9 +24,9 @@
]{tubs}
% Titelseite
-\title{Meine Pr\"asentation}
-\subtitle{Das Corporate Design in \LaTeX}
-\author{Max Mustermann}
+\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
+%\subtitle{Das Corporate Design in \LaTeX}
+\author{Leonard Kugis}
% Titelgrafik, automatisch beschnitten, Weitere Optionen: <scaled/cropx/cropy>
% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}