author | Leonard Kugis <leonard@kug.is> | 2023-01-07 14:54:34 +0100
committer | Leonard Kugis <leonard@kug.is> | 2023-01-07 14:54:34 +0100
commit | 036b0c74c8f712e9fbf55ef41b8d2ae13feb2baf (patch)
tree | c33a3de067e1ac8ef756f05521a6534bafdfa4fb /Presentation
parent | 5ef7ef8d615ab3098e0b90f18af939908d4f4dfa (diff)
Finished presentation slides
Diffstat (limited to 'Presentation')
-rw-r--r-- | Presentation/presentation.tex | 488
-rw-r--r-- | Presentation/structure.md | 83
-rw-r--r-- | Presentation/template.tex | 6
3 files changed, 574 insertions, 3 deletions
diff --git a/Presentation/presentation.tex b/Presentation/presentation.tex
new file mode 100644
index 0000000..18be3aa
--- /dev/null
+++ b/Presentation/presentation.tex
@@ -0,0 +1,488 @@
+% Official example file for the beamer template from tubslatex version 0.3beta2
+\documentclass[fleqn,11pt,aspectratio=43]{beamer}
+
+\usepackage[english]{babel}
+\usepackage[utf8x]{inputenc}
+\usepackage{graphicx}
+\usepackage{svg}
+\usetheme[%
+    %nexus,% use Nexus fonts
+    %lnum,% use lining figures
+    %cmyk,%<rgbprint>, choice of color model
+    blue,%<orange/green/violet> choice of secondary color scheme
+    dark,%<light,medium> choice of brightness
+    %colorhead,% colored header bar
+    %colorfoot,% colored footer bar on the title page
+    colorblocks,% colored block backgrounds
+    %nopagenum,% no page number in the footer
+    %nodate,% no date in the footer
+    tocinheader,% table of contents in the header bar
+    %tinytocinheader,% tiny header table of contents
+    %widetoc,% wide header table of contents
+    %narrowtoc,% narrow header table of contents
+    %nosubsectionsinheader,% no subsections in the header table of contents
+    %nologoinfoot,% do not show a logo in the footer area
+    ]{tubs}
+
+% Title page
+\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
+%\subtitle{Das Corporate Design in \LaTeX}
+\author{Leonard Kugis}
+% Title graphic, cropped automatically; further options: <scaled/cropx/cropy>
+% \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
+%\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}
+
+% Logo shown at the top right of title pages and at the bottom right of
+% content pages. It is scaled automatically in both cases.
+%\logo{\includegraphics{dummy_institut.pdf}}
+%\logo{Institut für Unkreativität\\und Schreibschwäche}
+
+\begin{document}
+
+\begin{frame}[plain]
+\titlepage
+\end{frame}
+
+\begin{frame}{Table of contents}
+    \tableofcontents
+\end{frame}
+
+\section{Deep Neural Networks}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/cnn}
+        \caption{Deep Neural Network}
+    \end{figure}
+\end{frame}
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+        \caption{Fully connected layer}
+    \end{figure}
+\end{frame}
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/fcn}
+        \caption{Fully connected layer}
+    \end{figure}
+    \begin{itemize}
+        \item $b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j)$
+        \item Multiply-accumulate (MAC) operations
+        \item Matrix $W$ can be sparse
+    \end{itemize}
+\end{frame}
+
+\section{Motivation}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference metrics}
+    \begin{itemize}
+        \item Throughput \\
+        \textcolor{gray}{Amount of data processed per unit of time}
+        \item Latency \\
+        \textcolor{gray}{Time needed to process a single workload}
+        \item Model size \\
+        \textcolor{gray}{Amount of storage needed to hold the model (e.g. its weights)}
+        \item Energy use \\
+        \textcolor{gray}{Energy consumed to process a specific amount of data}
+    \end{itemize}
+\end{frame}
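+
+% Editor's note (illustrative, not part of the original slides): the MAC
+% formula from the DNN section makes the model-size metric concrete. A fully
+% connected layer with $n$ inputs and $m$ outputs performs $m \cdot n$ MACs
+% and stores $m \cdot n$ weights; a hypothetical $4096 \times 4096$ layer
+% therefore needs $4096 \cdot 4096 \approx 16.8$ million MACs and
+% $16.8\,\mathrm{M} \cdot 4\,\mathrm{B} \approx 67$ MB in 32-bit float.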
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_png}
+        \caption{Common dataflow models in inference architectures}
+    \end{figure}
+\end{frame}
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.65\textwidth, keepaspectratio]{resources/dnn_dataflows_access_png}
+        \caption{Memory accesses of the common dataflow models}
+    \end{figure}
+\end{frame}
+
+\begin{frame}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.9\textwidth, keepaspectratio]{resources/memory_latency_png}
+        \caption{Memory hierarchy and energy cost of its levels}
+    \end{figure}
+\end{frame}
+
+\section{Compression}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}
+    \begin{itemize}
+        \item Dynamic
+        \begin{itemize}
+            \item Input data
+        \end{itemize}
+        \item Static (parameters)
+        \begin{itemize}
+            \item Weights
+            \item Parameters of activation functions
+        \end{itemize}
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{AlexNet}
+    \begin{itemize}
+        \item 5 convolutional layers
+        \item 3 fully connected layers
+        \item $\sim 62$ million parameters
+        \item $\sim 240$ MB in 32-bit float representation
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{Basis projection}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.75\textwidth, keepaspectratio]{resources/basis_projection_png}
+        \caption{Basis projection and resulting weight distribution}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning}
+    \begin{itemize}
+        \item Idea: remove unimportant weights that have little impact on accuracy
+    \end{itemize}
+    \begin{figure}[h]
+        \centering
+        \vspace{0.5cm}
+        \includegraphics[width=0.3\textwidth, keepaspectratio]{resources/pruning}
+        \caption{3-step pruning working principle}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Pruning}
+    \begin{minipage}{0.24\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/pruning}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.74\linewidth}
+        \begin{itemize}
+            \item Magnitude-threshold-based pruning
+            \begin{itemize}
+                \item Remove a weight if its magnitude is below a specific threshold
+            \end{itemize}
+            \item Optimal Brain Damage $(a)$ \& Optimal Brain Surgeon $(b)$
+            \begin{itemize}
+                \item Remove weights based on their sensitivity with respect to the objective function
+                \item Measure sensitivity with second-order derivatives: the diagonal of the Hessian $(a)$ or the full Hessian $(b)$
+                \item Remove the least sensitive weights first
+            \end{itemize}
+            \item Biased weight decay
+            \begin{itemize}
+                \item Pruning at update time
+                \item Adjust the weight-update term so that large weights persist and small weights decay towards zero
+            \end{itemize}
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Weight quantization}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.5\textwidth, keepaspectratio]{resources/clustering}
+        \caption{Weight quantization}
+    \end{figure}
+    \begin{itemize}
+        \item Group similar weights into clusters
+        \item Fine-tune the cluster centroids with the grouped gradient matrix during the update step
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight quantization}
+    \begin{itemize}
+        \item Minimize the Within-Cluster Sum of Squares (WCSS): $\text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2$
+        \item Perform k-means clustering:
+        \begin{enumerate}
+            \item Initialize $k$ cluster centroids
+            \item Assign each weight to the cluster with the nearest centroid
+            \item Recalculate the cluster centroids
+        \end{enumerate}
+    \end{itemize}
+\end{frame}
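+
+% Editor's note: a minimal sketch (not part of the original slides) of the
+% k-means weight quantization described above; plain numpy, and the linear
+% centroid initialization and all names are assumptions for illustration.
+%
+%   import numpy as np
+%
+%   def quantize_weights(w, k=16, iters=20):
+%       """Cluster the flat weight array w into k shared values."""
+%       # Linear initialization: centroids spread evenly over [min, max].
+%       centroids = np.linspace(w.min(), w.max(), k)
+%       for _ in range(iters):
+%           # Assignment step: index of the nearest centroid per weight.
+%           idx = np.abs(w[:, None] - centroids[None, :]).argmin(axis=1)
+%           # Update step: recompute each centroid as the mean of its cluster.
+%           for i in range(k):
+%               if np.any(idx == i):
+%                   centroids[i] = w[idx == i].mean()
+%       return idx, centroids
+%
+% With k = 16, each weight is replaced by a 4-bit index into a small codebook,
+% matching the 4-bit weight indices EIE uses later on.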
+
+\begin{frame}{Weight quantization}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.8\textwidth, keepaspectratio]{resources/centroid_initialization}
+        \caption{Different centroid initialization methods}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Huffman encoding}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/huffman}
+        \caption{Huffman encoding example}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{HashNets}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.7\textwidth, keepaspectratio]{resources/hashnets}
+        \caption{HashNets encoding}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{HashNets}
+    \begin{minipage}{0.49\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/hashnets}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.49\linewidth}
+        \begin{itemize}
+            \item Virtual weight matrix $\textbf{V}^{\ell}$
+            \item One-way hash function $h^{\ell}(i, j)$
+            \item Weight array $w^{\ell}$
+            \item The hash function returns an index into the weight array
+            \item $w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij}$
+        \end{itemize}
+    \end{minipage}
+\end{frame}
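+
+% Editor's note: a minimal sketch (not part of the original slides) of the
+% HashNets lookup described above; Python's built-in hash stands in for the
+% one-way hash function, and all names are assumptions for illustration.
+%
+%   import numpy as np
+%
+%   def virtual_weight(w, i, j, seed=0):
+%       """Return V[i, j] for a layer whose real storage is the 1-D array w."""
+%       # The hash maps the virtual position (i, j) to a slot in w, so many
+%       # virtual entries share one stored weight.
+%       return w[hash((seed, i, j)) % len(w)]
+%
+%   def layer_forward(w, a, m):
+%       """Forward pass over the virtual m x len(a) weight matrix."""
+%       b = np.zeros(m)
+%       for i in range(m):
+%           for j in range(len(a)):
+%               b[i] += virtual_weight(w, i, j) * a[j]
+%       return np.maximum(b, 0.0)  # ReLU
+%
+% The compression factor is set by len(w); storing 64x fewer real weights than
+% the virtual matrix holds gives the "up to 64x" figure quoted later.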
+
+\begin{frame}{Storage format}
+    \begin{itemize}
+        \item Compressed sparse column (CSC) /
+              compressed sparse row (CSR) representation
+        \item Encode each column $W_j$ as vectors $v$ and $z$
+        \begin{itemize}
+            \item $v$: non-zero weights
+            \item $z$: number of zeros before the corresponding element in $v$
+        \end{itemize}
+    \end{itemize}
+    \begin{itemize}
+        \item E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ becomes
+              $v = [1, 2, 0, 3]$, $z = [2, 0, 15, 2]$ (a zero is padded into $v$ because $z$ is capped at 15)
+        \item The $v$'s and $z$'s of all columns are stored in a single pair of arrays
+        \item Vector $p$ with $p_j$ pointing to the first element of column $W_j$
+    \end{itemize}
+\end{frame}
+
+\section{EIE implementation}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{EIE implementation}
+    \begin{itemize}
+        \item Optimizes the per-activation formula:
+        \begin{align}
+            b_i = \text{ReLU}(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j)
+        \end{align}
+        \item $X_i$: set of columns $j$ with $W_{ij} \neq 0$
+        \item $Y$: set of indices $j$ in $a$ for which $a_j \neq 0$
+        \item $I_{ij}$: 4-bit index
+        \item $S$: shared lookup table
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{Weight matrix segmentation}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_matrix}
+        \includegraphics[width=0.52\textwidth, keepaspectratio]{resources/eie_layout}
+        \caption{Weight matrix segmentation and memory layout}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw}
+        \caption{Hardware architecture}
+    \end{figure}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.39\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_zero}
+            \caption{Non-zero detection node}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.59\linewidth}
+        \begin{itemize}
+            \item Filters zero elements in the input vector $a$
+            \item Broadcasts non-zero elements $a_j$ and their indices $j$ to all PEs
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.39\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_pointer}
+            \caption{Pointer read unit}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.59\linewidth}
+        \begin{itemize}
+            \item Start and end pointers to the columns $W_j$ are stored in separate SRAM banks
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.39\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_matrix}
+            \caption{Sparse matrix read unit}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.59\linewidth}
+        \begin{itemize}
+            \item Reads the weight values $v$ and zero counts $z$ for the current operation based on the pointers
+            \item SRAM word length: 64 bit; entries of $v$ and $z$ are 4 bit each \\
+                  $\rightarrow$ 8 entries per word
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.39\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_alu}
+            \caption{Arithmetic unit}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.59\linewidth}
+        \begin{itemize}
+            \item Receives the weight value $v$, the destination accumulator register index $x$ and the activation value $a_j$
+            \item Calculates $b_x = b_x + v \cdot a_j$
+            \item Accumulates the relative indices $x$ and forwards the absolute target address
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.29\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_rw}
+            \caption{Read/write unit}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.69\linewidth}
+        \begin{itemize}
+            \item Maintains source ($a$) and destination ($b$) activation values
+            \item Feed-forward network: the destination values of one layer are the source activation values of the next layer
+            \item The register banks exchange roles on each layer
+        \end{itemize}
+    \end{minipage}
+\end{frame}
+
+\begin{frame}{Hardware implementation}
+    \begin{minipage}{0.24\linewidth}
+        \begin{figure}[h]
+            \centering
+            \includegraphics[width=\textwidth, keepaspectratio]{resources/eie_hw_lnzd}
+            \caption{ReLU \& leading non-zero detection unit}
+        \end{figure}
+    \end{minipage}
+    \hfill
+    \begin{minipage}{0.74\linewidth}
+        \begin{itemize}
+            \item Applies the ReLU function to the destination values
+            \item Detects the first PE-local non-zero destination value
+            \item Sends it to the group's leading non-zero detection unit, which distributes it to the PEs for the next cycle
+        \end{itemize}
+    \end{minipage}
+\end{frame}
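+
+% Editor's note: a minimal software model (not part of the original slides)
+% of the CSC format and the per-activation computation described in this
+% section; all names are assumptions, and the shared codebook S with 4-bit
+% indices is simplified to plain float weights.
+%
+%   import numpy as np
+%
+%   def csc_encode(W):
+%       """Encode W column by column into (v, z, p); z is capped at 15."""
+%       v, z, p = [], [], [0]
+%       for col in W.T:
+%           run = 0
+%           for w in col:
+%               if w == 0 and run < 15:
+%                   run += 1
+%               else:
+%                   v.append(w)      # pads a zero weight when the run hits 15
+%                   z.append(run)
+%                   run = 0
+%           p.append(len(v))         # p[j] points to the start of column j
+%       return np.array(v), np.array(z), np.array(p)
+%
+%   def eie_matvec(v, z, p, a, m):
+%       """b = ReLU(W a), skipping zero activations and zero weights."""
+%       b = np.zeros(m)
+%       for j in np.nonzero(a)[0]:   # non-zero detection on the input vector
+%           i = -1
+%           for k in range(p[j], p[j + 1]):
+%               i += z[k] + 1        # accumulate relative row indices
+%               b[i] += v[k] * a[j]  # MAC into accumulator register b[i]
+%       return np.maximum(b, 0.0)
+%
+% Encoding the example column from the storage-format slide reproduces
+% v = [1, 2, 0, 3] and z = [2, 0, 15, 2].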
+
+\section{Evaluation}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Speed and energy}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_speed_png}\\
+        \vspace{0.5cm}
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+        \caption{Speedup and energy efficiency comparison}
+    \end{figure}
+    \begin{itemize}
+        \item Throughput: 102 GOP/s on the compressed network, equivalent to 3 TOP/s on the uncompressed network
+    \end{itemize}
+\end{frame}
+
+\begin{frame}{Accelerator comparison}
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=\textwidth, keepaspectratio]{resources/accelerators_table}
+        \caption{EIE compared with different DNN hardware accelerators}
+    \end{figure}
+\end{frame}
+
+\section{Future work}
+
+\begin{frame}{Table of contents}
+    \tableofcontents[currentsection]
+\end{frame}
+
+\begin{frame}{Inference accelerators}
+    \begin{itemize}
+        \item Exploitation of different compression methods
+        \begin{itemize}
+            \item Huffman encoding (35--49x compression for the full pruning, quantization and Huffman pipeline)
+            \item HashNets (variable compression, up to 64x)
+        \end{itemize}
+        \item Combination with other optimization methods
+        \begin{itemize}
+            \item In-memory computation
+            \item Approximate circuits
+        \end{itemize}
+        \item Optimization of the hardware itself
+        \begin{itemize}
+            \item Different storage technologies (e.g. ReRAM)
+        \end{itemize}
+    \end{itemize}
+\end{frame}
+
+\begin{frame}[highlight]
+    The End
+\end{frame}
+
+\end{document}
\ No newline at end of file
diff --git a/Presentation/structure.md b/Presentation/structure.md
new file mode 100644
index 0000000..8c772b6
--- /dev/null
+++ b/Presentation/structure.md
@@ -0,0 +1,83 @@
+# EIE: Efficient Inference Engine on Compressed Deep Neural Network
+
+## Deep Neural Network
+
+- Convolutional layers
+- Fully-connected layers
+- In FC layers: trained weights; this talk focuses on inference only
+- Multiply-accumulate (MAC) on each layer
+- DNN dataflows
+- Convolutional layers: 5% of memory, 95% of FLOPs
+- FC layers: 5% of FLOPs, 90-95% of memory
+
+## Motivation
+
+- Inference metrics: throughput, latency, model size, energy use
+- Uncompressed DNN: does not fit into SRAM, memory accesses to/from DRAM
+- Von Neumann bottleneck
+- Figure from Chen 2016
+- Additional levels of indirection because of indices (weight reuse)
+
+## Compression
+
+- In general: encode the parameters so that the number of bits per weight is reduced
+
+Trivial:
+
+- Apply different kernels/filters to the input
+- Apply pooling to the inputs (runtime memory)
+
+More complex:
+
+- Pruning (remove unimportant weights and retrain, 2 approaches)
+  - Encode with relative indexing
+- Weight quantization with clustering
+  - Group similar weights into clusters
+  - Minimize WCSS
+  - Different methods to initialize the cluster centroids, e.g. random, linear, CDF-based
+  - Indirection because of the shared weight-table lookup
+- Huffman encoding (binary tree built from weight frequencies, applied globally; see the sketch after this list)
+- Fixed-point quantization of activation functions (refer to CPU optimization)
+- Extremely narrow weight engines (4 bit)
+- Compressed sparse column (CSC) matrix representation
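+
+A minimal sketch (editorial addition, not part of the original notes) of the
+Huffman step: more frequent quantized-weight indices get shorter bit strings;
+the function name and the toy input are illustrative only.
+
+```python
+import heapq
+from collections import Counter
+
+def huffman_code(symbols):
+    """Build a prefix code from an iterable of quantized weight indices."""
+    # Heap of (frequency, tiebreak, {symbol: bitstring}) subtrees.
+    heap = [(f, i, {s: ""}) for i, (s, f) in enumerate(Counter(symbols).items())]
+    heapq.heapify(heap)
+    tiebreak = len(heap)
+    while len(heap) > 1:
+        f1, _, c1 = heapq.heappop(heap)
+        f2, _, c2 = heapq.heappop(heap)
+        # Merge the two rarest subtrees, prefixing their codes with 0 / 1.
+        merged = {s: "0" + b for s, b in c1.items()}
+        merged.update({s: "1" + b for s, b in c2.items()})
+        heapq.heappush(heap, (f1 + f2, tiebreak, merged))
+        tiebreak += 1
+    return heap[0][2]
+
+print(huffman_code([0, 0, 0, 0, 1, 1, 2, 3]))  # index 0 gets the shortest code
+```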
+
+## EIE implementation
+
+- Per-activation formula
+- Accelerates sparse and weight-sharing networks
+- Uses the CSC representation
+  - The PE quickly finds non-zero elements in a column
+- Explain the general procedure
+- Show image of the architecture
+- Non-zero filtering
+- Queues for load balancing
+- Two different SRAM banks for the 16-bit pointers to the column boundaries
+- Each entry: 8 bit wide (4-bit weight reference and 4-bit zero count)
+- Table lookup / weight decoding of the reference in the same cycle
+- Arithmetic unit: performs multiply-accumulate
+- Read/write unit
+  - Source and destination register files
+  - Exchange their roles on each layer
+  - Feed-forward networks
+
+## EIE evaluation
+
+- Speedup: 189x, 13x, 307x faster than CPU, GPU and mGPU
+  - EIE is latency-focused: batch size of 1
+- Throughput: 102 GOP/s compressed -> 3 TOP/s uncompressed
+- Energy efficiency: 24,000x, 3,400x, 2,700x more energy efficient than CPU, GPU and mGPU
+
+
+- Speed calculation: measure wall-clock times for different workloads
+- Energy calculation: total computation time x average measured power
+- Sources of energy consumption and reasons for the lower energy consumption:
+  - SRAM access instead of DRAM
+  - Compression type and architecture reduce the number of memory reads
+  - Vector sparsity encoding in the CSC representation
+
+## Limitations / future optimizations
+
+- EIE is only capable of matrix multiplication
+- Other optimization methods
+  - In-memory acceleration
+  -
\ No newline at end of file
diff --git a/Presentation/template.tex b/Presentation/template.tex
index 0bca3f4..5fbf1c7 100644
--- a/Presentation/template.tex
+++ b/Presentation/template.tex
@@ -24,9 +24,9 @@
 ]{tubs}
 
 % Title page
-\title{Meine Pr\"asentation}
-\subtitle{Das Corporate Design in \LaTeX}
-\author{Max Mustermann}
+\title{EIE: Efficient Inference Engine on Compressed Deep Neural Network}
+%\subtitle{Das Corporate Design in \LaTeX}
+\author{Leonard Kugis}
 % Title graphic, cropped automatically; further options: <scaled/cropx/cropy>
 % \titlegraphic[cropped]{\includegraphics{infozentrum.jpg}}
 %\titlegraphic[scaled]{\includegraphics{titlepicture.jpg}}