From 8706f04e2d90c4425f0380a701358dda61f40bb8 Mon Sep 17 00:00:00 2001
From: Leonard Kugis <leonard@kug.is>
Date: Mon, 23 Jan 2023 23:50:19 +0100
Subject: Finished evaluation, finished future work

---
 Paper/paper.tex | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 2 deletions(-)

(limited to 'Paper/paper.tex')

diff --git a/Paper/paper.tex b/Paper/paper.tex
index 32e1bf3..b5f2269 100644
--- a/Paper/paper.tex
+++ b/Paper/paper.tex
@@ -46,6 +46,8 @@ This paper gives an overview over different compression methods for \emph{Deep N
 (section~\ref{sec:compression}), after discussing the metrices used to measure
 inference engines (section~\ref{sec:metrices})
 and shows how they are applied in an actual hardware architecture: the \emph{Efficient Inference Engine} (\emph{EIE}) (section~\ref{sec:implementation}).
+After that, it is evaluated and compared to other hardware accelerators (section~\ref{sec:eval}).
+Finally, some further optimization methods for the EIE are presented in section~\ref{sec:future}.
 
 \subsection{Deep Neural Networks}
 
@@ -386,7 +388,7 @@ but are underrepresented by occurrence using this method. This would lead to hig
 because there are less centroids used. Because of this, linear initialization in the value domain has been established as the
 best initialization method \cite{Han2015DeepCC}.
 
-\subsection{Huffman encoding}
+\subsection{Huffman encoding}\label{sec:huffman}
 
 Another compression method that can be applied to DNNs is the Huffman encoding.
 
@@ -423,7 +425,7 @@ Huffman encoding archieves $35$x - $49$x compression rate \cite{Han2015DeepCC}.
 Another remarkable advantage of this compression method is that it is lossless and has therefore
 no impact on the accuracy of the DNN.
 
-\subsection{HashNets}
+\subsection{HashNets}\label{sec:hashnets}
 
 A relatively recent compression/optimization technique for weights of DNNs are HashNets \cite{10.5555/3045118.3045361}.
 Using HashNets, no actual values need to be stored in the weight matrix (not even index values),
@@ -586,6 +588,139 @@ To know the iteration boundaries, the column pointers are stored seperately. In
 the first column has pointer $0$ (because it is the first entry in total). The second entry has pointer $3$,
 because this PE has $3$ non-zero values assigned to it in the first column.
 
+\section{Evaluation and comparison}\label{sec:eval}
+
+\begin{figure*}[t]
+    \centering
+    \includegraphics[width=\textwidth]{resources/eval_speed_png} \\
+    \vspace{0.5cm}
+    \includegraphics[width=\textwidth, keepaspectratio]{resources/eval_energy_png}
+    \caption{Speedup and energy efficiency comparison \cite{10.1109/ISCA.2016.30}}
+    \label{eval_speed_energy}
+\end{figure*}
+
+Fig.~\ref{eval_speed_energy} displays the speed and energy comparison with standard
+hardware components \emph{CPU}, \emph{GPU} and \emph{mGPU}. For benchmarking, different layers of different DNN models are used.
+\emph{Alex-6}, \emph{Alex-7}, \emph{Alex-8} are layers of the AlexNet.
+\emph{VGG-6}, \emph{VGG-7}, \emph{VGG-8} are layers of the VGG-Net (VGG: Visual Geometry Group).
+\emph{NT-We}, \emph{NT-Wd}, \emph{NT-LSTM} are layers of the NeuralTalk net.
+Speedup and energy efficiency are measured on the three platforms for each of those layers with and
+without compression. The baseline is the inference using CPU on the uncompressed model.
+
+\subsection{Methodology}
+
+\subsubsection{Hardware platforms}
+
+For the CPU an \emph{Intel Core i7 5930k} is used.
+As GPU an \emph{NVIDIA GeForce GTX Titan X} is used.
+As mGPU an \emph{NVIDIA Tegra K1} is used.
+All of them come with their own power reporting tools, used to measure the energy consumption
+and speed.
+
+\subsubsection{Speed}
+
+The speed is measured with the following formula:
+
+\begin{align}
+    \text{speed} = \frac{\text{workload}}{\text{duration}}, [\text{Frames}/\text{s}]
+\end{align}
+
+Batch sizes of 1 are chosen, because the EIE is targeting real-time applications
+with low latency. In these environments, low batch sizes are the most common.
+
+\subsubsection{Energy efficiency}
+
+The energy efficiency is measured with the following formula:
+
+\begin{align}
+    \text{eff} = \frac{\text{workload}}{\text{average power consumption} \cdot \text{duration}}, [\text{Frames}/\text{J}]
+\end{align}
+
+\subsection{Results}
+
+The EIE has a speedup factor of $189$x, $13$x, $307$x compared to CPU, GPU and mGPU on the compressed models.
+Theoretically, when compared with uncompressed inference on standard architectures, the compression rate must
+be factored in: a compressed inference speed of $103$ GOP/s corresponds to an uncompressed inference speed of $3$ TOP/s.
+However, in practice compression only yields a speedup of $3$x on standard platforms.
+This shows the impact of the dedicated hardware architecture to handle compressed models.
+
+The EIE is $24000$x, $3400$x, $2700$x more energy efficient compared to CPU, GPU and mGPU on the compressed models.
+Remarkably, compression yields little to no benefit for the standard hardware architectures, while it does
+on a large scale when moving to the EIE. The main reasons for this energy efficiency benefit are the change in memory technology
+from DRAM to SRAM, reduction of memory accesses through compression and the storage of weights in
+compressed sparse column representation.
+
+\subsection{Comparison with other hardware accelerators}
+
+\begin{figure*}[t]
+    \centering
+    \includegraphics[width=\textwidth]{resources/accelerators_table}
+    \caption{EIE compared with different DNN hardware accelerators \cite{10.1109/ISCA.2016.30}}
+    \label{accelerators}
+\end{figure*}
+
+Fig.~\ref{accelerators} shows a comparison between multiple hardware accelerators for inference of
+DNNs, namely A-Eye \cite{10.1145/2847263.2847265}, DaDianNao \cite{7011421} and TrueNorth \cite{10.1126/science.1254642} (amongst general purpose platforms).
+
+\subsubsection{A-Eye}
+
+A-Eye is a hardware accelerator targeting computational-centric parts of DNNs, namely the convolutional layers,
+which make up more than 90\% of computational cost \cite{9082126}. It does not approach the problems
+considered here regarding memory accesses of the fully connected layers; it does not address them. It also stores the main portion of the weights
+on external DDR3-DRAM and uses SRAM just as internal buffer. Though it uses efficient pooling to maximize
+the benefit of burst read-outs, it is not as efficient as it would be if it were fully implemented in SRAM technology.
+Additionally, it is implemented on FPGA (Xilinx Zynq XC7Z045), therefore it lacks energy efficiency compared
+to ASICs like the EIE.
+
+It has an overall performance of $136.97$ GOP/s ($33$ Frames/s) and an energy efficiency of $14.22$ GOP/s/W ($3.43$ Frames/J) \cite{10.1145/2847263.2847265}.
+
+\subsubsection{DaDianNao}
+
+DaDianNao is a hardware accelerator which focuses on both computational cost and memory accesses.
+For this purpose, it has embedded (on-chip) DRAM for parameter storage. Doing so, it achieves
+$450.65$x speedup compared to GPU. However, unlike the EIE, it is incapable of handling compressed
+DNNs and its main memory is still based on DRAM technology, while SRAM would be much faster.
+A benefit of this accelerator is its scalability. It consists of multiple nodes of the same type,
+and has been implemented in systems of up to 64 nodes, while this can be extended even further.
+
+With this 64-chip system it has a throughput of $147938$ Frames/s and an energy efficiency of $9263$ Frames/J.
+While the speed is better on a large scale due to its scalability, it has a bad energy efficiency.
+
+\subsubsection{TrueNorth}
+
+The TrueNorth supercomputer is a non-von Neumann system with transistor-based programmable neurons.
+This way, it overcomes the memory bottleneck, and technically the speed is comparable to SRAM accesses.
+For a standard VGA video at $30$ FPS, the chip consumes only $63$\,mW, which gives it a high energy efficiency
+of $10839$ Frames/J, compared to other hardware accelerators. However, the EIE is even better in energy efficiency by a
+factor of $13$--$18$, depending on process size and number of PEs.
+Also, due to its specialized architecture with programmable neurons, it has a bad area efficiency of only $4.63$ Frames/s/$\text{mm}^2$,
+which is only $\sim 0.23$\% of EIE. The throughput is relatively small compared to EIE, because it is also
+unable to handle compressed DNNs.
+
+\section{Future work}\label{sec:future}
+
+Different compression algorithms have been presented in section~\ref{sec:compression}.
+Not all of them are used for the EIE implementation. Some of them are orthogonal to the used compression methods,
+so they can be implemented and applied to the DNN without interference with the EIE.
+The different pruning strategies are an example of that. Other compression methods need an adjustment of the hardware architecture to different extents.
+
+For Huffman encoding (section~\ref{sec:huffman}) the different bit widths of the weights need to be handled
+by the hardware to fully exploit the possible compression ratio. Also, Huffman tree lookups can be optimized
+to reduce the number of memory accesses. All in all this is a promising optimization with
+a lossless compression factor of up to $49$x.
+
+Another promising method is HashNets (section~\ref{sec:hashnets}). They omit the index lookup from the matrix entirely,
+and just compute lookup indices from hash functions.
+These hash functions need to be implemented in hardware to be efficient considering energy and speed,
+but it is technically possible \cite{10.5555/3045118.3045361}. A benefit of this method is the adjustable compression factor of up to
+$64$x, depending on the accuracy constraints (this compression method is not lossless). This way, the architecture,
+or at least the usage of it, can be adjusted to the user's needs.
+
+Further optimization methods are technology based. For computation, MAC operations can be outsourced to the memory,
+performing in-memory computation \cite{MUTLU201928}. This would make a large portion of data transfers obsolete, which increases throughput and
+energy efficiency. Also, the existing ALU-implementation can be replaced by approximating circuits \cite{1274006},
+at the cost of a less accurate system, but with another increase in speed and energy efficiency.
+
 \bibliographystyle{IEEEtran}
 \bibliography{Paper/references}
 
-- 
cgit v1.2.1