From ce974e86afd59037c38cb07f4f6b5458a81ca06a Mon Sep 17 00:00:00 2001 From: Leonard Kugis Date: Mon, 23 Jan 2023 04:29:27 +0100 Subject: Paper: Introduction, Compression, Implementation --- Paper/paper.tex | 592 +++++++++++++++++++++++++++++++++++++++++++++++++++ Paper/references.bib | 238 +++++++++++++++++++++ Paper/template.tex | 2 +- 3 files changed, 831 insertions(+), 1 deletion(-) create mode 100644 Paper/paper.tex create mode 100644 Paper/references.bib (limited to 'Paper') diff --git a/Paper/paper.tex b/Paper/paper.tex new file mode 100644 index 0000000..32e1bf3 --- /dev/null +++ b/Paper/paper.tex @@ -0,0 +1,592 @@ +\documentclass[conference]{IEEEtran} +%\IEEEoverridecommandlockouts +% The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out. +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{graphicx} +\usepackage{textcomp} +\usepackage{xcolor} +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +\begin{document} + +\nocite{*} + +\title{EIE: Efficient Inference Engine on Compressed Deep Neural Networks} +\author{\IEEEauthorblockN{Leonard Kugis} +\IEEEauthorblockA{\textit{Abteilung Entwurf Integrierter Systeme} \\ +\textit{Technische Universität Braunschweig}\\ +Braunschweig, Germany \\ +l.kugis@tu-bs.de} +} + +\maketitle + +\begin{abstract} +With an increased use of deep neural networks instead of conventional algorithms for classification tasks, +there is demand for implementation in various environments with different constraints. +Especially high demand for inference in the embedded real time domain has emerged, with its hard +constraints on latency, energy use and storage. +To satisfy these constraints, dedicated hardware architectures need to be implemented to maximize +the efficiency, such as the \emph{Efficient Inference Engine} (\emph{EIE}). +It utilizes various compression techniques to compress existing deep neural network models +and implements a specialized hardware architecture to handle the compressed models +as efficient as possible. +\end{abstract} + +\begin{IEEEkeywords} +deep neural networks, compression, inference, weights, sparsity +\end{IEEEkeywords} + +\section{Introduction} + +This paper gives an overview over different compression methods for \emph{Deep Neural Networks} +(section~\ref{sec:compression}), after discussing the metrices used to measure +inference engines (section~\ref{sec:metrices}) +and shows how they are applied in an actual hardware architecture: the \emph{Efficient Inference Engine} (\emph{EIE}) (section~\ref{sec:implementation}). + +\subsection{Deep Neural Networks} + +\emph{Deep Neural Networks} (\emph{DNNs}) consist of different types of layers, namely +\emph{Convolutional Layers} and \emph{Fully Connected Layers}. +While for common DNNs (e.g. AlexNet \cite{10.1145/3065386}) convolutional layers make +up more than 90\% of computational cost \cite{9082126}, fully connected layers take +up over 90\% of the storage \cite{Cheng_2015_ICCV}. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/fcn}} + \caption{Fully connected layer} + \label{fcn} +\end{figure} + +Fig.~\ref{fcn} shows a fully connected layer in a DNN. +Each fully connected layer consists of an input vector $a$, an output vector $b$ and +a weight matrix $W$. +Essentially, the weight matrix gets multiplied by the input vector, +so that each combination of the entries $W_{ij}$ get multiplied by the component $a_j$, +on which an activation function $f$ is applied, forming a component in the output vector $b_i$. + +\begin{align}\label{mac} + b_i = f(\sum\limits_{j=0}^{n} W_{ij} a_j) +\end{align} + +Equ.~\ref{mac} shows this principle formulated. The process of multiplying the weights +with the components of the input vector is referred to as \emph{Multiply-Accumulate-Operation} (\emph{MAC}). +This is the essential operation in DNNs and therefore in focus for optimization. + +Input and output vectors are considered as dynamic data, while +weights and eventual parameters of the activation function are considered as static data. + +\section{Performance metrices}\label{sec:metrices} + +When it comes to optimization of inference architectures for DNNs, there are four major categories of +metrices to optimize on: + +\begin{itemize} + \item Throughput + \item Latency + \item Model size + \item Energy use +\end{itemize} + +\emph{Throughput}. Throughput measures how much data can be processed by the network in a fixed time. + +\emph{Latency}. Latency measures how long it takes for a fixed amount of data to be processed by the network from start to finish. + +\emph{Model size}. Model size indicates the amount of storage required to store the model. + +\emph{Energy use}. Energy use measures how much energy is needed to process a fixed amount of data. + +\subsection{Memory}\label{sec:memory} + +Since the 1980s there is a more and more evolving gap between memory latency and processing speed. +Recent CPUs are magnitudes faster than their memory counterparts \cite{carvalho2002gap}. +This is problematic, because in general the CPU has to wait during each memory access for the data +to arrive in order to process it further. While this waiting time can be optimized with +efficient pipelining, the principal still remains. +The memory accesses for the weights of the DNN make up the biggest part of energy consumption of the +hardware accelerator, and correlate also with latency and throughput \cite{DBLP:journals/corr/SzeCESZ16}. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/memory_latency}} + \caption{Memory hierarchy and energy cost of hierarchy levels \cite{DBLP:journals/corr/SzeCESZ16}} + \label{memory_latency} +\end{figure} + +Fig.~\ref{memory_latency} shows the energy consumption of each layer of a typical hardware accelerator +consisting of an external \emph{Dynamic Random Access Memory} (\emph{DRAM}) to store the model, +internal buffers (typically \emph{Static Random Access Memory} (\emph{SRAM})) and processing engines (\emph{PEs}). +Memory accesses to DRAM are 200x more costly than register accesses, and 33x more costly than accesses to +the SRAM-buffer. This results in the demand of storing most of the model in SRAM technology. +Because such on-chip SRAM capacity is limited, models must be compressed in order to fit onto the SRAM. + +\subsection{Existing DNNs} + +To put the model sizes of DNNs into perspective, Table~\ref{tab_dnn} gives an overview of +the number of parameters and storage sizes in 32-bit floating point representation. + +\begin{table}[htbp] +\caption{Model sizes of different DNN models} +\begin{center} +\begin{tabular}{|c|c|c|} +\hline +\textbf{DNN} & \textbf{Number of} & \textbf{Size in} \\ +& \textbf{Parameters} & \textbf{32-bit float} \\ +\hline +AlexNet \cite{choudhary2020comprehensive} & 61 M & 240 MB \\ +\hline +VGG-16 \cite{choudhary2020comprehensive} & 138 M & 512 MB \\ +\hline +LeNet-5 (caffe) \cite{NIPS2015_ae0eb3ee} & 431 K & 1.724 MB \\ +\hline +NeuralTalk (zoo) \cite{das2015neuraltalk} & 90 M & 360 MB \\ +\hline +\end{tabular} +\label{tab_dnn} +\end{center} +\end{table} + +These are common models to perform benchmarks on hardware accelerators +and compression algorithms. They will be used for evaluation between different +platforms later. + +\section{Compression}\label{sec:compression} + +Different compression algorithms exist for DNN models. Not only storage size benefits +from model compression, most methods also optimize the amount of MACs (Equ.~\ref{mac}) that need to be +performed. It does so by creating a sparse weight matrix from the original weight matrix. +Multiplication and accumulation with $0$-valued weights can be skipped and therefore it is desired +to have as many $0$s as weights as possible. + +\subsection{Basis projection} + +Basis projection utilizes the fact that row- and column-vectors of the weight matrices can be +represented by the linear combination of its basis vectors. +In common training algorithms, weights (the weight matrices) are trained directly and use the canonical basis vectors. +But this might not be the optimal basis vectors, resulting in the most amount of zeros. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/basis_projection_png}} + \caption{Basis projection and resulting weight distribution \cite{DBLP:journals/corr/SuleimanZS16}} + \label{basis_projection} +\end{figure} + +Fig.~\ref{basis_projection} depicts the method of basis projection and the resulting distributions of +weight values. It can be shown that the column vectors can be represented by projected vectors $\alpha$ +and a-priory known basis vectors $S_d$. As the basis projection is a reversible operation, the reverse +can later be applied to receive the actual results. + +Using this method, experimental results show, that the number of zero-valued weights can be increased from +$7$\% to $56$\% ($8.0$x increase) \cite{DBLP:journals/corr/SuleimanZS16}. +Further experiments have shown that, when used along with other compression methods such as pruning and +weight quantization, $3.6$x compression factor can be archieved. This results in $10$x less MAC operations and +$5$x reduction in power consumption \cite{DBLP:journals/corr/SuleimanZS16}. + +\subsection{Pruning} + +The key idea behind pruning is the removal of unimportant weights from the weight matrices, which have little or +no influence on the output accuracy. This influence is also referred to as \emph{sensitivity}. +For pruning, a common procedure has established \cite{Han2015DeepCC}: + +\begin{enumerate} + \item Train weights + \item Prune network + \item Retrain network + \item Go to 2. or abort if criterion is met +\end{enumerate} + +Under the assumption that there is an already trained network, weights (connections), neurons and convolutional filters +get removed from the network based on their output sensitivity. However, convolutional filters are not in this scope, +since they are not part of the fully connected layers of a DNN. Therefore, the main focus +is on weight/connection based pruning. + +Pruned weights are simply set to value $0$ and not considered in the retraining process. +This way, they can be handled efficiently by the hardware architecture, by just skipping the +MAC-operation for these weights. Neurons are pruned by just deleting the corresponding +row/column of the weight matrix. + +For weight/connection pruning, there are different methods on how to remove the weights and what the +sensitivity is measured on. They can be classified in three different main classes: + +\begin{itemize} + \item Magnitude threshold based pruning \cite{9253578} + \item Optimal Brain Damage \cite{NIPS1989_6c9882bb} / Optimal Brain Surgeon \cite{NIPS1992_303ed4c6} + \item Biased weight decay \cite{NIPS1988_1c9ac015} +\end{itemize} + +\subsubsection{Magnitude threshold based pruning} + +Magnitude threshold based pruning methods remove weights solely based on their magnitude. +The following simple rule is applied for the weight removal: + +\begin{align}\label{mtbp} + &\text{Remove weights $W_{ij}$ for which} \nonumber \\ + &\left| W_{ij} \right| < \vartheta \\ + &\vartheta := \text{Threshold} \nonumber +\end{align} + +Weights $W_{ij}$ with magnitude below the threshold $\vartheta$ are removed (set to $0$). +This assumes that values with low magnitude have a low impact on the accuracy +of the network and/or the objective function. +This is generally not always the case and therefore not the optimal solution. +However, experiments have shown model size decreases of $33$\% for LeNet and up +to $33$\% for DeepSpeech2 \cite{9253578}. + +This approach is often chosen in environments with focus on training speed, because +it does not require the calculation of e.g. gradients. +Therefore, it archives a training cost reduction of $64$\% for LeNet and up to $71$\% +for DeepSpeech2 \cite{9253578}. + +\subsubsection{Optimal Brain Damage} + +The \emph{Optimal Brain Damage} (\emph{OBD}) method removes weights based on their influence on the +objective function. The objective function can be any function formulating the error to be +minimized with training and measures the accuracy of the DNN. + +Formally, the sensitivity is derived from the diagonals of the Hessian of the weight matrix: + +\begin{align}\label{hessian} + & h_{kk} = \sum\limits_{(i,j) \in V_k} \frac{{\partial}^2 E}{\partial w_{ij}^2} \\ + & V_k := \text{Set of index pairs for matrix $W_k$} \nonumber +\end{align} + +After that the \emph{saliencies} are calculated as a measure of sensitivity: + +\begin{align}\label{saliencies} + & s_k = \frac{h_{kk} u_k^2}{2} \\ + & u_k := \text{Element $k$ of parameter vector} \nonumber +\end{align} + +For the pruning, all weight matrices undergo the following algorithm \cite{NIPS1989_6c9882bb}: + +\begin{enumerate} + \item Choose network architecture + \item Train network + \item Compute Hessian diagonals $h_{kk}$ for each parameter + \item Compute saliencies $s_k = \frac{h_{kk} u_k^2}{2}$ for each parameter + \item Sort parameters by saliency and remove low-saliency parameters + \item Go to step 2 or abort when accuracy threshold is reached +\end{enumerate} + +Experiments show that for LeNet a significant decrease in the \emph{Minimum-Mean-Square-Error} (\emph{MSE}) function +can be obtained compared to magnitude based methods with the same level of pruning, especially for high pruning amounts. +For $500$ remaining parameters, the MSE for OBD is $3$ magnitudes smaller than for magnitude based pruning. +However, this results in a generally high error rate and is therefore undesired. Optimal solutions +are obtained for $30$\% network pruning rate using this method. + +\subsubsection{Optimal Brain Surgeon} + +\emph{Optimal Brain Surgeon} (\emph{OBS}) works similar to OBD. Compared to OBD it does not neccessarily delete +the weights, but changes the strength of those weights. + +It does so by computing the inverse of the Hessian of the weight matrix \cite{NIPS1992_303ed4c6}: + +\begin{align}\label{saliencies} + & \textbf{H}_{m+1}^{-1} = \textbf{H}_{m}^{-1} - \frac{\textbf{H}_{m}^{-1} \textbf{X}_{m+1} \textbf{X}_{m+1}^T \textbf{H}_{m}^{-1}}{P + \textbf{X}_{m+1}^T \textbf{H}_{m}^{-1} \textbf{X}_{m+1}} \\ + & \textbf{H}^{-1} = \frac{1}{P} \sum\limits_{k=1}^{P} H_k^{-1} +\end{align} + +For the pruning, all weight matrices undergo the following algorithm \cite{NIPS1992_303ed4c6}: + +\begin{enumerate} + \item Train network. + \item Compute $\textbf{H}^{-1}$. + \item Find $q$ with smallest saliency $L_q = \frac{w_q^2}{2 \textbf{H}_{qq}^{-1}}$. If this gradient increase is much smaller than $E$, + delete $w_q$ and go to step 4. Otherwise go to step 5. + \item Use the $q$ from step 3 and update all weights. Go to step 2. + \item No more weights can be deleted without a large increase in $E$. Go to step 1 or abort if lower threshold in accuracy is reached. +\end{enumerate} + +Experiments show that this method can be used to reduce the number of weights by up to $90$\% without significant +impact on the accuracy, without retraining \cite{NIPS1992_303ed4c6}. + +\subsubsection{Biased weight decay} + +The \emph{Biased weight decay} method works during training-time of the model. +It bases on the backpropagation algorithm and adds a penalty term to the weight update formula: + +\begin{align} + & w_{n+1} = \alpha (- \frac{\partial E}{\partial w_n}) + \beta w_n \label{bwd1} \\ + & w_{n+1} = \alpha (- \frac{\partial E}{\partial w_{ij}} - 2 w_n) + w_n \label{bwd2} \\ + & \alpha < \frac{1}{2} := \text{Learning rate} \nonumber +\end{align} + +Equ.~\ref{bwd1} shows the update term of the backpropagation time considering an equal weight decay +for all values. Equ.~\ref{bwd2} shows the update term considering lower values to decay faster toward +zero while keeping high-valued weights the same. + +\subsection{Weight quantization} + +Storing the weight matrices as 32-bit floating point representation is not the most optimal +way of storage. While it preserves the precision the network was originally trained with, +it can be reduced significantly without high loss of accuracy. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/clustering}} + \caption{Weight quantization using clustering \cite{Han2015DeepCC}} + \label{clustering} +\end{figure} + +A key concept for quantization of weights is \emph{clustering}. Fig.~\ref{clustering} illustrates +this process. Using this method, similar valued weights are grouped into clusters. +The weight matrix then only stores the cluster index instead of the actual 32-bit floating point value. +Each cluster has its own \emph{centroid} for which the index value in the weight matrix +gets substituted. The centroids are the mean of all weight-values belonging to this +respective cluster. Additionally, the centroids can be fine-tuned by applying clustering to the gradient +of the weight matrix aswell, and update the centroid value with the gradient-means. + +This way, only a lookup-array with the cluster centroids and the respective indices in the weight matrix +need to be stored. In the example in Fig.~\ref{clustering} a $4 \times 4$ matrix of 32-bit floating point values +($512$ bit in total) can be reduced to a $4 \times 4$ matrix of 2-bit indices and a 4-value array of 32-bit floating point values +containing the centroids ($4 \cdot 4 \cdot 2 + 4 \cdot 32 = 160$ bit in total). + +Experiments have shown that using this quantization method only, compression factors of $37$x, $31$x and +$33$x can be archieved on AlexNet, VGG-16 and LeNet-5, while the loss in accuracy was +within tolerance \cite{Han2015DeepCC}. + +Technically, this compression method is not lossless, as it generalizes similar weights to their centroids. +So it comes down to the selection of the right centroids and which values to group into. If this is done efficiently, +the impact on accuracy is not measurable. For that, the k-means-algorithm is used. It minimizes +the \emph{Within-Cluster Sum of Squares} (\emph{WCSS}): + +\begin{align} + \text{argmin}_C \sum\limits_{i=1}^{k} \sum\limits_{\omega \in c_i} | \omega - c_i |^2 +\end{align} + +The k-means algorithm performs the following steps: + +\begin{enumerate} + \item Initialize $k$ cluster centroids + \item For all weights: + \begin{enumerate} + \item Assign weight to the cluster with nearest centroid + \item Recalculate cluster centroids + \end{enumerate} +\end{enumerate} + +A crucial degree of freedom for this is the initialization of the centroids. It determines +how fast the algorithm converges and to which values. There are different initialization methods. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/centroid_initialization}} + \caption{Different centroid initialization methods \cite{Han2015DeepCC}} + \label{init} +\end{figure} + +Fig.~\ref{init} depicts the different possible methods for centroid initialization. +Weight value probability density function (PDF) and cumulative distribution function (CDF) are plotted. +Note the peaks for low magnitude weights around 0. +Centroids can be initialized randomly, linear, or by linear sampling of the CDF values. +Generally, sampling the CDF values linearly and taking the corresponding weight values as centroids +minimizes the total error in weight values compared to their centroids. While this makes sense in most cases, +it does not in the case of DNN-weights. Here, weights with high value have a higher impact on the output, +but are underrepresented by occurrence using this method. This would lead to high value-centroid differences for those values, +because there are less centroids used. Because of this, linear initialization in the value domain has been established as the +best initialization method \cite{Han2015DeepCC}. + +\subsection{Huffman encoding} + +Another compression method that can be applied to DNNs is the Huffman encoding. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/huffman}} + \caption{Huffman encoding example} + \label{huffman} +\end{figure} + +Fig.~\ref{huffman} shows an encoding example. As a general principle, weights are encoded +with variable length in bits. Weights with many occurrences are encoded with less bits than +weights with less occurrences. The code book is stored as a binary tree, with edges encoding +$0$s and $1$s. The algorithm is the following: + +\begin{enumerate} + \item Count all weights and sort them by number of occurrence + \item Take the two weights with lowest number of occurrence and make them leafs of the tree + \item Connect the two nodes of the current layer together to a parent node + \item Take the weight with the next lowest occurrence and make it a leaf on the current layer + \item Go to step 3 or abort if no weight values are left +\end{enumerate} + +Weights are then encoded by the edges leading to their respective leafs. E.g. in Fig.~\ref{huffman} +the weight value $0$ is encoded as bit $0$, however weight value $2$ is encoded as bits $101$. +In this particular example, a compression rate of $74.05$\% is archieved. + +This compression method is especially efficient for models, in which one weight value is dominant. +When combined with other compression methods (e.g. pruning), which result in sparse matrices, +this is exactly the case with 0-values. + +Experiments have shown that when used together with pruning and weight quantization, +Huffman encoding archieves $35$x - $49$x compression rate \cite{Han2015DeepCC}. + +Another remarkable advantage of this compression method is that it is lossless and has therefore +no impact on the accuracy of the DNN. + +\subsection{HashNets} + +A relatively recent compression/optimization technique for weights of DNNs are HashNets \cite{10.5555/3045118.3045361}. +Using HashNets, no actual values need to be stored in the weight matrix (not even index values), +so the memory access for accessing these values can be completely omitted. +Instead, the values are depicted via a \emph{one-way hash function} $h$. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/hashnets}} + \caption{HashNets encoding (based on \cite{10.5555/3045118.3045361})} + \label{hashnets} +\end{figure} + +Fig.~\ref{hashnets} shows this principle. The real weight values are stored in an indexed +array. The hash function $h(i,j) := (i,j) \mapsto x$ takes the weight matrix indices $(i,j)$ +and computes the index $x$ for the lookup array, so that the lookup value of the array corresponds +to the actual value of the \emph{virtual weight matrix} $\textbf{V}$: + +\begin{align} + w^{\ell}_{h^{\ell}(i, j)} = \textbf{V}^{\ell}_{ij} +\end{align} + +This way, the weight matrix only exists virtually. Only the dimensions of the weight matrix and +the weight lookup array need to be stored, and there is no additional index lookup +neccessary for the weight matrix. This comes to the cost of computing the hash function, +but as already discussed in section~\ref{sec:memory} computation is not the limiting factor +when compared to memory accesses. + +The compression factor for this method can be variably chosen, depending on accuracy requirements. +The degree of freedom here is the quantization method (how much different values to quantisize to), +thus, the resulting weight array. Also, different hash functions can be used. + +\cite{10.5555/3045118.3045361} discusses effects of compression factors of up to $64$x, and uses +\emph{xxHash} as hash function. + +\subsection{Sparse matrix storage} + +When it comes to compression of weight matrices, the storage of the resulting sparse matrix needs +to be considered aswell. The \emph{Compressed sparse column} (\emph{CSC}) / \emph{Compressed sparse row} (\emph{CSR}) +format has been established for this purpose. + +With this method, a row/column $W_j$ of weight values is encoded as vectors $v$ and $z$. +$v$ contains the non-zero weight values of the column. $z$ is of equal length than $v$ and denotes +the amount of $0$s before the corresponding entry in $v$. +E.g. column $[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]$ gets encoded as +$v = [1, 2, 0, 3]$ and $z = [2, 0, 15, 2]$. An additional padding value of $0$ has been added after +$15$ concurrent $0$s, because the entries are assumed to be of 4-bit integer representation, and there are more than +$15$ concurrent $0$s after the value $2$. + +For this example, the method needs storage for $8$ values instead of $23$, which equals a compression rate of +$34.78$\%. This method is also used by the EIE and optimized by its architecture. + +\section{EIE implementation}\label{sec:implementation} + +\begin{figure*}[t] + \centering + \includegraphics[width=\textwidth]{resources/eie_hw} + \caption{Hardware architecture of the EIE \cite{10.1109/ISCA.2016.30}} + \label{eie} +\end{figure*} + +In general, the EIE is an approach to optimize the per-activation formula of DNNs (Equ.~\ref{mac}): + +\begin{align} + & b_i = f(\sum\limits_{j=0}^{n-1} W_{ij} a_j) \overset{!}{=} \text{ReLU}(\sum\limits_{j \in X_i \cap Y} S[I_{ij}] a_j) \\ + & X_i := \text{Set of columns, skipping $W_{ij} = 0$} \nonumber \\ + & Y := \text{Set of indices in $a$ for which $a_j \neq 0$} \nonumber \\ + & I_{ij} := \text{4-bit index} \nonumber \\ + & S := \text{Shared lookup table} \nonumber +\end{align} + +As activation function $f$ the \emph{Rectified Linear Unit} function (\emph{ReLU}) is fixed. +It is defined as $0$ for negative values while keeping positive values unchanged. +The indices $I_{ij}$ decompressed from CSC-format are used to lookup the actual weights from a shared weight lookup table $S$. +Values of $0$ as weights or inputs are skipped for multiplication and accumulation, +as they do not need to be computed. + +Fig.~\ref{eie} shows a block diagram of the hardware architecture of the EIE. +The left side (a) is the \emph{leading non-zero detection unit} (\emph{LNZD}), which detects $0$-values in the input +vector and skips them, while only forwarding non-zero values. The right side (b) is the actual \emph{processing element} (\emph{PE}), which +receives the non-zero input values. + +For parallelism, there are generally multiple PEs working independently. $4$ PEs are grouped together and share one +LNZD-unit. + +\subsection{Leading non-zero detection unit} + +Input activations are distributed to each processing element (PE) in a hierarchical manner. +To utilize the sparsity of the input vector, a leading non-zero detection logic is employed to identify the first non-zero result. +A group of four PEs perform local leading non-zero detection on their respective inputs and the results are sent +to a LNZD node. + +\subsection{Activation queue} + +The non-zero elements of the input activation vector $a_j$, along with their corresponding indexes $j$ are broadcasted +to an activation queue in each PE. However, if any PE has a full queue, the broadcast is disabled. Each PE processes +the activation at the front of its queue at any given time. The activation queue enables each PE to accumulate +some values and indices to balance out any load imbalances that may occur due to variations in the number of non-zeros +in a particular column $j$ among the PEs. + +\subsection{Pointer Read Unit} + +To efficiently access the $v$ and $z$ arrays for a given column $j$ using single-ported SRAM arrays, +the index $j$ at the head of the activation queue is used to look up the start and end pointers ($p_j$ and $p_{j+1}$) +for that column. These are seperately stored along with $v$ and $z$ to mark the iteration boundaries. +To ensure that both pointers can be read in a single cycle, the 16-bit pointers are stored in +two separate SRAM banks and the least significant bit (LSB) of the address is used to select the appropriate bank. +As a result, $p_j$ and $p_{j+1}$ will always be located in different banks. + +\subsection{Sparse Matrix Read Unit} + +The Sparse Matrix Read Unit reads out the actual non-zero weight values from the SRAM, +for the part of the column $W_j$ associated with the respective PE. It uses the pointers +$p_j$ and $p_{j+1}$ fetched before as reading boundaries. + +As the SRAM is of 64-bit word length and an entry in $v$ and $z$ is encoded in 4-bit each, +$8$ complete values can be read out in one cycle. This $(v, z)$ pair is then forwarded +to the arithmetic unit. + +\subsection{Arithmetic Unit} + +The arithmetic unit receives the $(v,z)$ value pair. It calculates the absolute index +$x$ in the output vector $b$ from the relative index $z$ and calculates the multiplication +$b_x = b_x + v \cdot a_j$ for which the output $b_x$ is the accumulation register. + +The values are stored in an encoded and quantisized form, so for calculation they are expanded +to 16-bit floating point representation first. + +\subsection{Activation Read/Write Unit} + +The Activation Read/Write Unit handles the values of the output vector. The target values are buffered +in two register files. For feed-forward layers in NNs the output values of one layer +are the input files of the next layer, so they exchange their roles after each layer to +save memory accesses to the SRAM. + +Access to SRAM is only occurring if the buffer is full. Then the entire buffer batch is written +to the SRAM. + +\subsection{Weight value distribution} + +An important factor is how to distribute the individual values of the weight matrix +to the individual PEs. + +\begin{figure}[htbp] + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/eie_matrix}} + \centerline{\includegraphics[width=\linewidth, keepaspectratio]{resources/eie_layout}} + \caption{Weight matrix segmentation and memory layout \cite{10.1109/ISCA.2016.30}} + \label{eie_segmentation} +\end{figure} + +Fig.~\ref{eie_segmentation} illustrates how the values are distributed to 4 PEs, and the corresponding +memory layout for PE0. Each row belongs to one PE and are distributed sequentially, +repeating after 4 rows. Each PE calculates one element $b_x$ of the output vector $b$ with one entry +of the input vector $a_j$ at a time. The individual PEs perform the MACs for each assigned input entry, +skipping $0$-valued entries using the non-zero detection units. + +On the bottom of Fig.~\ref{eie_segmentation} the memory layout for PE0 is depicted. Each PE stores +the non-zero values assigned to it, together with the relative indices $z$ encoding the number +of zeros before each element in $v$, but only for the respective PE. +To know the iteration boundaries, the column pointers are stored seperately. In the example above, +the first column has pointer $0$ (because it is the first entry in total). The second entry has pointer $3$, +because this PE has $3$ non-zero values assigned to it in the first column. + +\bibliographystyle{IEEEtran} +\bibliography{Paper/references} + +\end{document} \ No newline at end of file diff --git a/Paper/references.bib b/Paper/references.bib new file mode 100644 index 0000000..b5ea474 --- /dev/null +++ b/Paper/references.bib @@ -0,0 +1,238 @@ +@article{choudhary2020comprehensive, + title={A comprehensive survey on model compression and acceleration}, + author={Choudhary, Tejalal and Mishra, Vipul and Goswami, Anurag and Sarangapani, Jagannathan}, + journal={Artificial Intelligence Review}, + volume={53}, + number={7}, + pages={5113--5155}, + year={2020}, + publisher={Springer} +} + +@article{10.1145/3065386, +author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.}, +title = {ImageNet Classification with Deep Convolutional Neural Networks}, +year = {2017}, +issue_date = {June 2017}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {60}, +number = {6}, +issn = {0001-0782}, +url = {https://doi.org/10.1145/3065386}, +doi = {10.1145/3065386}, +abstract = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called "dropout" that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved by the second-best entry.}, +journal = {Commun. ACM}, +month = {may}, +pages = {84–90}, +numpages = {7} +} + +@inproceedings{NIPS1988_1c9ac015, + author = {Hanson, Stephen and Pratt, Lorien}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {D. Touretzky}, + pages = {}, + publisher = {Morgan-Kaufmann}, + title = {Comparing Biases for Minimal Network Construction with Back-Propagation}, + url = {https://proceedings.neurips.cc/paper/1988/file/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf}, + volume = {1}, + year = {1988} +} + +@ARTICLE{9082126, + author={Xiang, Yachen and Huang, Peng and Han, Runze and Li, Chu and Wang, Kunliang and Liu, Xiaoyan and Kang, Jinfeng}, + journal={IEEE Transactions on Electron Devices}, + title={Efficient and Robust Spike-Driven Deep Convolutional Neural Networks Based on NOR Flash Computing Array}, + year={2020}, + volume={67}, + number={6}, + pages={2329-2335}, + doi={10.1109/TED.2020.2987439}} + +@InProceedings{Cheng_2015_ICCV, +author = {Cheng, Yu and Yu, Felix X. and Feris, Rogerio S. and Kumar, Sanjiv and Choudhary, Alok and Chang, Shi-Fu}, +title = {An Exploration of Parameter Redundancy in Deep Networks With Circulant Projections}, +booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)}, +month = {December}, +year = {2015} +} + +@ARTICLE{726791, + author={Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.}, + journal={Proceedings of the IEEE}, + title={Gradient-based learning applied to document recognition}, + year={1998}, + volume={86}, + number={11}, + pages={2278-2324}, + doi={10.1109/5.726791} +} + +@article{10.1145/3007787.3001177, +author = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne}, +title = {Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks}, +year = {2016}, +issue_date = {June 2016}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {44}, +number = {3}, +issn = {0163-5964}, +url = {https://doi.org/10.1145/3007787.3001177}, +doi = {10.1145/3007787.3001177}, +abstract = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy.In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4\texttimes{} to 2.5\texttimes{}) and fully-connected layers (at least 1.3\texttimes{} for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.}, +journal = {SIGARCH Comput. Archit. News}, +month = {jun}, +pages = {367–379}, +numpages = {13} +} + +@inproceedings{carvalho2002gap, + title={The gap between processor and memory speeds}, + author={Carvalho, Carlos}, + booktitle={Proc. of IEEE International Conference on Control and Automation}, + year={2002} +} + +@article{DBLP:journals/corr/SzeCESZ16, + author = {Vivienne Sze and + Yu{-}Hsin Chen and + Joel S. Emer and + Amr Suleiman and + Zhengdong Zhang}, + title = {Hardware for Machine Learning: Challenges and Opportunities}, + journal = {CoRR}, + volume = {abs/1612.07625}, + year = {2016}, + url = {http://arxiv.org/abs/1612.07625}, + eprinttype = {arXiv}, + eprint = {1612.07625}, + timestamp = {Wed, 11 Dec 2019 16:23:12 +0100}, + biburl = {https://dblp.org/rec/journals/corr/SzeCESZ16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/SuleimanZS16, + author = {Amr Suleiman and + Zhengdong Zhang and + Vivienne Sze}, + title = {A 58.6mW Real-Time Programmable Object Detector with Multi-Scale Multi-Object + Support Using Deformable Parts Model on 1920x1080 Video at 30fps}, + journal = {CoRR}, + volume = {abs/1607.08635}, + year = {2016}, + url = {http://arxiv.org/abs/1607.08635}, + eprinttype = {arXiv}, + eprint = {1607.08635}, + timestamp = {Wed, 11 Dec 2019 16:23:12 +0100}, + biburl = {https://dblp.org/rec/journals/corr/SuleimanZS16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{10.5555/3045118.3045361, +author = {Chen, Wenlin and Wilson, James T. and Tyree, Stephen and Weinberger, Kilian Q. and Chen, Yixin}, +title = {Compressing Neural Networks with the Hashing Trick}, +year = {2015}, +publisher = {JMLR.org}, +abstract = {As deep nets are increasingly used in applications suited for mobile devices, a fundamental dilemma becomes apparent: the trend in deep learning is to grow models to absorb ever-increasing data set sizes; however mobile devices are designed with very little memory and cannot store such large models. We present a novel network architecture, HashedNets, that exploits inherent redundancy in neural networks to achieve drastic reductions in model sizes. HashedNets uses a low-cost hash function to randomly group connection weights into hash buckets, and all connections within the same hash bucket share a single parameter value. These parameters are tuned to adjust to the HashedNets weight sharing architecture with standard backprop during training. Our hashing procedure introduces no additional memory overhead, and we demonstrate on several benchmark data sets that HashedNets shrink the storage requirements of neural networks substantially while mostly preserving generalization performance.}, +booktitle = {Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37}, +pages = {2285–2294}, +numpages = {10}, +location = {Lille, France}, +series = {ICML'15} +} + +@inproceedings{10.1109/ISCA.2016.30, +author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.}, +title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network}, +year = {2016}, +isbn = {9781467389471}, +publisher = {IEEE Press}, +url = {https://doi.org/10.1109/ISCA.2016.30}, +doi = {10.1109/ISCA.2016.30}, +abstract = {State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power.Previously proposed 'Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120\texttimes{} energy saving; Exploiting sparsity saves 10\texttimes{}; Weight sharing gives 8\texttimes{}; Skipping zero activations from ReLU saves another 3\texttimes{}. Evaluated on nine DNN benchmarks, EIE is 189\texttimes{} and 13\texttimes{} faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at 1.88\texttimes{}104 frames/sec with a power dissipation of only 600mW. It is 24,000\texttimes{} and 3,400\texttimes{} more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9\texttimes{}, 19\texttimes{} and 3\texttimes{} better throughput, energy efficiency and area efficiency.}, +booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture}, +pages = {243–254}, +numpages = {12}, +keywords = {hardware acceleration, ASIC, algorithm-hardware co-design, model compression, deep learning}, +location = {Seoul, Republic of Korea}, +series = {ISCA '16} +} + +@article{Han2015DeepCC, + title={Deep Compression: Compressing Deep Neural Network with Pruning, Trained Quantization and Huffman Coding}, + author={Song Han and Huizi Mao and William J. Dally}, + journal={arXiv: Computer Vision and Pattern Recognition}, + year={2015} +} + +@ARTICLE{9253578, + author={Dai, Xiaoliang and Yin, Hongxu and Jha, Niraj K.}, + journal={IEEE Transactions on Emerging Topics in Computing}, + title={Incremental Learning Using a Grow-and-Prune Paradigm With Efficient Neural Networks}, + year={2022}, + volume={10}, + number={2}, + pages={752-762}, + doi={10.1109/TETC.2020.3037052} +} + +@inproceedings{NIPS2015_ae0eb3ee, + author = {Han, Song and Pool, Jeff and Tran, John and Dally, William}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett}, + pages = {}, + publisher = {Curran Associates, Inc.}, + title = {Learning both Weights and Connections for Efficient Neural Network}, + url = {https://proceedings.neurips.cc/paper/2015/file/ae0eb3eed39d2bcef4622b2499a05fe6-Paper.pdf}, + volume = {28}, + year = {2015} +} + +@article{das2015neuraltalk, + title={NeuralTalk on Embedded System and GPU-accelerated RNN}, + author={Das, Subhasis and Han, Song}, + journal={Subhasis Das and Song Han}, + year={2015} +} + +@inproceedings{NIPS1989_6c9882bb, + author = {LeCun, Yann and Denker, John and Solla, Sara}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {D. Touretzky}, + pages = {}, + publisher = {Morgan-Kaufmann}, + title = {Optimal Brain Damage}, + url = {https://proceedings.neurips.cc/paper/1989/file/6c9882bbac1c7093bd25041881277658-Paper.pdf}, + volume = {2}, + year = {1989} +} + +@ARTICLE{8704878, +author = {X. Dai and H. Yin and N. K. Jha}, +journal = {IEEE Transactions on Computers}, +title = {NeST: A Neural Network Synthesis Tool Based on a Grow-and-Prune Paradigm}, +year = {2019}, +volume = {68}, +number = {10}, +issn = {1557-9956}, +pages = {1487-1497}, +keywords = {neurons;computer architecture;training;biological neural networks;tools;manganese;correlation}, +doi = {10.1109/TC.2019.2914438}, +publisher = {IEEE Computer Society}, +address = {Los Alamitos, CA, USA}, +month = {oct} +} + +@inproceedings{NIPS1992_303ed4c6, + author = {Hassibi, Babak and Stork, David}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {S. Hanson and J. Cowan and C. Giles}, + pages = {}, + publisher = {Morgan-Kaufmann}, + title = {Second order derivatives for network pruning: Optimal Brain Surgeon}, + url = {https://proceedings.neurips.cc/paper/1992/file/303ed4c69846ab36c2904d3ba8573050-Paper.pdf}, + volume = {5}, + year = {1992} +} \ No newline at end of file diff --git a/Paper/template.tex b/Paper/template.tex index 8d636c7..189975c 100644 --- a/Paper/template.tex +++ b/Paper/template.tex @@ -229,7 +229,7 @@ copy& More table copy$^{\mathrm{a}}$& & \\ \end{table} \begin{figure}[htbp] -\centerline{\includegraphics{Resources/fig1.png}} +\centerline{\includegraphics{resources/fig1.png}} \caption{Example of a figure caption.} \label{fig} \end{figure} -- cgit v1.2.1