1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
@inproceedings{NIPS1989_6c9882bb,
author = {LeCun, Yann and Denker, John and Solla, Sara},
booktitle = {Advances in Neural Information Processing Systems},
editor = {D. Touretzky},
pages = {},
publisher = {Morgan-Kaufmann},
title = {Optimal Brain Damage},
url = {https://proceedings.neurips.cc/paper/1989/file/6c9882bbac1c7093bd25041881277658-Paper.pdf},
volume = {2},
year = {1989}
}
@INPROCEEDINGS{7993626,
author={Sze, Vivienne and Chen, Yu-Hsin and Emer, Joel and Suleiman, Amr and Zhang, Zhengdong},
booktitle={2017 IEEE Custom Integrated Circuits Conference (CICC)},
title={Hardware for machine learning: Challenges and opportunities},
year={2017},
volume={},
number={},
pages={1-8},
doi={10.1109/CICC.2017.7993626}
}
@article{10.1145/3007787.3001163,
author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network},
year = {2016},
issue_date = {June 2016},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {44},
number = {3},
issn = {0163-5964},
url = {https://doi.org/10.1145/3007787.3001163},
doi = {10.1145/3007787.3001163},
abstract = {State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power.Previously proposed 'Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120\texttimes{} energy saving; Exploiting sparsity saves 10\texttimes{}; Weight sharing gives 8\texttimes{}; Skipping zero activations from ReLU saves another 3\texttimes{}. Evaluated on nine DNN benchmarks, EIE is 189\texttimes{} and 13\texttimes{} faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at 1.88\texttimes{}104 frames/sec with a power dissipation of only 600mW. It is 24,000\texttimes{} and 3,400\texttimes{} more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9\texttimes{}, 19\texttimes{} and 3\texttimes{} better throughput, energy efficiency and area efficiency.},
journal = {SIGARCH Comput. Archit. News},
month = {jun},
pages = {243–254},
numpages = {12},
keywords = {ASIC, hardware acceleration, algorithm-hardware co-design, model compression, deep learning}
}
@inproceedings{10.1109/ISCA.2016.30,
author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network},
year = {2016},
isbn = {9781467389471},
publisher = {IEEE Press},
url = {https://doi.org/10.1109/ISCA.2016.30},
doi = {10.1109/ISCA.2016.30},
abstract = {State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power.Previously proposed 'Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120\texttimes{} energy saving; Exploiting sparsity saves 10\texttimes{}; Weight sharing gives 8\texttimes{}; Skipping zero activations from ReLU saves another 3\texttimes{}. Evaluated on nine DNN benchmarks, EIE is 189\texttimes{} and 13\texttimes{} faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at 1.88\texttimes{}104 frames/sec with a power dissipation of only 600mW. It is 24,000\texttimes{} and 3,400\texttimes{} more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9\texttimes{}, 19\texttimes{} and 3\texttimes{} better throughput, energy efficiency and area efficiency.},
booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
pages = {243–254},
numpages = {12},
keywords = {hardware acceleration, ASIC, deep learning, algorithm-hardware co-design, model compression},
location = {Seoul, Republic of Korea},
series = {ISCA '16}
}
@article{10.1145/3007787.3001177,
author = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
title = {Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks},
year = {2016},
issue_date = {June 2016},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {44},
number = {3},
issn = {0163-5964},
url = {https://doi.org/10.1145/3007787.3001177},
doi = {10.1145/3007787.3001177},
abstract = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy.In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4\texttimes{} to 2.5\texttimes{}) and fully-connected layers (at least 1.3\texttimes{} for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.},
journal = {SIGARCH Comput. Archit. News},
month = {jun},
pages = {367–379},
numpages = {13}
}
@inproceedings{10.1109/ISCA.2016.40,
author = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
title = {Eyeriss: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks},
year = {2016},
isbn = {9781467389471},
publisher = {IEEE Press},
url = {https://doi.org/10.1109/ISCA.2016.40},
doi = {10.1109/ISCA.2016.40},
abstract = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy.In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4\texttimes{} to 2.5\texttimes{}) and fully-connected layers (at least 1.3\texttimes{} for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.},
booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
pages = {367–379},
numpages = {13},
location = {Seoul, Republic of Korea},
series = {ISCA '16}
}
|