1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
|
% Survey article on model compression and acceleration
% (Artificial Intelligence Review, vol. 53, no. 7, 2020).
@article{choudhary2020comprehensive,
  author    = {Choudhary, Tejalal and Mishra, Vipul and Goswami, Anurag and Sarangapani, Jagannathan},
  title     = {A comprehensive survey on model compression and acceleration},
  journal   = {Artificial Intelligence Review},
  publisher = {Springer},
  year      = {2020},
  volume    = {53},
  number    = {7},
  pages     = {5113--5155}
}
% Journal article (Communications of the ACM 60(6), 2017) on ImageNet
% classification with deep convolutional neural networks.
% Percent signs in the abstract are escaped so that styles which emit the
% abstract do not turn the rest of the line into a TeX comment.
@article{10.1145/3065386,
  author     = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  title      = {{ImageNet} Classification with Deep Convolutional Neural Networks},
  journal    = {Communications of the ACM},
  year       = {2017},
  month      = may,
  issue_date = {June 2017},
  volume     = {60},
  number     = {6},
  pages      = {84--90},
  numpages   = {7},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0001-0782},
  doi        = {10.1145/3065386},
  url        = {https://doi.org/10.1145/3065386},
  abstract   = {We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5\% and 17.0\%, respectively, which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU implementation of the convolution operation. To reduce overfitting in the fully connected layers we employed a recently developed regularization method called ``dropout'' that proved to be very effective. We also entered a variant of this model in the ILSVRC-2012 competition and achieved a winning top-5 test error rate of 15.3\%, compared to 26.2\% achieved by the second-best entry.}
}
% Conference paper in Advances in Neural Information Processing Systems, vol. 1 (1988).
% NOTE(review): the export carried an empty pages field; it is omitted here
% rather than left blank -- add the real page range once verified.
@inproceedings{NIPS1988_1c9ac015,
  author    = {Hanson, Stephen and Pratt, Lorien},
  title     = {Comparing Biases for Minimal Network Construction with Back-Propagation},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Touretzky, D.},
  volume    = {1},
  publisher = {Morgan-Kaufmann},
  year      = {1988},
  url       = {https://proceedings.neurips.cc/paper/1988/file/1c9ac0159c94d8d0cbedc973445af2da-Paper.pdf}
}
% Journal article (IEEE Transactions on Electron Devices 67(6), 2020) on
% spike-driven deep CNNs built on a NOR flash computing array.
% The acronym NOR is brace-protected so sentence-casing styles keep it upper case.
@article{9082126,
  author  = {Xiang, Yachen and Huang, Peng and Han, Runze and Li, Chu and Wang, Kunliang and Liu, Xiaoyan and Kang, Jinfeng},
  title   = {Efficient and Robust Spike-Driven Deep Convolutional Neural Networks Based on {NOR} Flash Computing Array},
  journal = {IEEE Transactions on Electron Devices},
  year    = {2020},
  volume  = {67},
  number  = {6},
  pages   = {2329--2335},
  doi     = {10.1109/TED.2020.2987439}
}
% Conference paper (ICCV, December 2015) on parameter redundancy in deep
% networks using circulant projections.
% The month uses the predefined BibTeX macro so styles can localise/abbreviate it.
@inproceedings{Cheng_2015_ICCV,
  author    = {Cheng, Yu and Yu, Felix X. and Feris, Rogerio S. and Kumar, Sanjiv and Choudhary, Alok and Chang, Shi-Fu},
  title     = {An Exploration of Parameter Redundancy in Deep Networks With Circulant Projections},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
  month     = dec,
  year      = {2015}
}
% Journal article (Proceedings of the IEEE 86(11), 1998) on gradient-based
% learning applied to document recognition.
% Author names are stored in full, with the surname spelled "LeCun".
@article{726791,
  author  = {LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
  title   = {Gradient-based learning applied to document recognition},
  journal = {Proceedings of the IEEE},
  year    = {1998},
  volume  = {86},
  number  = {11},
  pages   = {2278--2324},
  doi     = {10.1109/5.726791}
}
% Journal article (ACM SIGARCH Computer Architecture News 44(3), 2016)
% presenting the Eyeriss spatial architecture and its row-stationary dataflow.
% The name Eyeriss is brace-protected against style-driven lowercasing.
@article{10.1145/3007787.3001177,
  author     = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
  title      = {{Eyeriss}: A Spatial Architecture for Energy-Efficient Dataflow for Convolutional Neural Networks},
  journal    = {ACM SIGARCH Computer Architecture News},
  year       = {2016},
  month      = jun,
  issue_date = {June 2016},
  volume     = {44},
  number     = {3},
  pages      = {367--379},
  numpages   = {13},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0163-5964},
  doi        = {10.1145/3007787.3001177},
  url        = {https://doi.org/10.1145/3007787.3001177},
  abstract   = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy. In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4\texttimes{} to 2.5\texttimes{}) and fully-connected layers (at least 1.3\texttimes{} for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.}
}
% Conference paper (2002) on the gap between processor and memory speeds.
@inproceedings{carvalho2002gap,
  author    = {Carvalho, Carlos},
  title     = {The gap between processor and memory speeds},
  booktitle = {Proc. of IEEE International Conference on Control and Automation},
  year      = {2002}
}
% arXiv preprint 1612.07625 (CoRR, 2016) on hardware for machine learning;
% record exported from dblp.
@article{DBLP:journals/corr/SzeCESZ16,
  author     = {Vivienne Sze and
                Yu{-}Hsin Chen and
                Joel S. Emer and
                Amr Suleiman and
                Zhengdong Zhang},
  title      = {Hardware for Machine Learning: Challenges and Opportunities},
  journal    = {CoRR},
  volume     = {abs/1612.07625},
  year       = {2016},
  eprinttype = {arXiv},
  eprint     = {1612.07625},
  url        = {http://arxiv.org/abs/1612.07625},
  timestamp  = {Wed, 11 Dec 2019 16:23:12 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/SzeCESZ16.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 1607.08635 (CoRR, 2016) describing a real-time programmable
% object detector; record exported from dblp.
% Case-sensitive tokens in the title (power figure, resolution, frame rate)
% are brace-protected so sentence-casing styles cannot rewrite them.
@article{DBLP:journals/corr/SuleimanZS16,
  author     = {Amr Suleiman and
                Zhengdong Zhang and
                Vivienne Sze},
  title      = {A {58.6mW} Real-Time Programmable Object Detector with Multi-Scale Multi-Object
                Support Using Deformable Parts Model on {1920x1080} Video at {30fps}},
  journal    = {CoRR},
  volume     = {abs/1607.08635},
  year       = {2016},
  url        = {http://arxiv.org/abs/1607.08635},
  eprinttype = {arXiv},
  eprint     = {1607.08635},
  timestamp  = {Wed, 11 Dec 2019 16:23:12 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/SuleimanZS16.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% Conference paper (ICML'15, Lille, France) introducing HashedNets, which
% compress neural networks using the hashing trick.
% The exported booktitle repeated "International Conference on"; the
% duplicate has been removed.
@inproceedings{10.5555/3045118.3045361,
  author    = {Chen, Wenlin and Wilson, James T. and Tyree, Stephen and Weinberger, Kilian Q. and Chen, Yixin},
  title     = {Compressing Neural Networks with the Hashing Trick},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning - Volume 37},
  series    = {ICML'15},
  location  = {Lille, France},
  year      = {2015},
  pages     = {2285--2294},
  numpages  = {10},
  publisher = {JMLR.org},
  abstract  = {As deep nets are increasingly used in applications suited for mobile devices, a fundamental dilemma becomes apparent: the trend in deep learning is to grow models to absorb ever-increasing data set sizes; however mobile devices are designed with very little memory and cannot store such large models. We present a novel network architecture, HashedNets, that exploits inherent redundancy in neural networks to achieve drastic reductions in model sizes. HashedNets uses a low-cost hash function to randomly group connection weights into hash buckets, and all connections within the same hash bucket share a single parameter value. These parameters are tuned to adjust to the HashedNets weight sharing architecture with standard backprop during training. Our hashing procedure introduces no additional memory overhead, and we demonstrate on several benchmark data sets that HashedNets shrink the storage requirements of neural networks substantially while mostly preserving generalization performance.}
}
% Conference paper (ISCA '16, Seoul) presenting EIE, an inference engine
% operating on compressed deep neural networks.
% The acronym EIE is brace-protected; the abstract's lost exponent in the
% frames/sec figure is restored as a power of ten.
@inproceedings{10.1109/ISCA.2016.30,
  author    = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
  title     = {{EIE}: Efficient Inference Engine on Compressed Deep Neural Network},
  booktitle = {Proceedings of the 43rd International Symposium on Computer Architecture},
  series    = {ISCA '16},
  location  = {Seoul, Republic of Korea},
  year      = {2016},
  pages     = {243--254},
  numpages  = {12},
  publisher = {IEEE Press},
  isbn      = {9781467389471},
  doi       = {10.1109/ISCA.2016.30},
  url       = {https://doi.org/10.1109/ISCA.2016.30},
  keywords  = {hardware acceleration, ASIC, algorithm-hardware co-design, model compression, deep learning},
  abstract  = {State-of-the-art deep neural networks (DNNs) have hundreds of millions of connections and are both computationally and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources and power budgets. While custom hardware helps the computation, fetching weights from DRAM is two orders of magnitude more expensive than ALU operations, and dominates the required power. Previously proposed 'Deep Compression' makes it possible to fit large DNNs (AlexNet and VGGNet) fully in on-chip SRAM. This compression is achieved by pruning the redundant connections and having multiple connections share the same weight. We propose an energy efficient inference engine (EIE) that performs inference on this compressed network model and accelerates the resulting sparse matrix-vector multiplication with weight sharing. Going from DRAM to SRAM gives EIE 120\texttimes{} energy saving; Exploiting sparsity saves 10\texttimes{}; Weight sharing gives 8\texttimes{}; Skipping zero activations from ReLU saves another 3\texttimes{}. Evaluated on nine DNN benchmarks, EIE is 189\texttimes{} and 13\texttimes{} faster when compared to CPU and GPU implementations of the same DNN without compression. EIE has a processing power of 102 GOPS working directly on a compressed network, corresponding to 3 TOPS on an uncompressed network, and processes FC layers of AlexNet at 1.88\texttimes{}$10^4$ frames/sec with a power dissipation of only 600mW. It is 24,000\texttimes{} and 3,400\texttimes{} more energy efficient than a CPU and GPU respectively. Compared with DaDianNao, EIE has 2.9\texttimes{}, 19\texttimes{} and 3\texttimes{} better throughput, energy efficiency and area efficiency.}
}
% arXiv preprint (2015) on Deep Compression: pruning, trained quantization
% and Huffman coding.
% The arXiv identifier is carried in eprint/eprinttype, with journal = CoRR
% and volume = abs/<id>, rather than an arXiv subject category in `journal`.
% NOTE(review): arXiv id 1510.00149 supplied from memory -- confirm.
@article{Han2015DeepCC,
  author      = {Han, Song and Mao, Huizi and Dally, William J.},
  title       = {Deep Compression: Compressing Deep Neural Network with Pruning, Trained Quantization and {Huffman} Coding},
  journal     = {CoRR},
  volume      = {abs/1510.00149},
  year        = {2015},
  eprinttype  = {arXiv},
  eprint      = {1510.00149},
  eprintclass = {cs.CV},
  url         = {http://arxiv.org/abs/1510.00149}
}
% Journal article (IEEE Transactions on Emerging Topics in Computing 10(2), 2022)
% on incremental learning with a grow-and-prune paradigm.
@article{9253578,
  author  = {Dai, Xiaoliang and Yin, Hongxu and Jha, Niraj K.},
  title   = {Incremental Learning Using a Grow-and-Prune Paradigm With Efficient Neural Networks},
  journal = {IEEE Transactions on Emerging Topics in Computing},
  year    = {2022},
  volume  = {10},
  number  = {2},
  pages   = {752--762},
  doi     = {10.1109/TETC.2020.3037052}
}
% Conference paper in Advances in Neural Information Processing Systems, vol. 28 (2015).
% NOTE(review): the export carried an empty pages field; it is omitted here
% rather than left blank -- add the real page range once verified.
@inproceedings{NIPS2015_ae0eb3ee,
  author    = {Han, Song and Pool, Jeff and Tran, John and Dally, William},
  title     = {Learning both Weights and Connections for Efficient Neural Network},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
  volume    = {28},
  publisher = {Curran Associates, Inc.},
  year      = {2015},
  url       = {https://proceedings.neurips.cc/paper/2015/file/ae0eb3eed39d2bcef4622b2499a05fe6-Paper.pdf}
}
% Work by Das and Han (2015) on NeuralTalk on embedded systems.
% The exported record had the author names in the journal field, which is
% not a venue; the entry is therefore typed as misc with no journal.
% NOTE(review): publication venue unknown -- confirm whether this is a
% report, preprint or web page and add howpublished/url accordingly.
@misc{das2015neuraltalk,
  author = {Das, Subhasis and Han, Song},
  title  = {{NeuralTalk} on Embedded System and {GPU}-accelerated {RNN}},
  year   = {2015}
}
% Conference paper in Advances in Neural Information Processing Systems, vol. 2 (1989).
% NOTE(review): the export carried an empty pages field; it is omitted here
% rather than left blank -- add the real page range once verified.
@inproceedings{NIPS1989_6c9882bb,
  author    = {LeCun, Yann and Denker, John and Solla, Sara},
  title     = {Optimal Brain Damage},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Touretzky, D.},
  volume    = {2},
  publisher = {Morgan-Kaufmann},
  year      = {1989},
  url       = {https://proceedings.neurips.cc/paper/1989/file/6c9882bbac1c7093bd25041881277658-Paper.pdf}
}
% Journal article (IEEE Transactions on Computers 68(10), 2019) presenting
% NeST, a neural network synthesis tool based on a grow-and-prune paradigm.
% Author names are stored in full; the tool name NeST is brace-protected.
@article{8704878,
  author    = {Dai, Xiaoliang and Yin, Hongxu and Jha, Niraj K.},
  title     = {{NeST}: A Neural Network Synthesis Tool Based on a Grow-and-Prune Paradigm},
  journal   = {IEEE Transactions on Computers},
  year      = {2019},
  month     = oct,
  volume    = {68},
  number    = {10},
  pages     = {1487--1497},
  issn      = {1557-9956},
  doi       = {10.1109/TC.2019.2914438},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  keywords  = {neurons;computer architecture;training;biological neural networks;tools;manganese;correlation}
}
% Conference paper in Advances in Neural Information Processing Systems, vol. 5 (1992).
% The method name in the title is brace-protected against lowercasing.
% NOTE(review): the export carried an empty pages field; it is omitted here
% rather than left blank -- add the real page range once verified.
@inproceedings{NIPS1992_303ed4c6,
  author    = {Hassibi, Babak and Stork, David},
  title     = {Second order derivatives for network pruning: {Optimal Brain Surgeon}},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {S. Hanson and J. Cowan and C. Giles},
  volume    = {5},
  publisher = {Morgan-Kaufmann},
  year      = {1992},
  url       = {https://proceedings.neurips.cc/paper/1992/file/303ed4c69846ab36c2904d3ba8573050-Paper.pdf}
}
|