From 8706f04e2d90c4425f0380a701358dda61f40bb8 Mon Sep 17 00:00:00 2001
From: Leonard Kugis
Date: Mon, 23 Jan 2023 23:50:19 +0100
Subject: Finished evaluation, finished future work

---
 Paper/references.bib | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'Paper/references.bib')

Reviewer note (not part of the commit message):
This hunk appends five references between the ICML'15 entry and the
10.1109/ISCA.2016.30 entry. Citation keys are the publisher-export keys:
  10.1145/2847263.2847265  Qiu et al., FPGA '16 (2016)
  7011421                  Chen et al., DaDianNao (2014)
  MUTLU201928              Mutlu et al., Microprocessors and Microsystems (2019)
  1274006                  Lu, Computer (2004)
  10.1126/science.1254642  Merolla et al., Science (2014)
Conventions used in the added entries: page ranges are written with "--",
"doi" holds the bare identifier (no resolver URL), "%" is escaped inside
abstracts, and each abstract stays on a single line.

diff --git a/Paper/references.bib b/Paper/references.bib
index b5ea474..e1596b3 100644
--- a/Paper/references.bib
+++ b/Paper/references.bib
@@ -143,6 +143,72 @@ location = {Lille, France},
 series = {ICML'15}
 }
 
+@inproceedings{10.1145/2847263.2847265,
+author = {Qiu, Jiantao and Wang, Jie and Yao, Song and Guo, Kaiyuan and Li, Boxun and Zhou, Erjin and Yu, Jincheng and Tang, Tianqi and Xu, Ningyi and Song, Sen and Wang, Yu and Yang, Huazhong},
+title = {Going Deeper with Embedded {FPGA} Platform for Convolutional Neural Network},
+year = {2016},
+isbn = {9781450338561},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/2847263.2847265},
+doi = {10.1145/2847263.2847265},
+abstract = {In recent years, convolutional neural network (CNN) based methods have achieved great success in a large number of applications and have been among the most powerful and widely used techniques in computer vision. However, CNN-based methods are computational-intensive and resource-consuming, and thus are hard to be integrated into embedded systems such as smart phones, smart glasses, and robots. FPGA is one of the most promising platforms for accelerating CNN, but the limited bandwidth and on-chip memory size limit the performance of FPGA accelerator for CNN. In this paper, we go deeper with the embedded FPGA platform on accelerating CNNs and propose a CNN accelerator design on embedded FPGA for Image-Net large-scale image classification. We first present an in-depth analysis of state-of-the-art CNN models and show that Convolutional layers are computational-centric and Fully-Connected layers are memory-centric. Then the dynamic-precision data quantization method and a convolver design that is efficient for all layer types in CNN are proposed to improve the bandwidth and resource utilization. Results show that only 0.4\% accuracy loss is introduced by our data quantization flow for the very deep VGG16 model when 8/4-bit quantization is used. A data arrangement method is proposed to further ensure a high utilization of the external memory bandwidth. Finally, a state-of-the-art CNN, VGG16-SVD, is implemented on an embedded FPGA platform as a case study. VGG16-SVD is the largest and most accurate network that has been implemented on FPGA end-to-end so far. The system on Xilinx Zynq ZC706 board achieves a frame rate at 4.45 fps with the top-5 accuracy of 86.66\% using 16-bit quantization. The average performance of convolutional layers and the full CNN is 187.8 GOP/s and 137.0 GOP/s under 150MHz working frequency, which outperform previous approaches significantly.},
+booktitle = {Proceedings of the 2016 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
+pages = {26--35},
+numpages = {10},
+keywords = {embedded fpga, dynamic-precision data quantization, bandwidth utilization, convolutional neural network (cnn)},
+location = {Monterey, California, USA},
+series = {FPGA '16}
+}
+
+@inproceedings{7011421,
+author = {Chen, Yunji and Luo, Tao and Liu, Shaoli and Zhang, Shijin and He, Liqiang and Wang, Jia and Li, Ling and Chen, Tianshi and Xu, Zhiwei and Sun, Ninghui and Temam, Olivier},
+booktitle = {2014 47th Annual IEEE/ACM International Symposium on Microarchitecture},
+title = {{DaDianNao}: A Machine-Learning Supercomputer},
+year = {2014},
+pages = {609--622},
+doi = {10.1109/MICRO.2014.58}
+}
+
+@article{MUTLU201928,
+title = {Processing data where it makes sense: Enabling in-memory computation},
+journal = {Microprocessors and Microsystems},
+volume = {67},
+pages = {28--41},
+year = {2019},
+issn = {0141-9331},
+doi = {10.1016/j.micpro.2019.01.009},
+url = {https://www.sciencedirect.com/science/article/pii/S0141933118302291},
+author = {Mutlu, Onur and Ghose, Saugata and G{\'o}mez-Luna, Juan and Ausavarungnirun, Rachata},
+keywords = {Data movement, Main memory, Processing-in-memory, 3D-Stacked memory, Near-data processing},
+abstract = {Today's systems are overwhelmingly designed to move data to computation. This design choice goes directly against at least three key trends in systems that cause performance, scalability and energy bottlenecks: (1) data access from memory is already a key bottleneck as applications become more data-intensive and memory bandwidth and energy do not scale well, (2) energy consumption is a key constraint in especially mobile and server systems, (3) data movement is very expensive in terms of bandwidth, energy and latency, much more so than computation. These trends are especially severely-felt in the data-intensive server and energy-constrained mobile systems of today. At the same time, conventional memory technology is facing many scaling challenges in terms of reliability, energy, and performance. As a result, memory system architects are open to organizing memory in different ways and making it more intelligent, at the expense of higher cost. The emergence of 3D-stacked memory plus logic as well as the adoption of error correcting codes inside DRAM chips, and the necessity for designing new solutions to serious reliability and security issues, such as the RowHammer phenomenon, are an evidence of this trend. In this work, we discuss some recent research that aims to practically enable computation close to data. After motivating trends in applications as well as technology, we discuss at least two promising directions for processing-in-memory (PIM): (1) performing massively-parallel bulk operations in memory by exploiting the analog operational properties of DRAM, with low-cost changes, (2) exploiting the logic layer in 3D-stacked memory technology to accelerate important data-intensive applications. In both approaches, we describe and tackle relevant cross-layer research, design, and adoption challenges in devices, architecture, systems, and programming models. Our focus is on the development of in-memory processing designs that can be adopted in real computing platforms at low cost.}
+}
+
+@article{1274006,
+author = {Lu, Shih-Lien},
+journal = {Computer},
+title = {Speeding up processing with approximation circuits},
+year = {2004},
+volume = {37},
+number = {3},
+pages = {67--73},
+doi = {10.1109/MC.2004.1274006}
+}
+
+@article{10.1126/science.1254642,
+author = {Merolla, Paul A. and Arthur, John V. and Alvarez-Icaza, Rodrigo and Cassidy, Andrew S. and Sawada, Jun and Akopyan, Filipp and Jackson, Bryan L. and Imam, Nabil and Guo, Chen and Nakamura, Yutaka and Brezzo, Bernard and Vo, Ivan and Esser, Steven K. and Appuswamy, Rathinakumar and Taba, Brian and Amir, Arnon and Flickner, Myron D. and Risk, William P. and Manohar, Rajit and Modha, Dharmendra S.},
+title = {A million spiking-neuron integrated circuit with a scalable communication network and interface},
+journal = {Science},
+volume = {345},
+number = {6197},
+pages = {668--673},
+year = {2014},
+doi = {10.1126/science.1254642},
+url = {https://www.science.org/doi/abs/10.1126/science.1254642},
+eprint = {https://www.science.org/doi/pdf/10.1126/science.1254642},
+abstract = {Computers are nowhere near as versatile as our own brains. Merolla et al. applied our present knowledge of the structure and function of the brain to design a new computer chip that uses the same wiring rules and architecture. The flexible, scalable chip operated efficiently in real time, while using very little power. Science, this issue p. 668 A large-scale computer chip mimics many features of a real brain. Inspired by the brain's structure, we have developed an efficient, scalable, and flexible non--von Neumann architecture that leverages contemporary silicon technology. To demonstrate, we built a 5.4-billion-transistor chip with 4096 neurosynaptic cores interconnected via an intrachip network that integrates 1 million programmable spiking neurons and 256 million configurable synapses. Chips can be tiled in two dimensions via an interchip communication interface, seamlessly scaling the architecture to a cortexlike sheet of arbitrary size. The architecture is well suited to many applications that use complex neural networks in real time, for example, multiobject detection and classification. With 400-pixel-by-240-pixel video input at 30 frames per second, the chip consumes 63 milliwatts.}
+}
+
 @inproceedings{10.1109/ISCA.2016.30,
 author = {Han, Song and Liu, Xingyu and Mao, Huizi and Pu, Jing and Pedram, Ardavan and Horowitz, Mark A. and Dally, William J.},
 title = {EIE: Efficient Inference Engine on Compressed Deep Neural Network},
-- 
cgit v1.2.1