@misc{ben-cho-pur-bon-18-ab-fpgabits,
  author        = {Benara, Vinamra and Choudhury, Ziaul and Purini, Suresh and Bondhugula, Uday},
  title         = {Synthesizing Power and Area Efficient Image Processing Pipelines on {FPGAs} using Customized Bit-widths},
  howpublished  = {Online document at the arXiv-CS repository, ID 1803.02660, version 3},
  year          = 2018,
  month         = dec,
  eprint        = {1803.02660},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1803.02660v3},
  doi           = {10.48550/arXiv.1803.02660},
  comment       = {Describes AA use for bit width choice in image processing FPGAs. Criticizes it and then proposes an alternative using Satisfiability Modulo Theory (SMT).},
  abstract      = {High-level synthesis (HLS) has received significant attention in recent years for improving programmability of FPGAs. One could raise the level of abstraction further by using domain-specific languages (DSLs), improving productivity and performance simultaneously. PolyMage is a domain-specific language and compiler for image processing pipelines. Its PolyMage-HLS backend translates an input expressed as a DAG of image processing stages through the DSL into an equivalent circuit that can be synthesized on FPGAs, while leveraging an HLS suite. The power and area savings while performing arithmetic operations on fixed-point data type are well known to be significant over using floating-point data type. PolyMage-HLS stores data at each stage of a pipeline using a fixed-point data type ($\alpha$, $\beta$) where $\alpha$ and $\beta$ denote the number of integral and fractional bits. The integral bitwidth ($\alpha$) requirement at a pipeline stage can be inferred from its range. In this paper, we first propose an interval-arithmetic based range analysis algorithm to estimate the number of bits required to store the integral part of the data at each stage of an image processing pipeline. The analysis algorithm uses the homogeneity of pixel signals at each stage to cluster them and perform a combined range analysis.
    Secondly, we propose a software architecture for easily deploying any kind of interval/affine arithmetic based range analyses in the DSL compiler. Thirdly, we show that interval/affine arithmetic based techniques fail to take into account correlated computations across stages and hence could lead to poor range estimates. These errors in range estimates accumulate across stages, especially for iterative programs, such as Horn-Schunck Optical Flow, resulting in estimates nearly unusable in practice. Then, we propose a new range analysis technique using Satisfiability Modulo Theory (SMT) solvers, and show that the range estimates obtained through it are very close to the lower bounds obtained through profile-driven analysis. Finally, for estimating fractional bitwidth ($\beta$) requirement at each stage of the pipeline, we propose a simple and practical heuristic search algorithm, which makes very few profile passes, as opposed to techniques such as simulated annealing used in prior work. The analysis algorithm attempts to minimize the number of fractional bits required at each stage while preserving an application-specific quality metric. We evaluated our bitwidth analysis algorithms on four image processing benchmarks listed in the order of increasing complexity: Unsharp Mask, Down-Up Sampling, Harris Corner Detection and Horn-Schunck Optical Flow. The performance metrics considered are quality, power and area. For example, on Optical Flow, the interval analysis based approach showed an 1.4$\times$ and 1.14$\times$ improvement on area and power metrics over floating-point representation respectively; whereas the SMT solver based approach showed 2.49$\times$ and 1.58$\times$ improvement on area and power metrics when compared to interval analysis.}
}