The histogram needs two frames to populate the histogram array and to produce correct results in the auto-exposure frame. Auto white balance, GTM, and the other tone-mapping functions each need one extra frame to populate their parameters and then apply those parameters to produce a correct image. In the specific example below, four iterations are needed because the AEC, AWB, and LTM modules are selected.
// Create the kernel object for the "ISPPipeline_accel" entry point of the loaded program.
OCL_CHECK(err, cl::Kernel kernel(program, "ISPPipeline_accel", &err));

// Number of back-to-back kernel runs. Every pass re-sends all inputs, launches the
// kernel once, and reads the outputs back.
int loop_count = 4;
for (int i = 0; i < loop_count; i++) {
    // ---- Host -> device transfers (all blocking: CL_TRUE, offset 0) ----
    // NOTE(review): `err = ...` is written out on every call, matching the existing
    // enqueueTask call. This assumes OCL_CHECK only inspects `err` after running the
    // call (as in Xilinx's xcl2.hpp); without the assignment a failing call would go
    // unnoticed -- confirm against the macro definition.

    // Gamma look-up table.
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inVec,      // buffer on the FPGA
                                              CL_TRUE,           // blocking call
                                              0,                 // buffer offset in bytes
                                              vec_in_size_bytes, // size in bytes
                                              gamma_lut));

    // RGB-IR filter weights.
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_R_IR_C1, CL_TRUE, 0, filter1_in_size_bytes, R_IR_C1_wgts));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_R_IR_C2, CL_TRUE, 0, filter1_in_size_bytes, R_IR_C2_wgts));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_B_at_R, CL_TRUE, 0, filter1_in_size_bytes, B_at_R_wgts));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_IR_at_R, CL_TRUE, 0, filter2_in_size_bytes, IR_at_R_wgts));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_IR_at_B, CL_TRUE, 0, filter2_in_size_bytes, IR_at_B_wgts));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_sub_wgts, CL_TRUE, 0, sub_wgts_in_size_bytes, sub_wgts));

    // Decompanding and degamma parameter tables.
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_decompand_params, CL_TRUE, 0,
                                              decompand_params_in_size_bytes, params_decomand));
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_degamma_params, CL_TRUE, 0,
                                              degamma_params_in_size_bytes, params_degamma));

    // Input image: with HDR fusion enabled the fusion weights are sent as well and the
    // interleaved image is used; otherwise the 12-bit image is sent directly.
    if (USE_HDR_FUSION) {
        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inVec_Weights, CL_TRUE, 0, vec_weight_size_bytes, wr_hls));
        OCL_CHECK(err, err = q.enqueueWriteBuffer(imageToDevice, CL_TRUE, 0, image_in_size_bytes,
                                                  interleaved_img.data));
    } else {
        OCL_CHECK(err, err = q.enqueueWriteBuffer(imageToDevice, CL_TRUE, 0, image_in_size_bytes,
                                                  out_img_12bit.data));
    }

    // Final look-up table (no event wait list).
    OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inLut,      // buffer on the FPGA
                                              CL_TRUE,           // blocking call
                                              0,                 // buffer offset in bytes
                                              lut_in_size_bytes, // size in bytes
                                              casted_lut,        // pointer to the data to copy
                                              nullptr));

    // ---- Launch the kernel and time it via the event's profiling counters ----
    // NOTE(review): profiling info is only available if `q` was created with
    // profiling enabled -- confirm where the queue is constructed.
    cl_ulong start = 0;
    cl_ulong end = 0;
    cl::Event event_sp;

    OCL_CHECK(err, err = q.enqueueTask(kernel, nullptr, &event_sp));
    // Wait through the C++ wrapper instead of casting &event_sp to cl_event*.
    OCL_CHECK(err, err = event_sp.wait());
    OCL_CHECK(err, err = event_sp.getProfilingInfo(CL_PROFILING_COMMAND_START, &start));
    OCL_CHECK(err, err = event_sp.getProfilingInfo(CL_PROFILING_COMMAND_END, &end));

    // OpenCL reports profiling timestamps in nanoseconds; print this run in milliseconds
    // and accumulate the raw nanosecond value for the caller's running total.
    const double diff_prof = static_cast<double>(end - start);
    std::cout << (diff_prof / 1000000) << std::endl;
    exec_sum = exec_sum + diff_prof;

    // ---- Device -> host transfers (blocking) ----
    OCL_CHECK(err, err = q.enqueueReadBuffer(imageFromDevice, CL_TRUE, 0, image_out_size_bytes, out_img.data));
    if (USE_RGBIR) {
        OCL_CHECK(err, err = q.enqueueReadBuffer(imageFromDevice_ir, CL_TRUE, 0, image_out_ir_size_bytes,
                                                 out_img_ir.data));
    }
}
Resource Utilization
The following table summarizes the resource utilization of the ISP all_in_one pipeline, generated using the Vitis HLS 2023.1 tool for the ZCU102 board.
| Operating Mode | Operating Frequency (MHz) | Utilization Estimate | | | |
|---|---|---|---|---|---|
| | | BRAM | DSP | CLB Registers | CLB LUT |
| 1 Pixel | 150 | 111 | 302 | 42504 | 44000 |
Performance Estimate
The following table summarizes the performance of the ISP all_in_one pipeline in 1-pixel mode, as generated using the Vitis HLS 2023.1 tool for the ZCU102 board.
The estimated average latency is obtained by running the accelerator for four iterations. The input to the accelerator is a 12-bit non-linearized full-HD (1920x1080) image.
| Operating Mode | Latency Estimate |
|---|---|
| | Average latency (ms) |
| 1 pixel operation (150 MHz) | 22.357 |