Create and Launch Kernel in the Testbench - 2024.1 English

Vitis Libraries

Release Date
2024-08-06
Version
2024.1 English

The histogram needs two frames to populate the histogram array and to get correct results in the auto exposure frame. Auto white balance, GTM, and the other tone-mapping functions each need one extra frame to populate their parameters and apply those parameters to get a correct image. For the specific example below, four iterations are needed because the AEC, AWB, and LTM modules are selected.

    // Create a kernel:
    OCL_CHECK(err, cl::Kernel kernel(program, "ISPPipeline_accel", &err));

    // NOTE(review): OCL_CHECK is defined outside this snippet. It is assumed to
    // inspect `err` after evaluating its second argument, so every call below
    // assigns its returned status to `err`; otherwise a failing enqueue would
    // go unnoticed.

    // The kernel is launched loop_count times. Each pass re-sends every input
    // buffer, runs the kernel once, and reads the result image(s) back.
    int loop_count = 4;
    for (int i = 0; i < loop_count; i++) {
        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inVec,      // buffer on the FPGA
                                                  CL_TRUE,           // blocking call
                                                  0,                 // buffer offset in bytes
                                                  vec_in_size_bytes, // Size in bytes
                                                  gamma_lut));       // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_R_IR_C1,        // buffer on the FPGA
                                                  CL_TRUE,               // blocking call
                                                  0,                     // buffer offset in bytes
                                                  filter1_in_size_bytes, // Size in bytes
                                                  R_IR_C1_wgts));        // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_R_IR_C2,        // buffer on the FPGA
                                                  CL_TRUE,               // blocking call
                                                  0,                     // buffer offset in bytes
                                                  filter1_in_size_bytes, // Size in bytes
                                                  R_IR_C2_wgts));        // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_B_at_R,         // buffer on the FPGA
                                                  CL_TRUE,               // blocking call
                                                  0,                     // buffer offset in bytes
                                                  filter1_in_size_bytes, // Size in bytes
                                                  B_at_R_wgts));         // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_IR_at_R,        // buffer on the FPGA
                                                  CL_TRUE,               // blocking call
                                                  0,                     // buffer offset in bytes
                                                  filter2_in_size_bytes, // Size in bytes
                                                  IR_at_R_wgts));        // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_IR_at_B,        // buffer on the FPGA
                                                  CL_TRUE,               // blocking call
                                                  0,                     // buffer offset in bytes
                                                  filter2_in_size_bytes, // Size in bytes
                                                  IR_at_B_wgts));        // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_sub_wgts,        // buffer on the FPGA
                                                  CL_TRUE,                // blocking call
                                                  0,                      // buffer offset in bytes
                                                  sub_wgts_in_size_bytes, // Size in bytes
                                                  sub_wgts));             // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_decompand_params,        // buffer on the FPGA
                                                  CL_TRUE,                        // blocking call
                                                  0,                              // buffer offset in bytes
                                                  decompand_params_in_size_bytes, // Size in bytes
                                                  params_decomand));              // Pointer to the data to copy

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_degamma_params,        // buffer on the FPGA
                                                  CL_TRUE,                      // blocking call
                                                  0,                            // buffer offset in bytes
                                                  degamma_params_in_size_bytes, // Size in bytes
                                                  params_degamma));             // Pointer to the data to copy

        // The input image source depends on whether HDR fusion is enabled:
        // with fusion, the fusion weights and the interleaved image are sent;
        // without it, the 12-bit image is sent.
        if (USE_HDR_FUSION) {
            OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inVec_Weights,  // buffer on the FPGA
                                                      CL_TRUE,               // blocking call
                                                      0,                     // buffer offset in bytes
                                                      vec_weight_size_bytes, // Size in bytes
                                                      wr_hls));              // Pointer to the data to copy

            OCL_CHECK(err, err = q.enqueueWriteBuffer(imageToDevice, CL_TRUE, 0,
                                                      image_in_size_bytes, interleaved_img.data));
        } else {
            OCL_CHECK(err, err = q.enqueueWriteBuffer(imageToDevice, CL_TRUE, 0,
                                                      image_in_size_bytes, out_img_12bit.data));
        }

        OCL_CHECK(err, err = q.enqueueWriteBuffer(buffer_inLut,      // buffer on the FPGA
                                                  CL_TRUE,           // blocking call
                                                  0,                 // buffer offset in bytes
                                                  lut_in_size_bytes, // Size in bytes
                                                  casted_lut,        // Pointer to the data to copy
                                                  nullptr));         // no events to wait on

        // Profiling Objects
        cl_ulong start = 0;
        cl_ulong end = 0;
        double diff_prof = 0.0;
        cl::Event event_sp;

        // Launch the kernel
        OCL_CHECK(err, err = q.enqueueTask(kernel, nullptr, &event_sp));

        // Block until the kernel has finished before querying its timestamps.
        OCL_CHECK(err, err = event_sp.wait());

        // NOTE(review): profiling queries only succeed when the command queue
        // was created with profiling enabled; the queue creation is outside
        // this snippet - confirm.
        OCL_CHECK(err, err = event_sp.getProfilingInfo(CL_PROFILING_COMMAND_START, &start));
        OCL_CHECK(err, err = event_sp.getProfilingInfo(CL_PROFILING_COMMAND_END, &end));

        // OpenCL profiling timestamps are in nanoseconds, so dividing by 1e6
        // prints this run's kernel time in milliseconds. exec_sum accumulates
        // the raw nanosecond durations across iterations.
        diff_prof = static_cast<double>(end - start);
        std::cout << (diff_prof / 1000000) << std::endl;
        exec_sum = exec_sum + diff_prof;

        // Copying Device result data to Host memory
        OCL_CHECK(err, err = q.enqueueReadBuffer(imageFromDevice, CL_TRUE, 0,
                                                 image_out_size_bytes, out_img.data));

        if (USE_RGBIR) {
            OCL_CHECK(err, err = q.enqueueReadBuffer(imageFromDevice_ir, CL_TRUE, 0,
                                                     image_out_ir_size_bytes, out_img_ir.data));
        }
    }

Resource Utilization

The following table summarizes the resource utilization of the ISP all_in_one pipeline, generated using the Vitis HLS 2023.1 tool on the ZCU102 board.

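Table 261: ISP all_in_one Resource Utilization Summary

| Operating Mode | Operating Frequency (MHz) | Utilization Estimate: BRAM | Utilization Estimate: DSP | Utilization Estimate: CLB Registers | Utilization Estimate: CLB LUT |
|----------------|---------------------------|----------------------------|---------------------------|-------------------------------------|-------------------------------|
| 1 Pixel        | 150                       | 111                        | 302                       | 42504                               | 44000                         |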

Performance Estimate

The following table summarizes the performance of the ISP all_in_one pipeline in 1-pixel mode, as generated using the Vitis HLS 2023.1 tool on the ZCU102 board.

Estimated average latency is obtained by running the accel with four iterations. The input to the accel is a 12-bit non-linearized full-HD (1920x1080) image.

Table 262: ISP all_in_one Performance Estimate Summary

| Operating Mode              | Latency Estimate: Average latency (ms) |
|-----------------------------|----------------------------------------|
| 1 pixel operation (150 MHz) | 22.357                                 |