Navigate to
$LAB_WORK_DIR/reference_files
, and with a file editor, openrun_sw_overlap_multiDDR.cpp
.From the host code, you will need to send the words to both DDR banks alternatively. The DDR bank assignment in the host code is supported by a AMD vendor extension to the OpenCL™ API. Two AMD extension pointer objects (
cl_mem_ext_ptr_t
) are created,buffer_words_ext[0]
andbuffer_words_ext[1]
. Theflags
will determine which DDR bank the buffer will be send to, so that kernel can access it.cl_mem_ext_ptr_t buffer_words_ext[2]; buffer_words_ext[0].flags = 1 | XCL_MEM_TOPOLOGY; // DDR[1] buffer_words_ext[0].param = 0; buffer_words_ext[0].obj = input_doc_words; buffer_words_ext[1].flags = 2 | XCL_MEM_TOPOLOGY; // DDR[2] buffer_words_ext[1].param = 0; buffer_words_ext[1].obj = input_doc_words;
The next two buffers,
buffer_doc_words[0]
andbuffer_doc_words[1]
, are created inDDR[1]
andDDR[2]
as follows.buffer_doc_words[0] = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint), &buffer_words_ext[0]); buffer_doc_words[1] = cl::Buffer(context, CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, total_size*sizeof(uint), &buffer_words_ext[1]); buffer_inh_flags = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, total_size*sizeof(char),output_inh_flags); buffer_bloom_filter = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, bloom_filter_size*sizeof(uint),bloom_filter); // Set buffer kernel arguments (needed to migrate the buffers in the correct memory) kernel.setArg(0, buffer_inh_flags); kernel.setArg(1, buffer_doc_words[0]); kernel.setArg(2, buffer_bloom_filter); // Make buffers resident in the device q.enqueueMigrateMemObjects({buffer_bloom_filter, buffer_doc_words[0], buffer_doc_words[1], buffer_inh_flags}, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED); // Create sub-buffers, one for each transaction unsigned subbuf_doc_sz = total_doc_size/num_iter; unsigned subbuf_inh_sz = total_doc_size/num_iter; cl_buffer_region subbuf_inh_info[num_iter]; cl_buffer_region subbuf_doc_info[num_iter]; cl::Buffer subbuf_inh_flags[num_iter]; cl::Buffer subbuf_doc_words[num_iter]; for (int i=0; i<num_iter; i++) { subbuf_inh_info[i]={i*subbuf_inh_sz*sizeof(char), subbuf_inh_sz*sizeof(char)}; subbuf_doc_info[i]={i*subbuf_doc_sz*sizeof(uint), subbuf_doc_sz*sizeof(uint)}; subbuf_inh_flags[i] = buffer_inh_flags.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &subbuf_inh_info[i]); // The doc words sub-buffers will be alternating in DDR[1] and DDR[2] subbuf_doc_words[i] = buffer_doc_words[i%2].createSubBuffer (CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &subbuf_doc_info[i]); }
The kernel argument, input word is set to the array of sub-buffers created from
buffer_doc_words[0]
andbuffer_doc_words[1]
alternatively; hence, data is sent to DDR bank 1 and 2 alternatively in each kernel execution.for (int i=0; i<num_iter; i++) { cl::Event buffDone, krnlDone, flagDone; total_size = subbuf_doc_info[i].size / sizeof(uint); load_filter = false; kernel.setArg(0, subbuf_inh_flags[i]); kernel.setArg(1, subbuf_doc_words[i]); kernel.setArg(3, total_size); kernel.setArg(4, load_filter); q.enqueueMigrateMemObjects({subbuf_doc_words[i]}, 0, &wordWait, &buffDone); wordWait.push_back(buffDone); q.enqueueTask(kernel, &wordWait, &krnlDone); krnlWait.push_back(krnlDone); q.enqueueMigrateMemObjects({subbuf_inh_flags[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &krnlWait, &flagDone); flagWait.push_back(flagDone); }