c++ - cuda multiple image erosion not work -
i'm trying implement multiple black(0) , white(255) image erosion cuda,i use square (5x5)structure element.the kernel had implemented take unsigned char array buffer in stored nimg images 200x200 px . allow erosion of multiple image simultaneosly make grid 3d structure:
- each block has dimension of strel (5x5)
- the grid has height = image_height/blockdim.y , width = image_width/blockdim.x , z = nimg
i've try implement extending sample.
the problem if store pixels block of threads consider shared buffer shared between threads of block; allow fast memory access, algorithm doesn't work properly.i try change bindex me make mistake,but cannot found solution.
any suggestion?
here's code:
//strel size #define strel_w 5 #define strel_h 5 // distance cente of strel strel width or height #define r (strel_h/2) //size of 2d region each block consider i.e neighborns each thread in block consider #define block_w (strel_w+(2*r)) #define block_h (strel_h+(2*r)) __global__ void erode_multiple_img_sm(unsigned char * buffer_in, unsigned char * buffer_out, int w, int h ){ //array stored in shared memory,that contain pixel neighborns each thread in block consider __shared__ unsigned char fast_acc_arr[block_w*block_h]; // map thread in 3d structure int col = blockidx.x * strel_w + threadidx.x -r ; int row = blockidx.y * strel_h + threadidx.y -r ; int plane = blockidx.z * blockdim.z + threadidx.z; // check if foreground px of strel not contain in region of image size of strel (if 1 px not contain image eroded) bool is_contain = true; // clamp edge of image col = max(0,col); col = min(col,w-1); row = max(0,row); row = min(row,h-1); //map each thread in 1 dim coord map 3d structure(grid) image buffer(1d) unsigned int index = (plane * h * w) + (row * w) + col; unsigned int bindex = threadidx.y * blockdim.y + threadidx.x; //each thread copy pixel of block shared memory (shared thread of block) fast_acc_arr[bindex] = buffer_in[index]; __syncthreads(); //the strel must contain in image, thread.x , thread.y coords of center of mask correspond strel in image, , must contain in image if((threadidx.x >= r) && (threadidx.x < block_w-r) && (threadidx.y >= r) && (threadidx.y <block_h-r)){ for(int dy=-r; dy<=r; dy++){ if(is_contain == false) break; (int dx = -r ; dx <= r; dx++) { //if 1 element in mask different value of strel el --> strel not contain in mask --> center of mask eroded (and it's no necessary consider other el of mask motivation of break) if (fast_acc_arr[bindex + (dy * blockdim.x) + dx ] != 255 ){ buffer_out[index ] = 0; is_contain = false; break; } } } // if strel contain image the center not eroded if(is_contain == true) buffer_out[index] = 255; } }
that kernel settings:
dim3 block(5,5,1); dim3 grid(200/(block.x),200/(block.y),nimg);
my kernel call:
erode_multiple_img_sm<<<grid,block>>>(dimage_src,dimage_dst,200,200);
my image input , output:
input: output(150 buff element):
code without shared memory(low speed):
__global__ void erode_multiple_img(unsigned char * buffer_in, unsigned char * buffer_out, int w,int h ){ int col = blockidx.x * blockdim.x + threadidx.x; int row = blockidx.y * blockdim.y + threadidx.y; int plane = blockidx.z * blockdim.z +threadidx.z; bool is_contain = true; col = max(0,col); col = min(col,w-1); row = max(0,row); row = min(row,h-1); for(int dy=-strel_h/2; dy<=strel_h/2; dy++){ if(is_contain == false) break; (int dx = -strel_w/2 ; dx <= strel_w/2; dx++) { if (buffer_in[(plane * h * w) +( row + dy) * w + (col + dx) ] !=255 ){ buffer_out[(plane * h * w) + row * w + col ] = 0; is_contain = false; break; } } } if(is_contain == true) buffer_out[(plane * h * w) + row * w +col ] = 255; }
updated algorithm
i try follow samples convolution.i change input image, has 512x512 size , wrote algorithm:
#define strel_size 5 #define tile_w 16 #define tile_h 16 #define r (strel_h/2) #define block_w (tile_w+(2*r)) #define block_h (tile_h+(2*r)) __global__ void erode_multiple_img_sm_v2(unsigned char * buffer_in, unsigned char * buffer_out, int w,int h ){ // data cache: threadidx.x , threadidx.y __shared__ unsigned char data[tile_w +strel_size ][tile_w +strel_size ]; // global mem address of thread int col = blockidx.x * blockdim.x + threadidx.x; int row = blockidx.y * blockdim.y + threadidx.y; int plane = blockidx.z * blockdim.z +threadidx.z; int gloc = (plane*h/w)+ row*w +col; bool is_contain = true; // load cache (32x32 shared memory, 16x16 threads blocks) // each threads loads 4 values global memory shared mem int x, y; // image based coordinate if((col<w)&&(row<h)) { data[threadidx.x][threadidx.y]=buffer_in[gloc]; if (threadidx.y > (h-strel_size)) data[threadidx.x][threadidx.y + strel_size]=buffer_in[gloc + strel_size]; if (threadidx.x >(w-strel_size)) data[threadidx.x + strel_size][threadidx.y]=buffer_in[gloc+strel_size]; if ((threadidx.x >(w-strel_size)) && (threadidx.y > (h-strel_size))) data[threadidx.x+strel_size][threadidx.y+strel_size] = buffer_in[gloc+2*strel_size]; //wait threads finish read __syncthreads(); //buffer_out[gloc] = data[threadidx.x][threadidx.y]; unsigned char min_value = 255; for(x=0;x<strel_size;x++){ for(y=0;y<strel_size;y++){ min_value = min( (data[threadidx.x+x][threadidx.y+y]) , min_value); } } buffer_out[gloc]= min_value; }
}
my kernel settings now are:
dim3 block(16,16); dim3 grid(512/(block.x),512/(block.y),nimg);
seems pixels of apron not copyied in ouput buffer
you may want read following links more detailed description , better example code on how implement image convolution cuda kernel function.
http://igm.univ-mlv.fr/~biri/enseignement/mii2/donnees/convolutionseparable.pdf
https://www.evl.uic.edu/sjames/cs525/final.html
basically using convolution filter of size (5 x 5) not mean setting size of thread block (5 x 5).
typically, non-separable convolution, use thread block of size (16 x 16), calculate block of (16 x 16) pixels on output image. achieve need read block of ((2+16+2) x (2+16+2)) pixels input image shared memory, using (16 x 16) threads collaboratively.
Comments
Post a Comment