c++ - cuda multiple image erosion not work -


i'm trying implement multiple black(0) , white(255) image erosion cuda,i use square (5x5)structure element.the kernel had implemented take unsigned char array buffer in stored nimg images 200x200 px . allow erosion of multiple image simultaneosly make grid 3d structure:

  • each block has dimension of strel (5x5)
  • the grid has height = image_height/blockdim.y , width = image_width/blockdim.x , z = nimg

i've try implement extending sample.

the problem if store pixels block of threads consider shared buffer shared between threads of block; allow fast memory access, algorithm doesn't work properly.i try change bindex me make mistake,but cannot found solution.

any suggestion?

here's code:

//strel size #define strel_w 5 #define strel_h 5  // distance cente of strel strel width or height #define r (strel_h/2)  //size of 2d region each block consider i.e  neighborns each thread in block consider #define block_w (strel_w+(2*r)) #define block_h (strel_h+(2*r))  __global__ void erode_multiple_img_sm(unsigned char * buffer_in,                                        unsigned char * buffer_out,                                        int w,                                        int h ){      //array stored in shared memory,that contain pixel neighborns each thread in block consider      __shared__ unsigned char fast_acc_arr[block_w*block_h];      // map thread in 3d structure      int col = blockidx.x * strel_w + threadidx.x -r ;     int row = blockidx.y * strel_h + threadidx.y -r ;     int plane = blockidx.z * blockdim.z + threadidx.z;      // check if foreground px of strel not contain in region of image size of strel (if 1 px not contain image eroded)     bool is_contain = true;       // clamp edge of image     col = max(0,col);     col = min(col,w-1);      row = max(0,row);     row = min(row,h-1);      //map each thread in 1 dim coord map 3d structure(grid) image buffer(1d)      unsigned int index =  (plane * h * w) + (row * w) + col;      unsigned int bindex =  threadidx.y * blockdim.y + threadidx.x;       //each thread copy pixel of block shared memory (shared thread of block)     fast_acc_arr[bindex] = buffer_in[index];      __syncthreads();        //the strel must contain in image, thread.x , thread.y coords of center of mask correspond strel in image, , must contain in image     if((threadidx.x >= r) && (threadidx.x < block_w-r) && (threadidx.y >= r) && (threadidx.y <block_h-r)){          for(int dy=-r; dy<=r; dy++){             if(is_contain == false)             break;              (int dx = -r ; dx <= r; dx++) {                  //if 1 element in mask different value of strel el --> strel not contain in mask --> center of mask eroded (and it's no necessary consider other el of mask motivation of break)                 if (fast_acc_arr[bindex + (dy * blockdim.x) + dx ] != 255 ){                      buffer_out[index ] = 0;                     is_contain = false;                     break;                  }              }          }         // if strel contain image the center not eroded         if(is_contain == true)             buffer_out[index] = 255;       }  } 

that kernel settings:

 dim3 block(5,5,1);  dim3 grid(200/(block.x),200/(block.y),nimg); 

my kernel call:

 erode_multiple_img_sm<<<grid,block>>>(dimage_src,dimage_dst,200,200); 

my image input , output:

input: input output(150 buff element): output

code without shared memory(low speed):

__global__ void erode_multiple_img(unsigned char * buffer_in,                             unsigned char * buffer_out,                             int w,int h ){        int col = blockidx.x * blockdim.x + threadidx.x;     int row = blockidx.y * blockdim.y + threadidx.y;     int plane = blockidx.z * blockdim.z +threadidx.z;      bool is_contain = true;     col = max(0,col);     col = min(col,w-1);      row = max(0,row);     row = min(row,h-1);        for(int dy=-strel_h/2; dy<=strel_h/2; dy++){         if(is_contain == false)             break;         (int dx = -strel_w/2 ; dx <= strel_w/2; dx++) {             if (buffer_in[(plane * h * w) +( row + dy) * w + (col + dx) ] !=255 ){                 buffer_out[(plane * h * w) + row * w + col ] = 0;                 is_contain = false;                 break;             }         }     }     if(is_contain == true)         buffer_out[(plane * h * w) + row * w +col ] = 255;  } 

updated algorithm

i try follow samples convolution.i change input image, has 512x512 size , wrote algorithm:

#define strel_size 5   #define tile_w 16 #define tile_h 16  #define r (strel_h/2)  #define block_w (tile_w+(2*r)) #define block_h (tile_h+(2*r))  __global__ void erode_multiple_img_sm_v2(unsigned char * buffer_in,                             unsigned char * buffer_out,                             int w,int h ){      // data cache: threadidx.x , threadidx.y     __shared__ unsigned char data[tile_w +strel_size ][tile_w +strel_size ];      // global mem address of thread     int col = blockidx.x * blockdim.x + threadidx.x;     int row = blockidx.y * blockdim.y + threadidx.y;      int plane = blockidx.z * blockdim.z +threadidx.z;      int gloc = (plane*h/w)+ row*w +col;      bool is_contain = true;      // load cache (32x32 shared memory, 16x16 threads blocks)     // each threads loads 4 values global memory shared mem     int x, y;   // image based coordinate        if((col<w)&&(row<h)) {         data[threadidx.x][threadidx.y]=buffer_in[gloc];      if (threadidx.y > (h-strel_size))         data[threadidx.x][threadidx.y + strel_size]=buffer_in[gloc + strel_size];      if (threadidx.x >(w-strel_size))         data[threadidx.x + strel_size][threadidx.y]=buffer_in[gloc+strel_size];      if ((threadidx.x >(w-strel_size)) && (threadidx.y > (h-strel_size)))         data[threadidx.x+strel_size][threadidx.y+strel_size] =   buffer_in[gloc+2*strel_size];     //wait threads finish read      __syncthreads();        //buffer_out[gloc] = data[threadidx.x][threadidx.y];        unsigned char min_value = 255;       for(x=0;x<strel_size;x++){           for(y=0;y<strel_size;y++){               min_value = min( (data[threadidx.x+x][threadidx.y+y]) , min_value);               }           }       buffer_out[gloc]= min_value;       } 

}

my kernel settings now are:

dim3 block(16,16); dim3 grid(512/(block.x),512/(block.y),nimg); 

input: input

output: output

seems pixels of apron not copyied in ouput buffer

you may want read following links more detailed description , better example code on how implement image convolution cuda kernel function.

http://igm.univ-mlv.fr/~biri/enseignement/mii2/donnees/convolutionseparable.pdf

https://www.evl.uic.edu/sjames/cs525/final.html

basically using convolution filter of size (5 x 5) not mean setting size of thread block (5 x 5).

typically, non-separable convolution, use thread block of size (16 x 16), calculate block of (16 x 16) pixels on output image. achieve need read block of ((2+16+2) x (2+16+2)) pixels input image shared memory, using (16 x 16) threads collaboratively.


Comments

Popular posts from this blog

sql - invalid in the select list because it is not contained in either an aggregate function -

Angularjs unit testing - ng-disabled not working when adding text to textarea -

How to start daemon on android by adb -