// [Page 1 of 3]

//Matrix multiplication using multiple blocks

#include<stdio.h>
#include<stdlib.h>
#include<conio.h>
#include<cuda.h>
//Kernel function
/*
 * Matrix multiplication kernel: each thread computes one element of
 * result = first * second. All three matrices are indexed row-major.
 *
 * Parameters:
 *   first      - left operand, read-only; each row holds width1 floats.
 *   second     - right operand, read-only; each row holds width2 floats and
 *                rows 0 .. width1-1 are read.
 *   result     - output; each row holds width2 floats. Must not overlap
 *                first or second (the pointers are declared __restrict__).
 *   tile_width - multiplier applied to blockIdx to obtain the global
 *                row/column; the launch is expected to use thread blocks of
 *                tile_width x tile_width threads.
 *   width1     - number of columns of first (length of the dot product).
 *   width2     - number of columns of second and of result.
 *
 * Launch requirements:
 *   - 2D grid of 2D blocks; x selects the column, y selects the row.
 *   - Columns are guarded against width2, so the grid may overshoot in x.
 *   - The signature carries no row count, so rows cannot be guarded here:
 *     gridDim.y * tile_width must not exceed the number of rows allocated
 *     for first and result.
 *   - No shared memory is used.
 */
__global__ void matrix_mul_blocks(const float * __restrict__ first,
                                  const float * __restrict__ second,
                                  float * __restrict__ result,
                                  int tile_width, int width1, int width2)
{
    const int col = blockIdx.x * tile_width + threadIdx.x;
    const int row = blockIdx.y * tile_width + threadIdx.y;

    // Without this guard a thread past the last column would silently read
    // and write elements belonging to the next row.
    if (col >= width2)
    {
        return;
    }

    // Offsets are formed in size_t so row*width cannot overflow int on
    // large matrices.
    const size_t first_row_offset = (size_t)row * (size_t)width1;

    float sum = 0.0f;
    for (int k = 0; k < width1; k++)
    {
        const float a = first[first_row_offset + (size_t)k];
        const float b = second[(size_t)k * (size_t)width2 + (size_t)col];
        sum = sum + (a * b);
    }

    result[(size_t)row * (size_t)width2 + (size_t)col] = sum;
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Rounds value up to the nearest multiple of step (step must be positive).
static int round_up_to_multiple(int value, int step)
{
    return ((value + step - 1) / step) * step;
}

// Prints a rows x cols row-major matrix: tab-separated values, one row per line.
static void print_matrix(const float *data, int rows, int cols)
{
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            printf("%f\t", data[(size_t)i * cols + j]);
        }
        printf("\n");
    }
}

// Prompts for every element of a rows x cols row-major matrix.
// label is the name shown in the prompt. Returns 1 on success, 0 if a value
// could not be parsed.
static int read_matrix(float *data, int rows, int cols, const char *label)
{
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            printf("Enter the element %s[%d][%d]>>", label, i, j);
            if (scanf("%f", &data[(size_t)i * cols + j]) != 1)
            {
                return 0;
            }
        }
    }
    return 1;
}

/*
 * Interactive driver: reads two matrix sizes, fills the matrices either
 * automatically or from user input, multiplies them on the GPU with
 * matrix_mul_blocks and prints the operands and the product.
 *
 * Returns 0 on success and 1 on invalid input or host allocation failure.
 * A failing CUDA call terminates the program through check_cuda.
 */
int main()
{
    float *a_h, *b_h, *c_h, *a_d, *b_d, *c_d;
    int row1, col1, row2, col2;
    int tile_width, width1, width2;
    char option;

    //acquiring the size
    printf("Enter the size of the first matrix(row col)>>");
    if (scanf("%d %d", &row1, &col1) != 2 || row1 <= 0 || col1 <= 0)
    {
        fprintf(stderr, "Invalid size for the first matrix\n");
        return 1;
    }
    printf("Enter the size of the second matrix(row col)>>");
    if (scanf("%d %d", &row2, &col2) != 2 || row2 <= 0 || col2 <= 0)
    {
        fprintf(stderr, "Invalid size for the second matrix\n");
        return 1;
    }
    // The kernel walks col1 rows of the second matrix, so a mismatch would
    // read past the end of it.
    if (col1 != row2)
    {
        fprintf(stderr, "Cannot multiply: columns of the first matrix (%d) "
                        "must equal rows of the second matrix (%d)\n", col1, row2);
        return 1;
    }

    //memory allocation on host
    a_h = (float*)malloc(sizeof(float) * (size_t)row1 * (size_t)col1);
    b_h = (float*)malloc(sizeof(float) * (size_t)row2 * (size_t)col2);
    c_h = (float*)malloc(sizeof(float) * (size_t)row1 * (size_t)col2);
    if (a_h == NULL || b_h == NULL || c_h == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return 1;
    }

    //getting the data on host
    int inputs_ok = 0;
    printf("Want to enter automatically(Y/N)>>");
    if (scanf(" %c", &option) == 1)
    {
        if (option == 'Y' || option == 'y')
        {
            for (int i = 0; i < row1; i++)
            {
                for (int j = 0; j < col1; j++)
                {
                    a_h[(size_t)i * col1 + j] = (float)(i + j);
                }
            }
            for (int i = 0; i < row2; i++)
            {
                for (int j = 0; j < col2; j++)
                {
                    b_h[(size_t)i * col2 + j] = (float)(i * j);
                }
            }
            inputs_ok = 1;
        }
        else if (option == 'N' || option == 'n')
        {
            printf("Enter the data for the first matrix\n");
            inputs_ok = read_matrix(a_h, row1, col1, "a_h");
            if (inputs_ok)
            {
                printf("Enter the data for the second matrix\n");
                inputs_ok = read_matrix(b_h, row2, col2, "b_h");
            }
        }
    }
    if (!inputs_ok)
    {
        // Any other answer would leave the matrices uninitialised.
        fprintf(stderr, "Invalid input: expected Y or N, followed by numeric elements\n");
        free(a_h);
        free(b_h);
        free(c_h);
        return 1;
    }

    //printing the data entered
    print_matrix(a_h, row1, col1);
    printf("\n");
    print_matrix(b_h, row2, col2);
    printf("\n");

    // The kernel has no row bound, so the device buffers are padded up to a
    // whole number of tiles in both output dimensions. Every launched thread
    // then stays inside the allocations, and odd sizes are fully computed.
    tile_width = 2;
    const int padded_rows = round_up_to_multiple(row1, tile_width);
    const int padded_cols = round_up_to_multiple(col2, tile_width);

    const size_t a_bytes = sizeof(float) * (size_t)padded_rows * (size_t)col1;
    const size_t b_bytes = sizeof(float) * (size_t)row2 * (size_t)padded_cols;
    const size_t c_bytes = sizeof(float) * (size_t)padded_rows * (size_t)padded_cols;

    //memory allocation on device
    check_cuda(cudaMalloc((void**)&a_d, a_bytes), "cudaMalloc a_d");
    check_cuda(cudaMalloc((void**)&b_d, b_bytes), "cudaMalloc b_d");
    check_cuda(cudaMalloc((void**)&c_d, c_bytes), "cudaMalloc c_d");

    // Zero the padding so the extra threads read defined values.
    check_cuda(cudaMemset(a_d, 0, a_bytes), "cudaMemset a_d");
    check_cuda(cudaMemset(b_d, 0, b_bytes), "cudaMemset b_d");

    //copying the data to device
    // A keeps its row length (padding is extra rows at the end), so a flat
    // copy is enough. B gets wider rows on the device and needs a 2D copy.
    check_cuda(cudaMemcpy(a_d, a_h, sizeof(float) * (size_t)row1 * (size_t)col1,
                          cudaMemcpyHostToDevice), "cudaMemcpy a_h -> a_d");
    check_cuda(cudaMemcpy2D(b_d, sizeof(float) * (size_t)padded_cols,
                            b_h, sizeof(float) * (size_t)col2,
                            sizeof(float) * (size_t)col2, (size_t)row2,
                            cudaMemcpyHostToDevice), "cudaMemcpy2D b_h -> b_d");

    //determining width
    width1 = col1;
    width2 = padded_cols;   // row length of b_d and c_d on the device

    //defining the no of blocks and threads per block
    dim3 dimGrid(padded_cols / tile_width, padded_rows / tile_width, 1);
    dim3 dimBlock(tile_width, tile_width, 1);

    //call to the kernel function
    matrix_mul_blocks<<<dimGrid, dimBlock>>>(a_d, b_d, c_d, tile_width, width1,
                                             width2);
    check_cuda(cudaGetLastError(), "matrix_mul_blocks launch");
    check_cuda(cudaDeviceSynchronize(), "matrix_mul_blocks execution");

    //retrieving data from device
    // Only the row1 x col2 region is copied back; the padding is dropped.
    check_cuda(cudaMemcpy2D(c_h, sizeof(float) * (size_t)col2,
                            c_d, sizeof(float) * (size_t)padded_cols,
                            sizeof(float) * (size_t)col2, (size_t)row1,
                            cudaMemcpyDeviceToHost), "cudaMemcpy2D c_d -> c_h");

    //displaying the result
    print_matrix(c_h, row1, col2);
    getch();

    //deallocating the memory
    free(a_h);
    free(b_h);
    free(c_h);
    check_cuda(cudaFree(a_d), "cudaFree a_d");
    check_cuda(cudaFree(b_d), "cudaFree b_d");
    check_cuda(cudaFree(c_d), "cudaFree c_d");
    return 0;
}

// [You may also like]