I'm trying to cyclically distribute row blocks of a matrix (stored as a 1D array) across processes using MPI_Type_create_darray.
After creating the datatype and inspecting it with MPI_Type_get_true_extent and MPI_Type_size, everything appears to be computed correctly. However, when I actually send the data using MPI_Scatterv, all processes receive the same data; the offsets are not being applied during communication. For example, with N = 6, three processes, and ROWS_PER_BLOCK = 2, I expect rank 0 to receive rows 0-1, rank 1 rows 2-3, and rank 2 rows 4-5.
Here is a minimal example, ready to compile and execute:
#mpicc P2_SO.c -o P2 #COMPILE
#mpirun -np 3 ./P2 6 #EXECUTE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <mpi.h>

#define ROOT_RANK 0
#define ROWS_PER_BLOCK 2

int main(int argc, char **argv)
{
    // Obtain the matrix size from the first argument
    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s <N>\n", argv[0]);
        return 1;
    }
    int N = atoi(argv[1]);

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the rank of the process
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // Get the number of processes
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int gsize[1] = {N * N};                   // global array: N*N doubles, viewed as 1D
    int distrib[1] = {MPI_DISTRIBUTE_CYCLIC}; // distribute blocks cyclically
    int dargs[1] = {ROWS_PER_BLOCK * N};      // block size: ROWS_PER_BLOCK rows of N doubles
    int psize[1] = {world_size};              // 1D process grid

    // Build the darray type describing the blocks this rank owns in the global array
    MPI_Datatype myType;
    MPI_Type_create_darray(world_size, world_rank, 1,
                           gsize,
                           distrib,
                           dargs,
                           psize,
                           MPI_ORDER_C,
                           MPI_DOUBLE,
                           &myType);
    MPI_Type_commit(&myType);

    ///// DEBUG //////
    MPI_Aint lb, extent;
    MPI_Type_get_extent(myType, &lb, &extent);
    MPI_Aint true_lb, true_extent;
    MPI_Type_get_true_extent(myType, &true_lb, &true_extent);
    int count_bytes;
    MPI_Type_size(myType, &count_bytes);
    int num_elements = count_bytes / (int)sizeof(double);
    ///// DEBUG //////

    int *sendcounts = malloc((size_t)world_size * sizeof(int));
    int *displs = malloc((size_t)world_size * sizeof(int));
    for (int i = 0; i < world_size; i++)
    {
        sendcounts[i] = 1; // each process receives one instance of myType
        displs[i] = 0;
    }

    double *buf = malloc((size_t)num_elements * sizeof(double)); // receive buffer for this rank

    if (world_rank == ROOT_RANK)
    {
        printf("Process %d: true_lb = %ld == %d\n", world_rank,
               (long)(true_lb / (MPI_Aint)sizeof(double)), N * ROWS_PER_BLOCK * world_rank);
        printf("Process %d has %d elements\n", world_rank, num_elements);

        // Allocate memory and initialize the matrix
        double *A = malloc(sizeof(double) * (size_t)N * (size_t)N);
        for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < N; j++)
            {
                A[i * N + j] = i * N + j;
            }
        }

        MPI_Scatterv(A, sendcounts, displs, myType,
                     buf, num_elements, MPI_DOUBLE,
                     ROOT_RANK, MPI_COMM_WORLD);
        free(A);
    }
    else
    {
        sleep((unsigned int)world_rank); // stagger output so prints don't interleave
        printf("\nPROCESS %d:\n", world_rank);
        printf("Process %d: true_lb = %ld == %d\n", world_rank,
               (long)(true_lb / (MPI_Aint)sizeof(double)), N * ROWS_PER_BLOCK * world_rank);
        printf("Process %d has %d elements\n", world_rank, num_elements);

        MPI_Scatterv(NULL, sendcounts, displs, myType,
                     buf, num_elements, MPI_DOUBLE,
                     ROOT_RANK, MPI_COMM_WORLD);
    }

    for (int i = 0; i < num_elements; i++)
    {
        printf("Position %d of process %d has value %f\n", i, world_rank, buf[i]);
    }

    free(sendcounts);
    free(displs);
    free(buf);
    MPI_Type_free(&myType);
    MPI_Finalize();
    return 0;
}
The displs are all 0. What did you expect?
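That comment points at the root cause: MPI_Scatterv takes a single sendtype for every destination, and the root's myType was built with world_rank == ROOT_RANK, so with all displs at 0 every rank is sent the root's own blocks. A darray type already carries each rank's offsets inside its typemap, so one way out is to drop MPI_Scatterv and have the root post one send per destination using that rank's darray type. Below is a minimal sketch of that idea, not a drop-in patch: it replaces the two MPI_Scatterv calls and reuses gsize, distrib, dargs, psize, A, buf, and num_elements from the question (the tag 0 is arbitrary).

if (world_rank == ROOT_RANK)
{
    MPI_Request *reqs = malloc((size_t)world_size * sizeof(MPI_Request));
    for (int r = 0; r < world_size; r++)
    {
        // One darray type per destination: its typemap holds rank r's offsets
        MPI_Datatype rankType;
        MPI_Type_create_darray(world_size, r, 1, gsize, distrib, dargs,
                               psize, MPI_ORDER_C, MPI_DOUBLE, &rankType);
        MPI_Type_commit(&rankType);
        // count = 1 of rankType: the offsets live inside the typemap,
        // so the send starts at A with no extra displacement
        MPI_Isend(A, 1, rankType, r, 0, MPI_COMM_WORLD, &reqs[r]);
        MPI_Type_free(&rankType); // deallocation is deferred until the send completes
    }
    // Root receives its own chunk, matching the Isend to rank 0 above
    MPI_Recv(buf, num_elements, MPI_DOUBLE, ROOT_RANK, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Waitall(world_size, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
}
else
{
    MPI_Recv(buf, num_elements, MPI_DOUBLE, ROOT_RANK, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

Each rank computed num_elements from its own darray type, so the receive counts match what the root sends even if the cyclic distribution is uneven. Alternatively, you could keep MPI_Scatterv but build the send type yourself, e.g. an MPI_Type_vector over blocks of ROWS_PER_BLOCK * N doubles, resized with MPI_Type_create_resized so its extent is one block, and then pass real block indices in displs; a darray type doesn't compose with Scatterv this way because its extent covers the whole global array.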