问题
I am reading "Using MPI" and trying to run the code myself. Chapter 6.3 contains a grid decomposition code. It compiles with no warnings or errors and runs with a small number of processes, but fails with larger numbers, say 30, on my laptop. My laptop has 4 hyperthreaded cores and 8 GB of RAM. Neither version of la_grid_2d_new works: the first one tolerates a slightly larger number of processes, say 35, but fails for 40. I am not sure why. Could you help me, please? Thanks a lot.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
/*
 * One process's view of a two-dimensional process grid.
 *
 * P and Q are the grid extents; (p, q) is this process's position in
 * the grid.  The three communicators give access to the whole grid,
 * to the processes sharing this process's row coordinate p, and to
 * the processes sharing its column coordinate q.
 */
typedef struct
{
    int P;               /* extent of the grid in the first dimension  */
    int Q;               /* extent of the grid in the second dimension */
    int p;               /* this process's first coordinate            */
    int q;               /* this process's second coordinate           */
    MPI_Comm grid_comm;  /* communicator the grid was built from       */
    MPI_Comm row_comm;   /* processes with the same p                  */
    MPI_Comm col_comm;   /* processes with the same q                  */
} LA_Grid_2d;
/*
 * Build a P x Q process grid on top of `comm` with MPI_Comm_split.
 *
 * Ranks of `comm` are mapped to grid coordinates in row-major order:
 * p = rank / Q, q = rank % Q.  Two sub-communicators are created: one
 * joining the processes that share p (ordered by q) and one joining the
 * processes that share q (ordered by p).
 *
 * This is a collective operation: every process of `comm` must call it
 * with the same P and Q.
 *
 * Returns a heap-allocated descriptor to be released with
 * la_grid_2d_delete(), or NULL when P or Q is not positive.  `comm`
 * itself is stored in the descriptor and remains owned by the caller.
 */
LA_Grid_2d *la_grid_2d_new(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    int my_rank;

    /* Q is a divisor below; a non-positive extent is never a valid grid. */
    if (P <= 0 || Q <= 0)
        return NULL;

    grid = malloc(sizeof *grid);
    if (grid == NULL) {
        /*
         * The splits below are collective; silently returning on one
         * process would leave the others blocked, so abort the job.
         */
        MPI_Abort(comm, EXIT_FAILURE);
        return NULL;
    }

    MPI_Comm_rank(comm, &my_rank);

    grid->grid_comm = comm;
    grid->P = P;
    grid->Q = Q;
    grid->p = my_rank / Q;
    grid->q = my_rank % Q;

    /* Same p -> same row communicator, ranked by q (and vice versa). */
    MPI_Comm_split(comm, grid->p, grid->q, &grid->row_comm);
    MPI_Comm_split(comm, grid->q, grid->p, &grid->col_comm);

    return grid;
}
/*
 * Build a P x Q process grid on top of `comm` using a Cartesian
 * topology (MPI_Cart_create / MPI_Cart_sub).
 *
 * The topology is non-periodic and is created with reordering allowed,
 * so a process's rank in the Cartesian communicator may differ from its
 * rank in `comm`.  The grid coordinates are therefore looked up with the
 * rank obtained from the Cartesian communicator itself.
 *
 * This is a collective operation: every process of `comm` must call it
 * with the same P and Q.
 *
 * Returns a heap-allocated descriptor to be released with
 * la_grid_2d_delete().  Returns NULL when P or Q is not positive, or on
 * a process that MPI_Cart_create left out of the grid (it hands such
 * processes MPI_COMM_NULL).  `comm` itself is stored in the descriptor
 * and remains owned by the caller.
 *
 * NOTE(review): because reordering is allowed, (p, q) describe the
 * position in the Cartesian topology and need not match a row-major
 * numbering of the ranks in grid_comm.
 */
LA_Grid_2d *la_grid_2d_new_II(MPI_Comm comm, int P, int Q)
{
    LA_Grid_2d *grid;
    MPI_Comm comm_2d, row, col;
    int cart_rank;
    int dims[2], local[2], period[2] = {0, 0}, remain_dims[2];

    if (P <= 0 || Q <= 0)
        return NULL;

    dims[0] = P;
    dims[1] = Q;
    MPI_Cart_create(comm, 2, dims, period, 1, &comm_2d);

    /* Processes that are not part of the P x Q grid get no topology. */
    if (comm_2d == MPI_COMM_NULL)
        return NULL;

    /* Use the rank inside comm_2d: reordering may have renumbered us. */
    MPI_Comm_rank(comm_2d, &cart_rank);
    MPI_Cart_coords(comm_2d, cart_rank, 2, local);

    /* Keep only the second dimension: processes sharing p. */
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(comm_2d, remain_dims, &row);

    /* Keep only the first dimension: processes sharing q. */
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(comm_2d, remain_dims, &col);

    /* The sub-communicators are independent of their parent; release it. */
    MPI_Comm_free(&comm_2d);

    grid = malloc(sizeof *grid);
    if (grid == NULL) {
        /* Other processes will go on to collectives; abort the whole job. */
        MPI_Abort(comm, EXIT_FAILURE);
        return NULL;
    }

    grid->grid_comm = comm;
    grid->row_comm = row;
    grid->col_comm = col;
    grid->P = P;
    grid->Q = Q;
    grid->p = local[0];
    grid->q = local[1];
    return grid;
}
/*
 * Release a grid descriptor created by la_grid_2d_new() or
 * la_grid_2d_new_II().
 *
 * Frees the row and column communicators that the constructors created,
 * then the descriptor itself.  grid_comm is the caller's communicator
 * and is left untouched.  Passing NULL is a no-op.
 *
 * MPI_Comm_free is collective, so every process holding a descriptor
 * must call this, and it must be called before MPI_Finalize().
 */
void la_grid_2d_delete(LA_Grid_2d *grid)
{
    if (grid == NULL)
        return;

    if (grid->row_comm != MPI_COMM_NULL)
        MPI_Comm_free(&grid->row_comm);
    if (grid->col_comm != MPI_COMM_NULL)
        MPI_Comm_free(&grid->col_comm);

    free(grid);
}
/*
 * Test driver: lets MPI choose a 2-D factorisation of the job size,
 * builds the process grid, and sums the world ranks along this
 * process's row and column.  World rank 0 prints the results.
 */
int main(int argc, char **argv)
{
    LA_Grid_2d *pgrid;
    int size, rank, dims[2] = {0, 0};
    /*
     * MPI_Reduce stores the result only on the root of each
     * sub-communicator.  World rank 0 prints these values, and it is not
     * guaranteed to be that root when the grid allows rank reordering,
     * so start from a defined value.
     */
    int row = 0, col = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        printf("size=%d rank=%d\n", size, rank);
    }

    /* Zero entries in dims let MPI pick a balanced P x Q with P*Q == size. */
    MPI_Dims_create(size, 2, dims);

    /* Alternative constructor based on MPI_Comm_split:
     *   pgrid = la_grid_2d_new(MPI_COMM_WORLD, dims[0], dims[1]);
     */
    pgrid = la_grid_2d_new_II(MPI_COMM_WORLD, dims[0], dims[1]);
    if (pgrid == NULL) {
        fprintf(stderr, "rank %d: could not create the process grid\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        return EXIT_FAILURE;
    }

    if (rank == 0) {
        printf("dims[0]=%d dims[1]=%d\n", dims[0], dims[1]);
    }

    MPI_Reduce(&rank, &row, 1, MPI_INT, MPI_SUM, 0, pgrid->row_comm);
    MPI_Reduce(&rank, &col, 1, MPI_INT, MPI_SUM, 0, pgrid->col_comm);

    la_grid_2d_delete(pgrid);
    MPI_Finalize();

    if (rank == 0) {
        printf("row=%d col=%d\n", row, col);
    }
    return 0;
}
The error messages are:
shuang@phoebe:~/usingMPI$ mpiexec -n 20 ./grid
size=20 rank=0
dims[0]=5 dims[1]=4
row=6 col=40
shuang@phoebe:~/usingMPI$ mpiexec -n 30 ./grid
size=30 rank=0
dims[0]=6 dims[1]=5
[phoebe:14939] *** Process received signal ***
[phoebe:14939] Signal: Floating point exception (8)
[phoebe:14939] Signal code: Integer divide-by-zero (1)
[phoebe:14939] Failing at address: 0x7fb1e599e6f7
[phoebe:14939] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0) [0x7fb1e5714cb0]
[phoebe:14939] [ 1] /usr/lib/libmpi.so.0(mca_topo_base_cart_coords+0x57) [0x7fb1e599e6f7]
[phoebe:14939] [ 2] /usr/lib/libmpi.so.0(mca_topo_base_cart_sub+0x166) [0x7fb1e599ec36]
[phoebe:14939] [ 3] /usr/lib/libmpi.so.0(PMPI_Cart_sub+0xba) [0x7fb1e596f34a]
[phoebe:14939] [ 4] ./grid(la_grid_2d_new_II+0xd6) [0x400df6]
[phoebe:14939] [ 5] ./grid(main+0x98) [0x400f07]
[phoebe:14939] [ 6] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed) [0x7fb1e536776d]
[phoebe:14939] [ 7] ./grid() [0x400b99]
[phoebe:14939] *** End of error message ***
--------------------------------------------------------------------------
mpiexec noticed that process rank 22 with PID 14939 on node phoebe exited on signal 8 (Floating point exception).
--------------------------------------------------------------------------
回答1:
@Sean If you want to try another OpenMPI you can normally download it and compile with something like
./configure --prefix=/opt/ompi-[version]
make
sudo make install
Since this will install to a non-standard location (for easy removal later), you will need to set LD_LIBRARY_PATH=/opt/ompi-[version]/lib and specify the full path to mpicc and mpirun to ensure you call the right version. Somewhere in the build process it will remind you about setting LD_LIBRARY_PATH.
来源:https://stackoverflow.com/questions/25217233/does-the-number-of-processes-in-mpi-have-a-limit