furnace/extern/fftw/mpi/choose-radix.c

84 lines
3.4 KiB
C
Raw Normal View History

/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
#include "ifftw-mpi.h"
/* Return the radix r for a 1d MPI transform of a distributed dimension d,
with the given flags and transform size. That is, decomposes d.n
as r * m, Cooley-Tukey style. Also computes the block sizes rblock
and mblock. Returns 0 if such a decomposition is not feasible.
This is unfortunately somewhat complicated.
A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c):
d.n is initially distributed as an m x r array with block size mblock[IB].
Then it is internally transposed to an r x m array with block size
rblock[IB]. Then it is internally transposed to m x r again with block
size mblock[OB]. Finally, it is transposed to r x m with block size
rblock[IB].
If flags & SCRAMBLED_IN, then the first transpose is skipped (the array
starts out as r x m). If flags & SCRAMBLED_OUT, then the last transpose
is skipped (the array ends up as m x r). To make sure the forward
and backward transforms use the same "scrambling" format, we swap r
and m when sign != FFT_SIGN.
There are some downsides to this, especially in the case where
either m or r is not divisible by n_pes. For one thing, it means
that in general we can't use the same block size for the input and
output. For another thing, it means that we can't in general honor
a user's "requested" block sizes in d.b[]. Therefore, for simplicity,
we simply ignore d.b[] for now.
*/
INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign,
INT rblock[2], INT mblock[2])
{
INT r, m;
UNUSED(flags); /* we would need this if we paid attention to d.b[*] */
/* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
This not only ensures that the input (the m dimension) is
equally distributed if possible, and at the r dimension is
maximally equally distributed (if d.n/n_pes >= n_pes), it also
makes one of the local transpositions in the algorithm
trivial. */
if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/
&& d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */)
r = d.n / n_pes;
else { /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */
for (r = X(isqrt)(d.n); d.n % r != 0; ++r)
;
}
if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */
if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */
m = r;
r = d.n / m;
}
else
m = d.n / r;
rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes);
mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes);
return r;
}