mirror of
https://github.com/tildearrow/furnace.git
synced 2024-11-12 15:55:06 +00:00
54e93db207
not reliable yet
1143 lines
34 KiB
C
1143 lines
34 KiB
C
/*
|
|
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
|
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*
|
|
*/
|
|
|
|
|
|
/* FFTW internal header file */
|
|
#ifndef __IFFTW_H__
|
|
#define __IFFTW_H__
|
|
|
|
#include "config.h"
|
|
|
|
#include <stdlib.h> /* size_t */
|
|
#include <stdarg.h> /* va_list */
|
|
#include <stddef.h> /* ptrdiff_t */
|
|
#include <limits.h> /* INT_MAX */
|
|
|
|
#if HAVE_SYS_TYPES_H
|
|
# include <sys/types.h>
|
|
#endif
|
|
|
|
#if HAVE_STDINT_H
|
|
# include <stdint.h> /* uintptr_t, maybe */
|
|
#endif
|
|
|
|
#if HAVE_INTTYPES_H
|
|
# include <inttypes.h> /* uintptr_t, maybe */
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
extern "C"
|
|
{
|
|
#endif /* __cplusplus */
|
|
|
|
/* Windows annoyances -- since tests/hook.c uses some internal
|
|
FFTW functions, we need to given them the dllexport attribute
|
|
under Windows when compiling as a DLL (see api/fftw3.h). */
|
|
#if defined(FFTW_EXTERN)
|
|
# define IFFTW_EXTERN FFTW_EXTERN
|
|
#elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
|
|
&& (defined(_WIN32) || defined(__WIN32__))
|
|
# define IFFTW_EXTERN extern __declspec(dllexport)
|
|
#else
|
|
# define IFFTW_EXTERN extern
|
|
#endif
|
|
|
|
/* determine precision and name-mangling scheme */
|
|
#define CONCAT(prefix, name) prefix ## name
|
|
#if defined(FFTW_SINGLE)
|
|
typedef float R;
|
|
# define X(name) CONCAT(fftwf_, name)
|
|
#elif defined(FFTW_LDOUBLE)
|
|
typedef long double R;
|
|
# define X(name) CONCAT(fftwl_, name)
|
|
# define TRIGREAL_IS_LONG_DOUBLE
|
|
#elif defined(FFTW_QUAD)
|
|
typedef __float128 R;
|
|
# define X(name) CONCAT(fftwq_, name)
|
|
# define TRIGREAL_IS_QUAD
|
|
#else
|
|
typedef double R;
|
|
# define X(name) CONCAT(fftw_, name)
|
|
#endif
|
|
|
|
/*
|
|
integral type large enough to contain a stride (what ``int'' should
|
|
have been in the first place.
|
|
*/
|
|
typedef ptrdiff_t INT;
|
|
|
|
/* dummy use of unused parameters to silence compiler warnings */
|
|
#define UNUSED(x) (void)x
|
|
|
|
#define NELEM(array) ((sizeof(array) / sizeof((array)[0])))
|
|
|
|
#define FFT_SIGN (-1) /* sign convention for forward transforms */
|
|
extern void X(extract_reim)(int sign, R *c, R **r, R **i);
|
|
|
|
#define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
|
|
|
|
#define STRINGIZEx(x) #x
|
|
#define STRINGIZE(x) STRINGIZEx(x)
|
|
#define CIMPLIES(ante, post) (!(ante) || (post))
|
|
|
|
/* define HAVE_SIMD if any simd extensions are supported */
|
|
#if defined(HAVE_SSE) || defined(HAVE_SSE2) || \
|
|
defined(HAVE_AVX) || defined(HAVE_AVX_128_FMA) || \
|
|
defined(HAVE_AVX2) || defined(HAVE_AVX512) || \
|
|
defined(HAVE_KCVI) || \
|
|
defined(HAVE_ALTIVEC) || defined(HAVE_VSX) || \
|
|
defined(HAVE_MIPS_PS) || \
|
|
defined(HAVE_GENERIC_SIMD128) || defined(HAVE_GENERIC_SIMD256)
|
|
#define HAVE_SIMD 1
|
|
#else
|
|
#define HAVE_SIMD 0
|
|
#endif
|
|
|
|
extern int X(have_simd_sse2)(void);
|
|
extern int X(have_simd_avx)(void);
|
|
extern int X(have_simd_avx_128_fma)(void);
|
|
extern int X(have_simd_avx2)(void);
|
|
extern int X(have_simd_avx2_128)(void);
|
|
extern int X(have_simd_avx512)(void);
|
|
extern int X(have_simd_altivec)(void);
|
|
extern int X(have_simd_vsx)(void);
|
|
extern int X(have_simd_neon)(void);
|
|
|
|
/* forward declarations */
|
|
typedef struct problem_s problem;
|
|
typedef struct plan_s plan;
|
|
typedef struct solver_s solver;
|
|
typedef struct planner_s planner;
|
|
typedef struct printer_s printer;
|
|
typedef struct scanner_s scanner;
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* alloca: */
|
|
#if HAVE_SIMD
|
|
# if defined(HAVE_KCVI) || defined(HAVE_AVX512)
|
|
# define MIN_ALIGNMENT 64
|
|
# elif defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_GENERIC_SIMD256)
|
|
# define MIN_ALIGNMENT 32 /* best alignment for AVX, conservative for
|
|
* everything else */
|
|
# else
|
|
/* Note that we cannot use 32-byte alignment for all SIMD. For
|
|
example, MacOS X malloc is 16-byte aligned, but there was no
|
|
posix_memalign in MacOS X until version 10.6. */
|
|
# define MIN_ALIGNMENT 16
|
|
# endif
|
|
#endif
|
|
|
|
#if defined(HAVE_ALLOCA) && defined(FFTW_ENABLE_ALLOCA)
|
|
/* use alloca if available */
|
|
|
|
#ifndef alloca
|
|
#ifdef __GNUC__
|
|
# define alloca __builtin_alloca
|
|
#else
|
|
# ifdef _MSC_VER
|
|
# include <malloc.h>
|
|
# define alloca _alloca
|
|
# else
|
|
# if HAVE_ALLOCA_H
|
|
# include <alloca.h>
|
|
# else
|
|
# ifdef _AIX
|
|
#pragma alloca
|
|
# else
|
|
# ifndef alloca /* predefined by HP cc +Olibcalls */
|
|
void *alloca(size_t);
|
|
# endif
|
|
# endif
|
|
# endif
|
|
# endif
|
|
#endif
|
|
#endif
|
|
|
|
# ifdef MIN_ALIGNMENT
|
|
# define STACK_MALLOC(T, p, n) \
|
|
{ \
|
|
p = (T)alloca((n) + MIN_ALIGNMENT); \
|
|
p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
|
|
(~(uintptr_t)(MIN_ALIGNMENT - 1))); \
|
|
}
|
|
# define STACK_FREE(n)
|
|
# else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
|
|
# define STACK_MALLOC(T, p, n) p = (T)alloca(n)
|
|
# define STACK_FREE(n)
|
|
# endif
|
|
|
|
#else /* ! HAVE_ALLOCA */
|
|
/* use malloc instead of alloca */
|
|
# define STACK_MALLOC(T, p, n) p = (T)MALLOC(n, OTHER)
|
|
# define STACK_FREE(n) X(ifree)(n)
|
|
#endif /* ! HAVE_ALLOCA */
|
|
|
|
/* allocation of buffers. If these grow too large use malloc(), else
|
|
use STACK_MALLOC (hopefully reducing to alloca()). */
|
|
|
|
/* 64KiB ought to be enough for anybody */
|
|
#define MAX_STACK_ALLOC ((size_t)64 * 1024)
|
|
|
|
#define BUF_ALLOC(T, p, n) \
|
|
{ \
|
|
if (n < MAX_STACK_ALLOC) { \
|
|
STACK_MALLOC(T, p, n); \
|
|
} else { \
|
|
p = (T)MALLOC(n, BUFFERS); \
|
|
} \
|
|
}
|
|
|
|
#define BUF_FREE(p, n) \
|
|
{ \
|
|
if (n < MAX_STACK_ALLOC) { \
|
|
STACK_FREE(p); \
|
|
} else { \
|
|
X(ifree)(p); \
|
|
} \
|
|
}
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* define uintptr_t if it is not already defined */
|
|
|
|
#ifndef HAVE_UINTPTR_T
|
|
# if SIZEOF_VOID_P == 0
|
|
# error sizeof void* is unknown!
|
|
# elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
|
|
typedef unsigned int uintptr_t;
|
|
# elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
|
|
typedef unsigned long uintptr_t;
|
|
# elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
|
|
typedef unsigned long long uintptr_t;
|
|
# else
|
|
# error no unsigned integer type matches void* sizeof!
|
|
# endif
|
|
#endif
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* We can do an optimization for copying pairs of (aligned) floats
|
|
when in single precision if 2*float = double. */
|
|
|
|
#define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
|
|
&& SIZEOF_FLOAT != 0 \
|
|
&& SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
|
|
|
|
#define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* assert.c: */
|
|
IFFTW_EXTERN void X(assertion_failed)(const char *s,
|
|
int line, const char *file);
|
|
|
|
/* always check */
|
|
#define CK(ex) \
|
|
(void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
|
|
|
|
#ifdef FFTW_DEBUG
|
|
/* check only if debug enabled */
|
|
#define A(ex) \
|
|
(void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
|
|
#else
|
|
#define A(ex) /* nothing */
|
|
#endif
|
|
|
|
extern void X(debug)(const char *format, ...);
|
|
#define D X(debug)
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* kalloc.c: */
|
|
extern void *X(kernel_malloc)(size_t n);
|
|
extern void X(kernel_free)(void *p);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* alloc.c: */
|
|
|
|
/* objects allocated by malloc, for statistical purposes */
|
|
enum malloc_tag {
|
|
EVERYTHING,
|
|
PLANS,
|
|
SOLVERS,
|
|
PROBLEMS,
|
|
BUFFERS,
|
|
HASHT,
|
|
TENSORS,
|
|
PLANNERS,
|
|
SLVDESCS,
|
|
TWIDDLES,
|
|
STRIDES,
|
|
OTHER,
|
|
MALLOC_WHAT_LAST /* must be last */
|
|
};
|
|
|
|
IFFTW_EXTERN void X(ifree)(void *ptr);
|
|
extern void X(ifree0)(void *ptr);
|
|
|
|
IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
|
|
#define MALLOC(n, what) X(malloc_plain)(n)
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* low-resolution clock */
|
|
|
|
#ifdef FAKE_CRUDE_TIME
|
|
typedef int crude_time;
|
|
#else
|
|
# if TIME_WITH_SYS_TIME
|
|
# include <sys/time.h>
|
|
# include <time.h>
|
|
# else
|
|
# if HAVE_SYS_TIME_H
|
|
# include <sys/time.h>
|
|
# else
|
|
# include <time.h>
|
|
# endif
|
|
# endif
|
|
|
|
# ifdef HAVE_BSDGETTIMEOFDAY
|
|
# ifndef HAVE_GETTIMEOFDAY
|
|
# define gettimeofday BSDgettimeofday
|
|
# define HAVE_GETTIMEOFDAY 1
|
|
# endif
|
|
# endif
|
|
|
|
# if defined(HAVE_GETTIMEOFDAY)
|
|
typedef struct timeval crude_time;
|
|
# else
|
|
typedef clock_t crude_time;
|
|
# endif
|
|
#endif /* else FAKE_CRUDE_TIME */
|
|
|
|
crude_time X(get_crude_time)(void);
|
|
double X(elapsed_since)(const planner *plnr, const problem *p,
|
|
crude_time t0); /* time in seconds since t0 */
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* ops.c: */
|
|
/*
|
|
* ops counter. The total number of additions is add + fma
|
|
* and the total number of multiplications is mul + fma.
|
|
* Total flops = add + mul + 2 * fma
|
|
*/
|
|
typedef struct {
|
|
double add;
|
|
double mul;
|
|
double fma;
|
|
double other;
|
|
} opcnt;
|
|
|
|
void X(ops_zero)(opcnt *dst);
|
|
void X(ops_other)(INT o, opcnt *dst);
|
|
void X(ops_cpy)(const opcnt *src, opcnt *dst);
|
|
|
|
void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
|
|
void X(ops_add2)(const opcnt *a, opcnt *dst);
|
|
|
|
/* dst = m * a + b */
|
|
void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
|
|
|
|
/* dst += m * a */
|
|
void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
|
|
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* minmax.c: */
|
|
INT X(imax)(INT a, INT b);
|
|
INT X(imin)(INT a, INT b);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* iabs.c: */
|
|
INT X(iabs)(INT a);
|
|
|
|
/* inline version */
|
|
#define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* md5.c */
|
|
|
|
#if SIZEOF_UNSIGNED_INT >= 4
|
|
typedef unsigned int md5uint;
|
|
#else
|
|
typedef unsigned long md5uint; /* at least 32 bits as per C standard */
|
|
#endif
|
|
|
|
typedef md5uint md5sig[4];
|
|
|
|
typedef struct {
|
|
md5sig s; /* state and signature */
|
|
|
|
/* fields not meant to be used outside md5.c: */
|
|
unsigned char c[64]; /* stuff not yet processed */
|
|
unsigned l; /* total length. Should be 64 bits long, but this is
|
|
good enough for us */
|
|
} md5;
|
|
|
|
void X(md5begin)(md5 *p);
|
|
void X(md5putb)(md5 *p, const void *d_, size_t len);
|
|
void X(md5puts)(md5 *p, const char *s);
|
|
void X(md5putc)(md5 *p, unsigned char c);
|
|
void X(md5int)(md5 *p, int i);
|
|
void X(md5INT)(md5 *p, INT i);
|
|
void X(md5unsigned)(md5 *p, unsigned i);
|
|
void X(md5end)(md5 *p);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* tensor.c: */
|
|
#define STRUCT_HACK_KR
|
|
#undef STRUCT_HACK_C99
|
|
|
|
typedef struct {
|
|
INT n;
|
|
INT is; /* input stride */
|
|
INT os; /* output stride */
|
|
} iodim;
|
|
|
|
typedef struct {
|
|
int rnk;
|
|
#if defined(STRUCT_HACK_KR)
|
|
iodim dims[1];
|
|
#elif defined(STRUCT_HACK_C99)
|
|
iodim dims[];
|
|
#else
|
|
iodim *dims;
|
|
#endif
|
|
} tensor;
|
|
|
|
/*
|
|
Definition of rank -infinity.
|
|
This definition has the property that if you want rank 0 or 1,
|
|
you can simply test for rank <= 1. This is a common case.
|
|
|
|
A tensor of rank -infinity has size 0.
|
|
*/
|
|
#define RNK_MINFTY INT_MAX
|
|
#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
|
|
|
|
typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
|
|
|
|
tensor *X(mktensor)(int rnk);
|
|
tensor *X(mktensor_0d)(void);
|
|
tensor *X(mktensor_1d)(INT n, INT is, INT os);
|
|
tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1);
|
|
tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT n2, INT is2, INT os2);
|
|
tensor *X(mktensor_4d)(INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT n2, INT is2, INT os2,
|
|
INT n3, INT is3, INT os3);
|
|
tensor *X(mktensor_5d)(INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT n2, INT is2, INT os2,
|
|
INT n3, INT is3, INT os3,
|
|
INT n4, INT is4, INT os4);
|
|
INT X(tensor_sz)(const tensor *sz);
|
|
void X(tensor_md5)(md5 *p, const tensor *t);
|
|
INT X(tensor_max_index)(const tensor *sz);
|
|
INT X(tensor_min_istride)(const tensor *sz);
|
|
INT X(tensor_min_ostride)(const tensor *sz);
|
|
INT X(tensor_min_stride)(const tensor *sz);
|
|
int X(tensor_inplace_strides)(const tensor *sz);
|
|
int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
|
|
int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
|
|
inplace_kind k);
|
|
tensor *X(tensor_copy)(const tensor *sz);
|
|
int X(tensor_kosherp)(const tensor *x);
|
|
|
|
tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
|
|
tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
|
|
tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
|
|
tensor *X(tensor_compress)(const tensor *sz);
|
|
tensor *X(tensor_compress_contiguous)(const tensor *sz);
|
|
tensor *X(tensor_append)(const tensor *a, const tensor *b);
|
|
void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
|
|
int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
|
|
void X(tensor_destroy)(tensor *sz);
|
|
void X(tensor_destroy2)(tensor *a, tensor *b);
|
|
void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
|
|
void X(tensor_print)(const tensor *sz, printer *p);
|
|
int X(dimcmp)(const iodim *a, const iodim *b);
|
|
int X(tensor_equal)(const tensor *a, const tensor *b);
|
|
int X(tensor_inplace_locations)(const tensor *sz, const tensor *vecsz);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* problem.c: */
|
|
enum {
|
|
/* a problem that cannot be solved */
|
|
PROBLEM_UNSOLVABLE,
|
|
|
|
PROBLEM_DFT,
|
|
PROBLEM_RDFT,
|
|
PROBLEM_RDFT2,
|
|
|
|
/* for mpi/ subdirectory */
|
|
PROBLEM_MPI_DFT,
|
|
PROBLEM_MPI_RDFT,
|
|
PROBLEM_MPI_RDFT2,
|
|
PROBLEM_MPI_TRANSPOSE,
|
|
|
|
PROBLEM_LAST
|
|
};
|
|
|
|
typedef struct {
|
|
int problem_kind;
|
|
void (*hash) (const problem *ego, md5 *p);
|
|
void (*zero) (const problem *ego);
|
|
void (*print) (const problem *ego, printer *p);
|
|
void (*destroy) (problem *ego);
|
|
} problem_adt;
|
|
|
|
struct problem_s {
|
|
const problem_adt *adt;
|
|
};
|
|
|
|
problem *X(mkproblem)(size_t sz, const problem_adt *adt);
|
|
void X(problem_destroy)(problem *ego);
|
|
problem *X(mkproblem_unsolvable)(void);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* print.c */
|
|
struct printer_s {
|
|
void (*print)(printer *p, const char *format, ...);
|
|
void (*vprint)(printer *p, const char *format, va_list ap);
|
|
void (*putchr)(printer *p, char c);
|
|
void (*cleanup)(printer *p);
|
|
int indent;
|
|
int indent_incr;
|
|
};
|
|
|
|
printer *X(mkprinter)(size_t size,
|
|
void (*putchr)(printer *p, char c),
|
|
void (*cleanup)(printer *p));
|
|
IFFTW_EXTERN void X(printer_destroy)(printer *p);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* scan.c */
|
|
struct scanner_s {
|
|
int (*scan)(scanner *sc, const char *format, ...);
|
|
int (*vscan)(scanner *sc, const char *format, va_list ap);
|
|
int (*getchr)(scanner *sc);
|
|
int ungotc;
|
|
};
|
|
|
|
scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
|
|
void X(scanner_destroy)(scanner *sc);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* plan.c: */
|
|
|
|
enum wakefulness {
|
|
SLEEPY,
|
|
AWAKE_ZERO,
|
|
AWAKE_SQRTN_TABLE,
|
|
AWAKE_SINCOS
|
|
};
|
|
|
|
typedef struct {
|
|
void (*solve)(const plan *ego, const problem *p);
|
|
void (*awake)(plan *ego, enum wakefulness wakefulness);
|
|
void (*print)(const plan *ego, printer *p);
|
|
void (*destroy)(plan *ego);
|
|
} plan_adt;
|
|
|
|
struct plan_s {
|
|
const plan_adt *adt;
|
|
opcnt ops;
|
|
double pcost;
|
|
enum wakefulness wakefulness; /* used for debugging only */
|
|
int could_prune_now_p;
|
|
};
|
|
|
|
plan *X(mkplan)(size_t size, const plan_adt *adt);
|
|
void X(plan_destroy_internal)(plan *ego);
|
|
IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
|
|
void X(plan_null_destroy)(plan *ego);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* solver.c: */
|
|
typedef struct {
|
|
int problem_kind;
|
|
plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
|
|
void (*destroy)(solver *ego);
|
|
} solver_adt;
|
|
|
|
struct solver_s {
|
|
const solver_adt *adt;
|
|
int refcnt;
|
|
};
|
|
|
|
solver *X(mksolver)(size_t size, const solver_adt *adt);
|
|
void X(solver_use)(solver *ego);
|
|
void X(solver_destroy)(solver *ego);
|
|
void X(solver_register)(planner *plnr, solver *s);
|
|
|
|
/* shorthand */
|
|
#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* planner.c */
|
|
|
|
typedef struct slvdesc_s {
|
|
solver *slv;
|
|
const char *reg_nam;
|
|
unsigned nam_hash;
|
|
int reg_id;
|
|
int next_for_same_problem_kind;
|
|
} slvdesc;
|
|
|
|
typedef struct solution_s solution; /* opaque */
|
|
|
|
/* interpretation of L and U:
|
|
|
|
- if it returns a plan, the planner guarantees that all applicable
|
|
plans at least as impatient as U have been tried, and that each
|
|
plan in the solution is at least as impatient as L.
|
|
|
|
- if it returns 0, the planner guarantees to have tried all solvers
|
|
at least as impatient as L, and that none of them was applicable.
|
|
|
|
The structure is packed to fit into 64 bits.
|
|
*/
|
|
|
|
typedef struct {
|
|
unsigned l:20;
|
|
unsigned hash_info:3;
|
|
# define BITS_FOR_TIMELIMIT 9
|
|
unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
|
|
unsigned u:20;
|
|
|
|
/* abstraction break: we store the solver here to pad the
|
|
structure to 64 bits. Otherwise, the struct is padded to 64
|
|
bits anyway, and another word is allocated for slvndx. */
|
|
# define BITS_FOR_SLVNDX 12
|
|
unsigned slvndx:BITS_FOR_SLVNDX;
|
|
} flags_t;
|
|
|
|
/* impatience flags */
|
|
enum {
|
|
BELIEVE_PCOST = 0x0001,
|
|
ESTIMATE = 0x0002,
|
|
NO_DFT_R2HC = 0x0004,
|
|
NO_SLOW = 0x0008,
|
|
NO_VRECURSE = 0x0010,
|
|
NO_INDIRECT_OP = 0x0020,
|
|
NO_LARGE_GENERIC = 0x0040,
|
|
NO_RANK_SPLITS = 0x0080,
|
|
NO_VRANK_SPLITS = 0x0100,
|
|
NO_NONTHREADED = 0x0200,
|
|
NO_BUFFERING = 0x0400,
|
|
NO_FIXED_RADIX_LARGE_N = 0x0800,
|
|
NO_DESTROY_INPUT = 0x1000,
|
|
NO_SIMD = 0x2000,
|
|
CONSERVE_MEMORY = 0x4000,
|
|
NO_DHT_R2HC = 0x8000,
|
|
NO_UGLY = 0x10000,
|
|
ALLOW_PRUNING = 0x20000
|
|
};
|
|
|
|
/* hashtable information */
|
|
enum {
|
|
BLESSING = 0x1u, /* save this entry */
|
|
H_VALID = 0x2u, /* valid hastable entry */
|
|
H_LIVE = 0x4u /* entry is nonempty, implies H_VALID */
|
|
};
|
|
|
|
#define PLNR_L(plnr) ((plnr)->flags.l)
|
|
#define PLNR_U(plnr) ((plnr)->flags.u)
|
|
#define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
|
|
|
|
#define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
|
|
#define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
|
|
#define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
|
|
|
|
#define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
|
|
#define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
|
|
#define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
|
|
#define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
|
|
#define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
|
|
#define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
|
|
#define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
|
|
#define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
|
|
#define NO_FIXED_RADIX_LARGE_NP(plnr) \
|
|
(PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
|
|
#define NO_NONTHREADEDP(plnr) \
|
|
((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
|
|
|
|
#define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
|
|
#define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
|
|
#define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
|
|
#define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
|
|
#define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
|
|
|
|
typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
|
|
|
|
typedef enum {
|
|
/* WISDOM_NORMAL: planner may or may not use wisdom */
|
|
WISDOM_NORMAL,
|
|
|
|
/* WISDOM_ONLY: planner must use wisdom and must avoid searching */
|
|
WISDOM_ONLY,
|
|
|
|
/* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
|
|
WISDOM_IS_BOGUS,
|
|
|
|
/* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
|
|
WISDOM_IGNORE_INFEASIBLE,
|
|
|
|
/* WISDOM_IGNORE_ALL: planner ignores all */
|
|
WISDOM_IGNORE_ALL
|
|
} wisdom_state_t;
|
|
|
|
typedef struct {
|
|
void (*register_solver)(planner *ego, solver *s);
|
|
plan *(*mkplan)(planner *ego, const problem *p);
|
|
void (*forget)(planner *ego, amnesia a);
|
|
void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
|
|
word in C++. */
|
|
int (*imprt)(planner *ego, scanner *sc);
|
|
} planner_adt;
|
|
|
|
/* hash table of solutions */
|
|
typedef struct {
|
|
solution *solutions;
|
|
unsigned hashsiz, nelem;
|
|
|
|
/* statistics */
|
|
int lookup, succ_lookup, lookup_iter;
|
|
int insert, insert_iter, insert_unknown;
|
|
int nrehash;
|
|
} hashtab;
|
|
|
|
typedef enum { COST_SUM, COST_MAX } cost_kind;
|
|
|
|
struct planner_s {
|
|
const planner_adt *adt;
|
|
void (*hook)(struct planner_s *plnr, plan *pln,
|
|
const problem *p, int optimalp);
|
|
double (*cost_hook)(const problem *p, double t, cost_kind k);
|
|
int (*wisdom_ok_hook)(const problem *p, flags_t flags);
|
|
void (*nowisdom_hook)(const problem *p);
|
|
wisdom_state_t (*bogosity_hook)(wisdom_state_t state, const problem *p);
|
|
|
|
/* solver descriptors */
|
|
slvdesc *slvdescs;
|
|
unsigned nslvdesc, slvdescsiz;
|
|
const char *cur_reg_nam;
|
|
int cur_reg_id;
|
|
int slvdescs_for_problem_kind[PROBLEM_LAST];
|
|
|
|
wisdom_state_t wisdom_state;
|
|
|
|
hashtab htab_blessed;
|
|
hashtab htab_unblessed;
|
|
|
|
int nthr;
|
|
flags_t flags;
|
|
|
|
crude_time start_time;
|
|
double timelimit; /* elapsed_since(start_time) at which to bail out */
|
|
int timed_out; /* whether most recent search timed out */
|
|
int need_timeout_check;
|
|
|
|
/* various statistics */
|
|
int nplan; /* number of plans evaluated */
|
|
double pcost, epcost; /* total pcost of measured/estimated plans */
|
|
int nprob; /* number of problems evaluated */
|
|
};
|
|
|
|
planner *X(mkplanner)(void);
|
|
void X(planner_destroy)(planner *ego);
|
|
|
|
/*
|
|
Iterate over all solvers. Read:
|
|
|
|
@article{ baker93iterators,
|
|
author = "Henry G. Baker, Jr.",
|
|
title = "Iterators: Signs of Weakness in Object-Oriented Languages",
|
|
journal = "{ACM} {OOPS} Messenger",
|
|
volume = "4",
|
|
number = "3",
|
|
pages = "18--25"
|
|
}
|
|
*/
|
|
#define FORALL_SOLVERS(ego, s, p, what) \
|
|
{ \
|
|
unsigned _cnt; \
|
|
for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
|
|
slvdesc *p = ego->slvdescs + _cnt; \
|
|
solver *s = p->slv; \
|
|
what; \
|
|
} \
|
|
}
|
|
|
|
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
|
|
{ \
|
|
int _cnt = ego->slvdescs_for_problem_kind[kind]; \
|
|
while (_cnt >= 0) { \
|
|
slvdesc *p = ego->slvdescs + _cnt; \
|
|
solver *s = p->slv; \
|
|
what; \
|
|
_cnt = p->next_for_same_problem_kind; \
|
|
} \
|
|
}
|
|
|
|
|
|
/* make plan, destroy problem */
|
|
plan *X(mkplan_d)(planner *ego, problem *p);
|
|
plan *X(mkplan_f_d)(planner *ego, problem *p,
|
|
unsigned l_set, unsigned u_set, unsigned u_reset);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* stride.c: */
|
|
|
|
/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
|
|
#if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !defined(FFTW_LDOUBLE)
|
|
#define PRECOMPUTE_ARRAY_INDICES
|
|
#endif
|
|
|
|
extern const INT X(an_INT_guaranteed_to_be_zero);
|
|
|
|
#ifdef PRECOMPUTE_ARRAY_INDICES
|
|
typedef INT *stride;
|
|
#define WS(stride, i) (stride[i])
|
|
extern stride X(mkstride)(INT n, INT s);
|
|
void X(stride_destroy)(stride p);
|
|
/* hackery to prevent the compiler from copying the strides array
|
|
onto the stack */
|
|
#define MAKE_VOLATILE_STRIDE(nptr, x) (x) = (x) + X(an_INT_guaranteed_to_be_zero)
|
|
#else
|
|
|
|
typedef INT stride;
|
|
#define WS(stride, i) (stride * i)
|
|
#define fftwf_mkstride(n, stride) stride
|
|
#define fftw_mkstride(n, stride) stride
|
|
#define fftwl_mkstride(n, stride) stride
|
|
#define fftwf_stride_destroy(p) ((void) p)
|
|
#define fftw_stride_destroy(p) ((void) p)
|
|
#define fftwl_stride_destroy(p) ((void) p)
|
|
|
|
/* hackery to prevent the compiler from ``optimizing'' induction
|
|
variables in codelet loops. The problem is that for each K and for
|
|
each expression of the form P[I + STRIDE * K] in a loop, most
|
|
compilers will try to lift an induction variable PK := &P[I + STRIDE * K].
|
|
For large values of K this behavior overflows the
|
|
register set, which is likely worse than doing the index computation
|
|
in the first place.
|
|
|
|
If we guess that there are more than
|
|
ESTIMATED_AVAILABLE_INDEX_REGISTERS such pointers, we deliberately confuse
|
|
the compiler by setting STRIDE ^= ZERO, where ZERO is a value guaranteed to
|
|
be 0, but the compiler does not know this.
|
|
|
|
16 registers ought to be enough for anybody, or so the amd64 and ARM ISA's
|
|
seem to imply.
|
|
*/
|
|
#define ESTIMATED_AVAILABLE_INDEX_REGISTERS 16
|
|
#define MAKE_VOLATILE_STRIDE(nptr, x) \
|
|
(nptr <= ESTIMATED_AVAILABLE_INDEX_REGISTERS ? \
|
|
0 : \
|
|
((x) = (x) ^ X(an_INT_guaranteed_to_be_zero)))
|
|
#endif /* PRECOMPUTE_ARRAY_INDICES */
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* solvtab.c */
|
|
|
|
struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
|
|
typedef struct solvtab_s solvtab[];
|
|
void X(solvtab_exec)(const solvtab tbl, planner *p);
|
|
#define SOLVTAB(s) { s, STRINGIZE(s) }
|
|
#define SOLVTAB_END { 0, 0 }
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* pickdim.c */
|
|
int X(pickdim)(int which_dim, const int *buddies, size_t nbuddies,
|
|
const tensor *sz, int oop, int *dp);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* twiddle.c */
|
|
/* little language to express twiddle factors computation */
|
|
enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
|
|
TW_FULL = 4, TW_HALF = 5 };
|
|
|
|
typedef struct {
|
|
unsigned char op;
|
|
signed char v;
|
|
short i;
|
|
} tw_instr;
|
|
|
|
typedef struct twid_s {
|
|
R *W; /* array of twiddle factors */
|
|
INT n, r, m; /* transform order, radix, # twiddle rows */
|
|
int refcnt;
|
|
const tw_instr *instr;
|
|
struct twid_s *cdr;
|
|
enum wakefulness wakefulness;
|
|
} twid;
|
|
|
|
INT X(twiddle_length)(INT r, const tw_instr *p);
|
|
void X(twiddle_awake)(enum wakefulness wakefulness,
|
|
twid **pp, const tw_instr *instr, INT n, INT r, INT m);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* trig.c */
|
|
#if defined(TRIGREAL_IS_LONG_DOUBLE)
|
|
typedef long double trigreal;
|
|
#elif defined(TRIGREAL_IS_QUAD)
|
|
typedef __float128 trigreal;
|
|
#else
|
|
typedef double trigreal;
|
|
#endif
|
|
|
|
typedef struct triggen_s triggen;
|
|
|
|
struct triggen_s {
|
|
void (*cexp)(triggen *t, INT m, R *result);
|
|
void (*cexpl)(triggen *t, INT m, trigreal *result);
|
|
void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
|
|
|
|
INT twshft;
|
|
INT twradix;
|
|
INT twmsk;
|
|
trigreal *W0, *W1;
|
|
INT n;
|
|
};
|
|
|
|
triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
|
|
void X(triggen_destroy)(triggen *p);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* primes.c: */
|
|
|
|
#define MULMOD(x, y, p) \
|
|
(((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
|
|
|
|
INT X(safe_mulmod)(INT x, INT y, INT p);
|
|
INT X(power_mod)(INT n, INT m, INT p);
|
|
INT X(find_generator)(INT p);
|
|
INT X(first_divisor)(INT n);
|
|
int X(is_prime)(INT n);
|
|
INT X(next_prime)(INT n);
|
|
int X(factors_into)(INT n, const INT *primes);
|
|
int X(factors_into_small_primes)(INT n);
|
|
INT X(choose_radix)(INT r, INT n);
|
|
INT X(isqrt)(INT n);
|
|
INT X(modulo)(INT a, INT n);
|
|
|
|
#define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
|
|
|
|
/* thresholds below which certain solvers are considered SLOW. These are guesses
|
|
believed to be conservative */
|
|
#define GENERIC_MAX_SLOW 16
|
|
#define RADER_MAX_SLOW 32
|
|
#define BLUESTEIN_MAX_SLOW 24
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* rader.c: */
|
|
typedef struct rader_tls rader_tl;
|
|
|
|
void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
|
|
R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
|
|
void X(rader_tl_delete)(R *W, rader_tl **tl);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* copy/transposition routines */
|
|
|
|
/* lower bound to the cache size, for tiled routines */
|
|
#define CACHESIZE 8192
|
|
|
|
INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
|
|
|
|
void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
|
|
void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
|
|
void *args);
|
|
void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
|
|
void X(zero1d_pair)(R *O0, R *O1, INT n0, INT os0);
|
|
void X(cpy2d)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
void X(cpy2d_ci)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
void X(cpy2d_co)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
void X(cpy2d_tiled)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
void X(cpy2d_tiledbuf)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1);
|
|
void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1);
|
|
void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1);
|
|
|
|
void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
|
|
void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
|
|
void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
|
|
|
|
typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
|
|
typedef void (*cpy2d_func)(R *I, R *O,
|
|
INT n0, INT is0, INT os0,
|
|
INT n1, INT is1, INT os1,
|
|
INT vl);
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* misc stuff */
|
|
void X(null_awake)(plan *ego, enum wakefulness wakefulness);
|
|
double X(iestimate_cost)(const planner *, const plan *, const problem *);
|
|
|
|
#ifdef FFTW_RANDOM_ESTIMATOR
|
|
extern unsigned X(random_estimate_seed);
|
|
#endif
|
|
|
|
double X(measure_execution_time)(const planner *plnr,
|
|
plan *pln, const problem *p);
|
|
IFFTW_EXTERN int X(ialignment_of)(R *p);
|
|
unsigned X(hash)(const char *s);
|
|
INT X(nbuf)(INT n, INT vl, INT maxnbuf);
|
|
int X(nbuf_redundant)(INT n, INT vl, size_t which,
|
|
const INT *maxnbuf, size_t nmaxnbuf);
|
|
INT X(bufdist)(INT n, INT vl);
|
|
int X(toobig)(INT n);
|
|
int X(ct_uglyp)(INT min_n, INT v, INT n, INT r);
|
|
|
|
#if HAVE_SIMD
|
|
R *X(taint)(R *p, INT s);
|
|
R *X(join_taint)(R *p1, R *p2);
|
|
#define TAINT(p, s) X(taint)(p, s)
|
|
#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
|
|
#define TAINTOF(p) (((uintptr_t)(p)) & 3)
|
|
#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
|
|
#else
|
|
#define TAINT(p, s) (p)
|
|
#define UNTAINT(p) (p)
|
|
#define TAINTOF(p) 0
|
|
#define JOIN_TAINT(p1, p2) p1
|
|
#endif
|
|
|
|
#define ASSERT_ALIGNED_DOUBLE /*unused, legacy*/
|
|
|
|
/*-----------------------------------------------------------------------*/
|
|
/* macros used in codelets to reduce source code size */
|
|
|
|
typedef R E; /* internal precision of codelets. */
|
|
|
|
#if defined(FFTW_LDOUBLE)
|
|
# define K(x) ((E) x##L)
|
|
#elif defined(FFTW_QUAD)
|
|
# define K(x) ((E) x##Q)
|
|
#else
|
|
# define K(x) ((E) x)
|
|
#endif
|
|
#define DK(name, value) const E name = K(value)
|
|
|
|
/* FMA macros */
|
|
|
|
#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
|
|
/* The obvious expression a * b + c does not work. If both x = a * b
|
|
+ c and y = a * b - c appear in the source, gcc computes t = a * b,
|
|
x = t + c, y = t - c, thus destroying the fma.
|
|
|
|
This peculiar coding seems to do the right thing on all of
|
|
gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
|
|
on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
|
|
`x' for the single-assignment form).
|
|
|
|
However, gcc-4.0 is a formidable adversary which succeeds in
|
|
pessimizing two fma's into one multiplication and two additions.
|
|
It does it very early in the game---before the optimization passes
|
|
even start. The only real workaround seems to use fake inline asm
|
|
such as
|
|
|
|
asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
|
|
return a * b + c;
|
|
|
|
in each of the FMA, FMS, FNMA, and FNMS functions. However, this
|
|
does not solve the problem either, because two equal asm statements
|
|
count as a common subexpression! One must use *different* fake asm
|
|
statements:
|
|
|
|
in FMA:
|
|
asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
|
|
|
|
in FMS:
|
|
asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
|
|
|
|
etc.
|
|
|
|
After these changes, gcc recalcitrantly generates the fma that was
|
|
in the source to begin with. However, the extra asm() cruft
|
|
confuses other passes of gcc, notably the instruction scheduler.
|
|
(Of course, one could also generate the fma directly via inline
|
|
asm, but this confuses the scheduler even more.)
|
|
|
|
Steven and I have submitted more than one bug report to the gcc
|
|
mailing list over the past few years, to no effect. Thus, I give
|
|
up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
|
|
out before touching this crap again.
|
|
*/
|
|
static __inline__ E FMA(E a, E b, E c)
|
|
{
|
|
E x = a * b;
|
|
x = x + c;
|
|
return x;
|
|
}
|
|
|
|
static __inline__ E FMS(E a, E b, E c)
|
|
{
|
|
E x = a * b;
|
|
x = x - c;
|
|
return x;
|
|
}
|
|
|
|
static __inline__ E FNMA(E a, E b, E c)
|
|
{
|
|
E x = a * b;
|
|
x = - (x + c);
|
|
return x;
|
|
}
|
|
|
|
static __inline__ E FNMS(E a, E b, E c)
|
|
{
|
|
E x = a * b;
|
|
x = - (x - c);
|
|
return x;
|
|
}
|
|
#else
|
|
#define FMA(a, b, c) (((a) * (b)) + (c))
|
|
#define FMS(a, b, c) (((a) * (b)) - (c))
|
|
#define FNMA(a, b, c) (- (((a) * (b)) + (c)))
|
|
#define FNMS(a, b, c) ((c) - ((a) * (b)))
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
} /* extern "C" */
|
|
#endif /* __cplusplus */
|
|
|
|
#endif /* __IFFTW_H__ */
|