NFFT  3.3.1
nfft.c
00001 /*
00002  * Copyright (c) 2002, 2016 Jens Keiner, Stefan Kunis, Daniel Potts
00003  *
00004  * This program is free software; you can redistribute it and/or modify it under
00005  * the terms of the GNU General Public License as published by the Free Software
00006  * Foundation; either version 2 of the License, or (at your option) any later
00007  * version.
00008  *
00009  * This program is distributed in the hope that it will be useful, but WITHOUT
00010  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00011  * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
00012  * details.
00013  *
00014  * You should have received a copy of the GNU General Public License along with
00015  * this program; if not, write to the Free Software Foundation, Inc., 51
00016  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
00017  */
00018 
00019 /* Nonequispaced FFT */
00020 
00021 /* Authors: D. Potts, S. Kunis 2002-2009, Jens Keiner 2009, Toni Volkmer 2012 */
00022 
00023 /* configure header */
00024 #include "config.h"
00025 
00026 /* complex datatype (maybe) */
00027 #ifdef HAVE_COMPLEX_H
00028 #include<complex.h>
00029 #endif
00030 
00031 /* NFFT headers */
00032 #include "nfft3.h"
00033 #include "infft.h"
00034 
00035 #ifdef _OPENMP
00036 #include <omp.h>
00037 #endif
00038 
00039 #ifdef OMP_ASSERT
00040 #include <assert.h>
00041 #endif
00042 
00043 #undef X
00044 #define X(name) NFFT(name)
00045 
00047 static inline INT intprod(const INT *vec, const INT a, const INT d)
00048 {
00049   INT t, p;
00050 
00051   p = 1;
00052   for (t = 0; t < d; t++)
00053     p *= vec[t] - a;
00054 
00055   return p;
00056 }
00057 
00058 /* handy shortcuts */
00059 #define BASE(x) CEXP(x)
00060 
00075 static inline void sort0(const INT d, const INT *n, const INT m,
00076     const INT local_x_num, const R *local_x, INT *ar_x)
00077 {
00078   INT u_j[d], i, j, help, rhigh;
00079   INT *ar_x_temp;
00080   INT nprod;
00081 
00082   for (i = 0; i < local_x_num; i++)
00083   {
00084     ar_x[2 * i] = 0;
00085     ar_x[2 *i + 1] = i;
00086     for (j = 0; j < d; j++)
00087     {
00088       help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
00089       u_j[j] = (help % n[j] + n[j]) % n[j];
00090 
00091       ar_x[2 * i] += u_j[j];
00092       if (j + 1 < d)
00093         ar_x[2 * i] *= n[j + 1];
00094     }
00095   }
00096 
00097   for (j = 0, nprod = 1; j < d; j++)
00098     nprod *= n[j];
00099 
00100   rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
00101 
00102   ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) * sizeof(INT));
00103   Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
00104 #ifdef OMP_ASSERT
00105   for (i = 1; i < local_x_num; i++)
00106     assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
00107 #endif
00108   Y(free)(ar_x_temp);
00109 }
00110 
00119 static inline void sort(const X(plan) *ths)
00120 {
00121   if (ths->flags & NFFT_SORT_NODES)
00122     sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
00123 }
00124 
00145 void X(trafo_direct)(const X(plan) *ths)
00146 {
00147   C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00148 
00149   memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
00150 
00151   if (ths->d == 1)
00152   {
00153     /* specialize for univariate case, rationale: faster */
00154     INT j;
00155 #ifdef _OPENMP
00156     #pragma omp parallel for default(shared) private(j)
00157 #endif
00158     for (j = 0; j < ths->M_total; j++)
00159     {
00160       INT k_L;
00161       for (k_L = 0; k_L < ths->N_total; k_L++)
00162       {
00163         R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
00164         f[j] += f_hat[k_L] * BASE(-II * omega);
00165       }
00166     }
00167   }
00168   else
00169   {
00170     /* multivariate case */
00171     INT j;
00172 #ifdef _OPENMP
00173     #pragma omp parallel for default(shared) private(j)
00174 #endif
00175     for (j = 0; j < ths->M_total; j++)
00176     {
00177       R x[ths->d], omega, Omega[ths->d + 1];
00178       INT t, t2, k_L, k[ths->d];
00179       Omega[0] = K(0.0);
00180       for (t = 0; t < ths->d; t++)
00181       {
00182         k[t] = -ths->N[t]/2;
00183         x[t] = K2PI * ths->x[j * ths->d + t];
00184         Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
00185       }
00186       omega = Omega[ths->d];
00187 
00188       for (k_L = 0; k_L < ths->N_total; k_L++)
00189       {
00190         f[j] += f_hat[k_L] * BASE(-II * omega);
00191         {
00192           for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
00193             k[t]-= ths->N[t]-1;
00194 
00195           k[t]++;
00196 
00197           for (t2 = t; t2 < ths->d; t2++)
00198             Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
00199 
00200           omega = Omega[ths->d];
00201         }
00202       }
00203     }
00204   }
00205 }
00206 
00207 void X(adjoint_direct)(const X(plan) *ths)
00208 {
00209   C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
00210 
00211   memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
00212 
00213   if (ths->d == 1)
00214   {
00215     /* specialize for univariate case, rationale: faster */
00216 #ifdef _OPENMP
00217       INT k_L;
00218       #pragma omp parallel for default(shared) private(k_L)
00219       for (k_L = 0; k_L < ths->N_total; k_L++)
00220       {
00221         INT j;
00222         for (j = 0; j < ths->M_total; j++)
00223         {
00224           R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
00225           f_hat[k_L] += f[j] * BASE(II * omega);
00226         }
00227       }
00228 #else
00229       INT j;
00230       for (j = 0; j < ths->M_total; j++)
00231       {
00232         INT k_L;
00233         for (k_L = 0; k_L < ths->N_total; k_L++)
00234         {
00235           R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
00236           f_hat[k_L] += f[j] * BASE(II * omega);
00237         }
00238       }
00239 #endif
00240   }
00241   else
00242   {
00243     /* multivariate case */
00244     INT j, k_L;
00245 #ifdef _OPENMP
00246     #pragma omp parallel for default(shared) private(j, k_L)
00247     for (k_L = 0; k_L < ths->N_total; k_L++)
00248     {
00249       INT k[ths->d], k_temp, t;
00250 
00251       k_temp = k_L;
00252 
00253       for (t = ths->d - 1; t >= 0; t--)
00254       {
00255         k[t] = k_temp % ths->N[t] - ths->N[t]/2;
00256         k_temp /= ths->N[t];
00257       }
00258 
00259       for (j = 0; j < ths->M_total; j++)
00260       {
00261         R omega = K(0.0);
00262         for (t = 0; t < ths->d; t++)
00263           omega += k[t] * K2PI * ths->x[j * ths->d + t];
00264         f_hat[k_L] += f[j] * BASE(II * omega);
00265       }
00266     }
00267 #else
00268     for (j = 0; j < ths->M_total; j++)
00269     {
00270       R x[ths->d], omega, Omega[ths->d+1];
00271       INT t, t2, k[ths->d];
00272       Omega[0] = K(0.0);
00273       for (t = 0; t < ths->d; t++)
00274       {
00275         k[t] = -ths->N[t]/2;
00276         x[t] = K2PI * ths->x[j * ths->d + t];
00277         Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
00278       }
00279       omega = Omega[ths->d];
00280       for (k_L = 0; k_L < ths->N_total; k_L++)
00281       {
00282         f_hat[k_L] += f[j] * BASE(II * omega);
00283 
00284         for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
00285           k[t]-= ths->N[t]-1;
00286 
00287         k[t]++;
00288 
00289         for (t2 = t; t2 < ths->d; t2++)
00290           Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
00291 
00292         omega = Omega[ths->d];
00293       }
00294     }
00295 #endif
00296   }
00297 }
00298 
00324 static inline void uo(const X(plan) *ths, const INT j, INT *up, INT *op,
00325   const INT act_dim)
00326 {
00327   const R xj = ths->x[j * ths->d + act_dim];
00328   INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
00329 
00330   (*up) = c - (ths->m);
00331   (*op) = c + 1 + (ths->m);
00332 }
00333 
00334 static inline void uo2(INT *u, INT *o, const R x, const INT n, const INT m)
00335 {
00336   INT c = LRINT(FLOOR(x * (R)(n)));
00337 
00338   *u = (c - m + n) % n;
00339   *o = (c + 1 + m + n) % n;
00340 }
00341 
00342 #define MACRO_D_compute_A \
00343 { \
00344   g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
00345 }
00346 
00347 #define MACRO_D_compute_T \
00348 { \
00349   f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
00350 }
00351 
00352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
00353 
00354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
00355 
00356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
00357 
00358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
00359 
00360 #define MACRO_init_k_ks \
00361 { \
00362   for (t = ths->d-1; 0 <= t; t--) \
00363   { \
00364     kp[t] = k[t] = 0; \
00365     ks[t] = ths->N[t]/2; \
00366   } \
00367   t++; \
00368 }
00369 
00370 #define MACRO_update_c_phi_inv_k(which_one) \
00371 { \
00372   for (t2 = t; t2 < ths->d; t2++) \
00373   { \
00374     c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
00375     ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
00376     k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
00377   } \
00378 }
00379 
00380 #define MACRO_count_k_ks \
00381 { \
00382   for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
00383   { \
00384     kp[t] = k[t] = 0; \
00385     ks[t]= ths->N[t]/2; \
00386   } \
00387 \
00388   kp[t]++; k[t]++; ks[t]++; \
00389   if(kp[t] == ths->N[t]/2) \
00390   { \
00391     k[t] = ths->n[t] - ths->N[t]/2; \
00392     ks[t] = 0; \
00393   } \
00394 } \
00395 
00396 /* sub routines for the fast transforms  matrix vector multiplication with D, D^T */
00397 #define MACRO_D(which_one) \
00398 static inline void D_serial_ ## which_one (X(plan) *ths) \
00399 { \
00400   C *f_hat, *g_hat; /* local copy */ \
00401   R c_phi_inv_k[ths->d+1]; /* postfix product of PHI_HUT */ \
00402   INT t, t2; /* index dimensions */ \
00403   INT k_L; /* plain index */ \
00404   INT kp[ths->d]; /* multi index (simple) */ \
00405   INT k[ths->d]; /* multi index in g_hat */ \
00406   INT ks[ths->d]; /* multi index in f_hat, c_phi_inv*/ \
00407   INT k_plain[ths->d+1]; /* postfix plain index */ \
00408   INT ks_plain[ths->d+1]; /* postfix plain index */ \
00409  \
00410   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
00411   MACRO_D_init_result_ ## which_one; \
00412 \
00413   c_phi_inv_k[0] = K(1.0); \
00414   k_plain[0] = 0; \
00415   ks_plain[0] = 0; \
00416 \
00417   MACRO_init_k_ks; \
00418 \
00419   if (ths->flags & PRE_PHI_HUT) \
00420   { \
00421     for (k_L = 0; k_L < ths->N_total; k_L++) \
00422     { \
00423       MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
00424       MACRO_D_compute_ ## which_one; \
00425       MACRO_count_k_ks; \
00426     } \
00427   } \
00428   else \
00429   { \
00430     for (k_L = 0; k_L < ths->N_total; k_L++) \
00431     { \
00432       MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
00433       MACRO_D_compute_ ## which_one; \
00434       MACRO_count_k_ks; \
00435     } \
00436   } \
00437 }
00438 
00439 #ifdef _OPENMP
00440 static inline void D_openmp_A(X(plan) *ths)
00441 {
00442   C *f_hat, *g_hat;                     
00443   INT k_L;                              
00445   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00446   memset(g_hat, 0, ths->n_total * sizeof(C));
00447 
00448   if (ths->flags & PRE_PHI_HUT)
00449   {
00450     #pragma omp parallel for default(shared) private(k_L)
00451     for (k_L = 0; k_L < ths->N_total; k_L++)
00452     {
00453       INT kp[ths->d];                        //0..N-1
00454       INT k[ths->d];                        
00455       INT ks[ths->d];                       
00456       R c_phi_inv_k_val = K(1.0);
00457       INT k_plain_val = 0;
00458       INT ks_plain_val = 0;
00459       INT t;
00460       INT k_temp = k_L;
00461 
00462       for (t = ths->d-1; t >= 0; t--)
00463       {
00464         kp[t] = k_temp % ths->N[t];
00465         if (kp[t] >= ths->N[t]/2)
00466           k[t] = ths->n[t] - ths->N[t] + kp[t];
00467         else
00468           k[t] = kp[t];
00469         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00470         k_temp /= ths->N[t];
00471       }
00472 
00473       for (t = 0; t < ths->d; t++)
00474       {
00475         c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00476         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00477         k_plain_val = k_plain_val*ths->n[t] + k[t];
00478       }
00479 
00480       g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00481     } /* for(k_L) */
00482   } /* if(PRE_PHI_HUT) */
00483   else
00484   {
00485     #pragma omp parallel for default(shared) private(k_L)
00486     for (k_L = 0; k_L < ths->N_total; k_L++)
00487     {
00488       INT kp[ths->d];                        //0..N-1
00489       INT k[ths->d];                        
00490       INT ks[ths->d];                       
00491       R c_phi_inv_k_val = K(1.0);
00492       INT k_plain_val = 0;
00493       INT ks_plain_val = 0;
00494       INT t;
00495       INT k_temp = k_L;
00496 
00497       for (t = ths->d-1; t >= 0; t--)
00498       {
00499         kp[t] = k_temp % ths->N[t];
00500         if (kp[t] >= ths->N[t]/2)
00501           k[t] = ths->n[t] - ths->N[t] + kp[t];
00502         else
00503           k[t] = kp[t];
00504         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00505         k_temp /= ths->N[t];
00506       }
00507 
00508       for (t = 0; t < ths->d; t++)
00509       {
00510         c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
00511         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00512         k_plain_val = k_plain_val*ths->n[t] + k[t];
00513       }
00514 
00515       g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
00516     } /* for(k_L) */
00517   } /* else(PRE_PHI_HUT) */
00518 }
00519 #endif
00520 
00521 #ifndef _OPENMP
00522 MACRO_D(A)
00523 #endif
00524 
00525 static inline void D_A(X(plan) *ths)
00526 {
00527 #ifdef _OPENMP
00528   D_openmp_A(ths);
00529 #else
00530   D_serial_A(ths);
00531 #endif
00532 }
00533 
00534 #ifdef _OPENMP
00535 static void D_openmp_T(X(plan) *ths)
00536 {
00537   C *f_hat, *g_hat;                     
00538   INT k_L;                              
00540   f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
00541   memset(f_hat, 0, ths->N_total * sizeof(C));
00542 
00543   if (ths->flags & PRE_PHI_HUT)
00544   {
00545     #pragma omp parallel for default(shared) private(k_L)
00546     for (k_L = 0; k_L < ths->N_total; k_L++)
00547     {
00548       INT kp[ths->d];                        //0..N-1
00549       INT k[ths->d];                        
00550       INT ks[ths->d];                       
00551       R c_phi_inv_k_val = K(1.0);
00552       INT k_plain_val = 0;
00553       INT ks_plain_val = 0;
00554       INT t;
00555       INT k_temp = k_L;
00556 
00557       for (t = ths->d - 1; t >= 0; t--)
00558       {
00559         kp[t] = k_temp % ths->N[t];
00560         if (kp[t] >= ths->N[t]/2)
00561           k[t] = ths->n[t] - ths->N[t] + kp[t];
00562         else
00563           k[t] = kp[t];
00564         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00565         k_temp /= ths->N[t];
00566       }
00567 
00568       for (t = 0; t < ths->d; t++)
00569       {
00570         c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
00571         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00572         k_plain_val = k_plain_val*ths->n[t] + k[t];
00573       }
00574 
00575       f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00576     } /* for(k_L) */
00577   } /* if(PRE_PHI_HUT) */
00578   else
00579   {
00580     #pragma omp parallel for default(shared) private(k_L)
00581     for (k_L = 0; k_L < ths->N_total; k_L++)
00582     {
00583       INT kp[ths->d];                        //0..N-1
00584       INT k[ths->d];                        
00585       INT ks[ths->d];                       
00586       R c_phi_inv_k_val = K(1.0);
00587       INT k_plain_val = 0;
00588       INT ks_plain_val = 0;
00589       INT t;
00590       INT k_temp = k_L;
00591 
00592       for (t = ths->d-1; t >= 0; t--)
00593       {
00594         kp[t] = k_temp % ths->N[t];
00595         if (kp[t] >= ths->N[t]/2)
00596           k[t] = ths->n[t] - ths->N[t] + kp[t];
00597         else
00598           k[t] = kp[t];
00599         ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
00600         k_temp /= ths->N[t];
00601       }
00602 
00603       for (t = 0; t < ths->d; t++)
00604       {
00605         c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
00606         ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
00607         k_plain_val = k_plain_val*ths->n[t] + k[t];
00608       }
00609 
00610       f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
00611     } /* for(k_L) */
00612   } /* else(PRE_PHI_HUT) */
00613 }
00614 #endif
00615 
00616 #ifndef _OPENMP
00617 MACRO_D(T)
00618 #endif
00619 
00620 static void D_T(X(plan) *ths)
00621 {
00622 #ifdef _OPENMP
00623   D_openmp_T(ths);
00624 #else
00625   D_serial_T(ths);
00626 #endif
00627 }
00628 
00629 /* sub routines for the fast transforms matrix vector multiplication with B, B^T */
00630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
00631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
00632 
00633 #define MACRO_B_PRE_FULL_PSI_compute_A \
00634 { \
00635   (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
00636 }
00637 
00638 #define MACRO_B_PRE_FULL_PSI_compute_T \
00639 { \
00640   g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
00641 }
00642 
00643 #define MACRO_B_compute_A \
00644 { \
00645   (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
00646 }
00647 
00648 #define MACRO_B_compute_T \
00649 { \
00650   g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
00651 }
00652 
00653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
00654 
00655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
00656 
00657 #define MACRO_without_PRE_PSI  PHI(ths->n[t2], ths->x[j*ths->d+t2] \
00658   - ((R)l[t2])/((R)ths->n[t2]), t2)
00659 
00660 #define MACRO_init_uo_l_lj_t \
00661 { \
00662   for (t = ths->d-1; t >= 0; t--) \
00663   { \
00664     uo(ths,j,&u[t],&o[t],t); \
00665     l[t] = u[t]; \
00666     lj[t] = 0; \
00667   } \
00668   t++; \
00669 }
00670 
00671 #define MACRO_update_phi_prod_ll_plain(which_one) { \
00672   for (t2 = t; t2 < ths->d; t2++) \
00673     { \
00674       phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
00675       ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
00676     } \
00677 }
00678 
00679 #define MACRO_count_uo_l_lj_t \
00680 { \
00681   for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
00682   { \
00683     l[t] = u[t]; \
00684     lj[t] = 0; \
00685   } \
00686  \
00687   l[t]++; \
00688   lj[t]++; \
00689 }
00690 
00691 #define MACRO_B(which_one) \
00692 static inline void B_serial_ ## which_one (X(plan) *ths) \
00693 { \
00694   INT lprod; /* 'regular bandwidth' of matrix B  */ \
00695   INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */ \
00696   INT t, t2; /* index dimensions */ \
00697   INT j; /* index nodes */ \
00698   INT l_L, ix; /* index one row of B */ \
00699   INT l[ths->d]; /* multi index u<=l<=o */ \
00700   INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */ \
00701   INT ll_plain[ths->d+1]; /* postfix plain index in g */ \
00702   R phi_prod[ths->d+1]; /* postfix product of PHI */ \
00703   C *f, *g; /* local copy */ \
00704   C *fj; /* local copy */ \
00705   R y[ths->d]; \
00706   R fg_psi[ths->d][2*ths->m+2]; \
00707   R fg_exp_l[ths->d][2*ths->m+2]; \
00708   INT l_fg,lj_fg; \
00709   R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
00710   R ip_w; \
00711   INT ip_u; \
00712   INT ip_s = ths->K/(ths->m+2); \
00713  \
00714   f = (C*)ths->f; g = (C*)ths->g; \
00715  \
00716   MACRO_B_init_result_ ## which_one; \
00717  \
00718   if (ths->flags & PRE_FULL_PSI) \
00719   { \
00720     for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
00721     { \
00722       for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
00723       { \
00724         MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
00725       } \
00726     } \
00727     return; \
00728   } \
00729 \
00730   phi_prod[0] = K(1.0); \
00731   ll_plain[0] = 0; \
00732 \
00733   for (t = 0, lprod = 1; t < ths->d; t++) \
00734     lprod *= (2 * ths->m + 2); \
00735 \
00736   if (ths->flags & PRE_PSI) \
00737   { \
00738     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00739     { \
00740       MACRO_init_uo_l_lj_t; \
00741  \
00742       for (l_L = 0; l_L < lprod; l_L++) \
00743       { \
00744         MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
00745  \
00746         MACRO_B_compute_ ## which_one; \
00747  \
00748         MACRO_count_uo_l_lj_t; \
00749       } /* for(l_L) */ \
00750     } /* for(j) */ \
00751     return; \
00752   } /* if(PRE_PSI) */ \
00753  \
00754   if (ths->flags & PRE_FG_PSI) \
00755   { \
00756     for(t2 = 0; t2 < ths->d; t2++) \
00757     { \
00758       tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
00759       tmpEXP2sq = tmpEXP2*tmpEXP2; \
00760       tmp2 = K(1.0); \
00761       tmp3 = K(1.0); \
00762       fg_exp_l[t2][0] = K(1.0); \
00763       for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
00764       { \
00765         tmp3 = tmp2*tmpEXP2; \
00766         tmp2 *= tmpEXP2sq; \
00767         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
00768       } \
00769     } \
00770     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00771     { \
00772       MACRO_init_uo_l_lj_t; \
00773  \
00774       for (t2 = 0; t2 < ths->d; t2++) \
00775       { \
00776         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
00777         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
00778         tmp1 = K(1.0); \
00779         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
00780         { \
00781           tmp1 *= tmpEXP1; \
00782           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
00783         } \
00784       } \
00785  \
00786       for (l_L= 0; l_L < lprod; l_L++) \
00787       { \
00788         MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00789  \
00790         MACRO_B_compute_ ## which_one; \
00791  \
00792         MACRO_count_uo_l_lj_t; \
00793       } /* for(l_L) */ \
00794     } /* for(j) */ \
00795     return; \
00796   } /* if(PRE_FG_PSI) */ \
00797  \
00798   if (ths->flags & FG_PSI) \
00799   { \
00800     for (t2 = 0; t2 < ths->d; t2++) \
00801     { \
00802       tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
00803       tmpEXP2sq = tmpEXP2*tmpEXP2; \
00804       tmp2 = K(1.0); \
00805       tmp3 = K(1.0); \
00806       fg_exp_l[t2][0] = K(1.0); \
00807       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
00808       { \
00809         tmp3 = tmp2*tmpEXP2; \
00810         tmp2 *= tmpEXP2sq; \
00811         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
00812       } \
00813     } \
00814     for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00815     { \
00816       MACRO_init_uo_l_lj_t; \
00817  \
00818       for (t2 = 0; t2 < ths->d; t2++) \
00819       { \
00820         fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
00821  \
00822         tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
00823           /ths->b[t2]); \
00824         tmp1 = K(1.0); \
00825         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
00826         { \
00827           tmp1 *= tmpEXP1; \
00828           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
00829         } \
00830       } \
00831  \
00832       for (l_L = 0; l_L < lprod; l_L++) \
00833       { \
00834         MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00835  \
00836         MACRO_B_compute_ ## which_one; \
00837  \
00838         MACRO_count_uo_l_lj_t; \
00839       } /* for(l_L) */ \
00840     } /* for(j) */ \
00841     return; \
00842   } /* if(FG_PSI) */ \
00843  \
00844   if (ths->flags & PRE_LIN_PSI) \
00845   { \
00846     for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
00847     { \
00848       MACRO_init_uo_l_lj_t; \
00849  \
00850       for (t2 = 0; t2 < ths->d; t2++) \
00851       { \
00852         y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
00853           * ((R)(ths->K))) / (R)(ths->m + 2); \
00854         ip_u  = LRINT(FLOOR(y[t2])); \
00855         ip_w  = y[t2]-ip_u; \
00856         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
00857         { \
00858           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
00859             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
00860             * (ip_w); \
00861         } \
00862       } \
00863  \
00864       for (l_L = 0; l_L < lprod; l_L++) \
00865       { \
00866         MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
00867  \
00868         MACRO_B_compute_ ## which_one; \
00869  \
00870         MACRO_count_uo_l_lj_t; \
00871       } /* for(l_L) */ \
00872     } /* for(j) */ \
00873     return; \
00874   } /* if(PRE_LIN_PSI) */ \
00875  \
00876   /* no precomputed psi at all */ \
00877   for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
00878   { \
00879     MACRO_init_uo_l_lj_t; \
00880  \
00881     for (l_L = 0; l_L < lprod; l_L++) \
00882     { \
00883       MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
00884  \
00885       MACRO_B_compute_ ## which_one; \
00886  \
00887       MACRO_count_uo_l_lj_t; \
00888     } /* for(l_L) */ \
00889   } /* for(j) */ \
00890 } /* nfft_B */ \
00891 
00892 #ifndef _OPENMP
00893 MACRO_B(A)
00894 #endif
00895 
00896 #ifdef _OPENMP
00897 static inline void B_openmp_A (X(plan) *ths)
00898 {
00899   INT lprod; /* 'regular bandwidth' of matrix B  */
00900   INT k;
00901 
00902   memset(ths->f, 0, ths->M_total * sizeof(C));
00903 
00904   for (k = 0, lprod = 1; k < ths->d; k++)
00905     lprod *= (2*ths->m+2);
00906 
00907   if (ths->flags & PRE_FULL_PSI)
00908   {
00909     #pragma omp parallel for default(shared) private(k)
00910     for (k = 0; k < ths->M_total; k++)
00911     {
00912       INT l;
00913       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
00914       ths->f[j] = K(0.0);
00915       for (l = 0; l < lprod; l++)
00916         ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
00917     }
00918     return;
00919   }
00920 
00921   if (ths->flags & PRE_PSI)
00922   {
00923     #pragma omp parallel for default(shared) private(k)
00924     for (k = 0; k < ths->M_total; k++)
00925     {
00926       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
00927       INT t, t2; /* index dimensions */
00928       INT l_L; /* index one row of B */
00929       INT l[ths->d]; /* multi index u<=l<=o */
00930       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
00931       INT ll_plain[ths->d+1]; /* postfix plain index in g */
00932       R phi_prod[ths->d+1]; /* postfix product of PHI */
00933       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
00934 
00935       phi_prod[0] = K(1.0);
00936       ll_plain[0] = 0;
00937 
00938       MACRO_init_uo_l_lj_t;
00939 
00940       for (l_L = 0; l_L < lprod; l_L++)
00941       {
00942         MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
00943 
00944         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
00945 
00946         MACRO_count_uo_l_lj_t;
00947       } /* for(l_L) */
00948     } /* for(j) */
00949     return;
00950   } /* if(PRE_PSI) */
00951 
00952   if (ths->flags & PRE_FG_PSI)
00953   {
00954     INT t, t2; /* index dimensions */
00955     R fg_exp_l[ths->d][2*ths->m+2];
00956 
00957     for (t2 = 0; t2 < ths->d; t2++)
00958     {
00959       INT lj_fg;
00960       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
00961       R tmpEXP2sq = tmpEXP2*tmpEXP2;
00962       R tmp2 = K(1.0);
00963       R tmp3 = K(1.0);
00964       fg_exp_l[t2][0] = K(1.0);
00965       for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
00966       {
00967         tmp3 = tmp2*tmpEXP2;
00968         tmp2 *= tmpEXP2sq;
00969         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
00970       }
00971     }
00972 
00973     #pragma omp parallel for default(shared) private(k,t,t2)
00974     for (k = 0; k < ths->M_total; k++)
00975     {
00976       INT ll_plain[ths->d+1]; /* postfix plain index in g */
00977       R phi_prod[ths->d+1]; /* postfix product of PHI */
00978       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
00979       INT l[ths->d]; /* multi index u<=l<=o */
00980       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
00981       R fg_psi[ths->d][2*ths->m+2];
00982       R tmpEXP1, tmp1;
00983       INT l_fg,lj_fg;
00984       INT l_L;
00985       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
00986 
00987       phi_prod[0] = K(1.0);
00988       ll_plain[0] = 0;
00989 
00990       MACRO_init_uo_l_lj_t;
00991 
00992       for (t2 = 0; t2 < ths->d; t2++)
00993       {
00994         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
00995         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
00996         tmp1 = K(1.0);
00997         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
00998         {
00999           tmp1 *= tmpEXP1;
01000           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01001         }
01002       }
01003 
01004       for (l_L= 0; l_L < lprod; l_L++)
01005       {
01006         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01007 
01008         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01009 
01010         MACRO_count_uo_l_lj_t;
01011       } /* for(l_L) */
01012     } /* for(j) */
01013     return;
01014   } /* if(PRE_FG_PSI) */
01015 
01016   if (ths->flags & FG_PSI)
01017   {
01018     INT t, t2; /* index dimensions */
01019     R fg_exp_l[ths->d][2*ths->m+2];
01020 
01021     sort(ths);
01022 
01023     for (t2 = 0; t2 < ths->d; t2++)
01024     {
01025       INT lj_fg;
01026       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01027       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01028       R tmp2 = K(1.0);
01029       R tmp3 = K(1.0);
01030       fg_exp_l[t2][0] = K(1.0);
01031       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01032       {
01033         tmp3 = tmp2*tmpEXP2;
01034         tmp2 *= tmpEXP2sq;
01035         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01036       }
01037     }
01038 
01039     #pragma omp parallel for default(shared) private(k,t,t2)
01040     for (k = 0; k < ths->M_total; k++)
01041     {
01042       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01043       R phi_prod[ths->d+1]; /* postfix product of PHI */
01044       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01045       INT l[ths->d]; /* multi index u<=l<=o */
01046       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01047       R fg_psi[ths->d][2*ths->m+2];
01048       R tmpEXP1, tmp1;
01049       INT l_fg,lj_fg;
01050       INT l_L;
01051       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01052 
01053       phi_prod[0] = K(1.0);
01054       ll_plain[0] = 0;
01055 
01056       MACRO_init_uo_l_lj_t;
01057 
01058       for (t2 = 0; t2 < ths->d; t2++)
01059       {
01060         fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01061 
01062         tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01063           /ths->b[t2]);
01064         tmp1 = K(1.0);
01065         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01066         {
01067           tmp1 *= tmpEXP1;
01068           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01069         }
01070       }
01071 
01072       for (l_L = 0; l_L < lprod; l_L++)
01073       {
01074         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01075 
01076         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01077 
01078         MACRO_count_uo_l_lj_t;
01079       } /* for(l_L) */
01080     } /* for(j) */
01081     return;
01082   } /* if(FG_PSI) */
01083 
01084   if (ths->flags & PRE_LIN_PSI)
01085   {
01086     sort(ths);
01087 
01088     #pragma omp parallel for default(shared) private(k)
01089     for (k = 0; k<ths->M_total; k++)
01090     {
01091       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01092       INT t, t2; /* index dimensions */
01093       INT l_L; /* index one row of B */
01094       INT l[ths->d]; /* multi index u<=l<=o */
01095       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01096       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01097       R phi_prod[ths->d+1]; /* postfix product of PHI */
01098       R y[ths->d];
01099       R fg_psi[ths->d][2*ths->m+2];
01100       INT l_fg,lj_fg;
01101       R ip_w;
01102       INT ip_u;
01103       INT ip_s = ths->K/(ths->m+2);
01104       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01105 
01106       phi_prod[0] = K(1.0);
01107       ll_plain[0] = 0;
01108 
01109       MACRO_init_uo_l_lj_t;
01110 
01111       for (t2 = 0; t2 < ths->d; t2++)
01112       {
01113         y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01114           * ((R)ths->K))/(ths->m+2);
01115         ip_u  = LRINT(FLOOR(y[t2]));
01116         ip_w  = y[t2]-ip_u;
01117         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01118         {
01119           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01120             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01121             * (ip_w);
01122         }
01123       }
01124 
01125       for (l_L = 0; l_L < lprod; l_L++)
01126       {
01127         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01128 
01129         ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01130 
01131         MACRO_count_uo_l_lj_t;
01132       } /* for(l_L) */
01133     } /* for(j) */
01134     return;
01135   } /* if(PRE_LIN_PSI) */
01136 
01137   /* no precomputed psi at all */
01138   sort(ths);
01139 
01140   #pragma omp parallel for default(shared) private(k)
01141   for (k = 0; k < ths->M_total; k++)
01142   {
01143     INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01144     INT t, t2; /* index dimensions */
01145     INT l_L; /* index one row of B */
01146     INT l[ths->d]; /* multi index u<=l<=o */
01147     INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01148     INT ll_plain[ths->d+1]; /* postfix plain index in g */
01149     R phi_prod[ths->d+1]; /* postfix product of PHI */
01150     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01151 
01152     phi_prod[0] = K(1.0);
01153     ll_plain[0] = 0;
01154 
01155     MACRO_init_uo_l_lj_t;
01156 
01157     for (l_L = 0; l_L < lprod; l_L++)
01158     {
01159       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01160 
01161       ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
01162 
01163       MACRO_count_uo_l_lj_t;
01164     } /* for(l_L) */
01165   } /* for(j) */
01166 }
01167 #endif
01168 
01169 static void B_A(X(plan) *ths)
01170 {
01171 #ifdef _OPENMP
01172   B_openmp_A(ths);
01173 #else
01174   B_serial_A(ths);
01175 #endif
01176 }
01177 
01178 #ifdef _OPENMP
01179 
01194 static inline INT index_x_binary_search(const INT *ar_x, const INT len, const INT key)
01195 {
01196   INT left = 0, right = len - 1;
01197 
01198   if (len == 1)
01199     return 0;
01200 
01201   while (left < right - 1)
01202   {
01203     INT i = (left + right) / 2;
01204     if (ar_x[2*i] >= key)
01205       right = i;
01206     else if (ar_x[2*i] < key)
01207       left = i;
01208   }
01209 
01210   if (ar_x[2*left] < key && left != len-1)
01211     return left+1;
01212 
01213   return left;
01214 }
01215 #endif
01216 
01217 #ifdef _OPENMP
01218 
01233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
01234     INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b, const INT d,
01235     const INT *n, const INT m)
01236 {
01237   const INT n0 = n[0];
01238   INT k;
01239   INT nthreads = omp_get_num_threads();
01240   INT nthreads_used = MIN(nthreads, n0);
01241   INT size_per_thread = n0 / nthreads_used;
01242   INT size_left = n0 - size_per_thread * nthreads_used;
01243   INT size_g[nthreads_used];
01244   INT offset_g[nthreads_used];
01245   INT my_id = omp_get_thread_num();
01246   INT n_prod_rest = 1;
01247 
01248   for (k = 1; k < d; k++)
01249     n_prod_rest *= n[k];
01250 
01251   *min_u_a = -1;
01252   *max_u_a = -1;
01253   *min_u_b = -1;
01254   *max_u_b = -1;
01255   *my_u0 = -1;
01256   *my_o0 = -1;
01257 
01258   if (my_id < nthreads_used)
01259   {
01260     const INT m22 = 2 * m + 2;
01261 
01262     offset_g[0] = 0;
01263     for (k = 0; k < nthreads_used; k++)
01264     {
01265       if (k > 0)
01266         offset_g[k] = offset_g[k-1] + size_g[k-1];
01267       size_g[k] = size_per_thread;
01268       if (size_left > 0)
01269       {
01270         size_g[k]++;
01271         size_left--;
01272       }
01273     }
01274 
01275     *my_u0 = offset_g[my_id];
01276     *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
01277 
01278     if (nthreads_used > 1)
01279     {
01280       *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
01281       *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
01282     }
01283     else
01284     {
01285       *min_u_a = 0;
01286       *max_u_a = n_prod_rest * n0 - 1;
01287     }
01288 
01289     if (*min_u_a < 0)
01290     {
01291       *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
01292       *max_u_b = n_prod_rest * n0 - 1;
01293       *min_u_a = 0;
01294     }
01295 
01296     if (*min_u_b != -1 && *min_u_b <= *max_u_a)
01297     {
01298       *max_u_a = *max_u_b;
01299       *min_u_b = -1;
01300       *max_u_b = -1;
01301     }
01302 #ifdef OMP_ASSERT
01303     assert(*min_u_a <= *max_u_a);
01304     assert(*min_u_b <= *max_u_b);
01305     assert(*min_u_b == -1 || *max_u_a < *min_u_b);
01306 #endif
01307   }
01308 }
01309 #endif
01310 
01319 static void nfft_adjoint_B_compute_full_psi(C *g, const INT *psi_index_g,
01320     const R *psi, const C *f, const INT M, const INT d, const INT *n,
01321     const INT m, const unsigned flags, const INT *index_x)
01322 {
01323   INT k;
01324   INT lprod;
01325 #ifdef _OPENMP
01326   INT lprod_m1;
01327 #endif
01328 #ifndef _OPENMP
01329   UNUSED(n);
01330 #endif
01331   {
01332     INT t;
01333     for(t = 0, lprod = 1; t < d; t++)
01334         lprod *= 2 * m + 2;
01335   }
01336 #ifdef _OPENMP
01337   lprod_m1 = lprod / (2 * m + 2);
01338 #endif
01339 
01340 #ifdef _OPENMP
01341   if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
01342   {
01343     #pragma omp parallel private(k)
01344     {
01345       INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
01346       const INT *ar_x = index_x;
01347       INT n_prod_rest = 1;
01348 
01349       for (k = 1; k < d; k++)
01350         n_prod_rest *= n[k];
01351 
01352       nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
01353 
01354       if (min_u_a != -1)
01355       {
01356         k = index_x_binary_search(ar_x, M, min_u_a);
01357 #ifdef OMP_ASSERT
01358         assert(ar_x[2*k] >= min_u_a || k == M-1);
01359         if (k > 0)
01360           assert(ar_x[2*k-2] < min_u_a);
01361 #endif
01362         while (k < M)
01363         {
01364           INT l0, lrest;
01365           INT u_prod = ar_x[2*k];
01366           INT j = ar_x[2*k+1];
01367 
01368           if (u_prod < min_u_a || u_prod > max_u_a)
01369             break;
01370 
01371           for (l0 = 0; l0 < 2 * m + 2; l0++)
01372           {
01373             const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01374 
01375             if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01376               continue;
01377 
01378             for (lrest = 0; lrest < lprod_m1; lrest++)
01379             {
01380               const INT l = l0 * lprod_m1 + lrest;
01381               g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01382             }
01383           }
01384 
01385           k++;
01386         }
01387       }
01388 
01389       if (min_u_b != -1)
01390       {
01391         k = index_x_binary_search(ar_x, M, min_u_b);
01392 #ifdef OMP_ASSERT
01393         assert(ar_x[2*k] >= min_u_b || k == M-1);
01394         if (k > 0)
01395           assert(ar_x[2*k-2] < min_u_b);
01396 #endif
01397         while (k < M)
01398         {
01399           INT l0, lrest;
01400           INT u_prod = ar_x[2*k];
01401           INT j = ar_x[2*k+1];
01402 
01403           if (u_prod < min_u_b || u_prod > max_u_b)
01404             break;
01405 
01406           for (l0 = 0; l0 < 2 * m + 2; l0++)
01407           {
01408             const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
01409 
01410             if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
01411               continue;
01412             for (lrest = 0; lrest < lprod_m1; lrest++)
01413             {
01414               const INT l = l0 * lprod_m1 + lrest;
01415               g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01416             }
01417           }
01418 
01419           k++;
01420         }
01421       }
01422     } /* omp parallel */
01423     return;
01424   } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */
01425 #endif
01426 
01427 #ifdef _OPENMP
01428   #pragma omp parallel for default(shared) private(k)
01429 #endif
01430   for (k = 0; k < M; k++)
01431   {
01432     INT l;
01433     INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
01434 
01435     for (l = 0; l < lprod; l++)
01436     {
01437 #ifdef _OPENMP
01438       C val = psi[j * lprod + l] * f[j];
01439       C *gref = g + psi_index_g[j * lprod + l];
01440       R *gref_real = (R*) gref;
01441 
01442       #pragma omp atomic
01443       gref_real[0] += CREAL(val);
01444 
01445       #pragma omp atomic
01446       gref_real[1] += CIMAG(val);
01447 #else
01448       g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
01449 #endif
01450     }
01451   }
01452 }
01453 
01454 #ifndef _OPENMP
01455 MACRO_B(T)
01456 #endif
01457 
01458 #ifdef _OPENMP
01459 static inline void B_openmp_T(X(plan) *ths)
01460 {
01461   INT lprod; /* 'regular bandwidth' of matrix B  */
01462   INT k;
01463 
01464   memset(ths->g, 0, (size_t)(ths->n_total) * sizeof(C));
01465 
01466   for (k = 0, lprod = 1; k < ths->d; k++)
01467     lprod *= (2*ths->m+2);
01468 
01469   if (ths->flags & PRE_FULL_PSI)
01470   {
01471     nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
01472         ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
01473     return;
01474   }
01475 
01476   if (ths->flags & PRE_PSI)
01477   {
01478     #pragma omp parallel for default(shared) private(k)
01479     for (k = 0; k < ths->M_total; k++)
01480     {
01481       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01482       INT t, t2; /* index dimensions */
01483       INT l_L; /* index one row of B */
01484       INT l[ths->d]; /* multi index u<=l<=o */
01485       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01486       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01487       R phi_prod[ths->d+1]; /* postfix product of PHI */
01488       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01489 
01490       phi_prod[0] = K(1.0);
01491       ll_plain[0] = 0;
01492 
01493       MACRO_init_uo_l_lj_t;
01494 
01495       for (l_L = 0; l_L < lprod; l_L++)
01496       {
01497         C *lhs;
01498         R *lhs_real;
01499         C val;
01500 
01501         MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
01502 
01503         lhs = ths->g + ll_plain[ths->d];
01504         lhs_real = (R*)lhs;
01505         val = phi_prod[ths->d] * ths->f[j];
01506 
01507         #pragma omp atomic
01508         lhs_real[0] += CREAL(val);
01509 
01510         #pragma omp atomic
01511         lhs_real[1] += CIMAG(val);
01512 
01513         MACRO_count_uo_l_lj_t;
01514       } /* for(l_L) */
01515     } /* for(j) */
01516     return;
01517   } /* if(PRE_PSI) */
01518 
01519   if (ths->flags & PRE_FG_PSI)
01520   {
01521     INT t, t2; /* index dimensions */
01522     R fg_exp_l[ths->d][2*ths->m+2];
01523     for(t2 = 0; t2 < ths->d; t2++)
01524     {
01525       INT lj_fg;
01526       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01527       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01528       R tmp2 = K(1.0);
01529       R tmp3 = K(1.0);
01530       fg_exp_l[t2][0] = K(1.0);
01531       for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01532       {
01533         tmp3 = tmp2*tmpEXP2;
01534         tmp2 *= tmpEXP2sq;
01535         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01536       }
01537     }
01538 
01539     #pragma omp parallel for default(shared) private(k,t,t2)
01540     for (k = 0; k < ths->M_total; k++)
01541     {
01542       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01543       R phi_prod[ths->d+1]; /* postfix product of PHI */
01544       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01545       INT l[ths->d]; /* multi index u<=l<=o */
01546       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01547       R fg_psi[ths->d][2*ths->m+2];
01548       R tmpEXP1, tmp1;
01549       INT l_fg,lj_fg;
01550       INT l_L;
01551       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01552 
01553       phi_prod[0] = K(1.0);
01554       ll_plain[0] = 0;
01555 
01556       MACRO_init_uo_l_lj_t;
01557 
01558       for (t2 = 0; t2 < ths->d; t2++)
01559       {
01560         fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
01561         tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
01562         tmp1 = K(1.0);
01563         for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01564         {
01565           tmp1 *= tmpEXP1;
01566           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01567         }
01568       }
01569 
01570       for (l_L= 0; l_L < lprod; l_L++)
01571       {
01572         C *lhs;
01573         R *lhs_real;
01574         C val;
01575 
01576         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01577 
01578         lhs = ths->g + ll_plain[ths->d];
01579         lhs_real = (R*)lhs;
01580         val = phi_prod[ths->d] * ths->f[j];
01581 
01582         #pragma omp atomic
01583         lhs_real[0] += CREAL(val);
01584 
01585         #pragma omp atomic
01586         lhs_real[1] += CIMAG(val);
01587 
01588         MACRO_count_uo_l_lj_t;
01589       } /* for(l_L) */
01590     } /* for(j) */
01591     return;
01592   } /* if(PRE_FG_PSI) */
01593 
01594   if (ths->flags & FG_PSI)
01595   {
01596     INT t, t2; /* index dimensions */
01597     R fg_exp_l[ths->d][2*ths->m+2];
01598 
01599     sort(ths);
01600 
01601     for (t2 = 0; t2 < ths->d; t2++)
01602     {
01603       INT lj_fg;
01604       R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
01605       R tmpEXP2sq = tmpEXP2*tmpEXP2;
01606       R tmp2 = K(1.0);
01607       R tmp3 = K(1.0);
01608       fg_exp_l[t2][0] = K(1.0);
01609       for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
01610       {
01611         tmp3 = tmp2*tmpEXP2;
01612         tmp2 *= tmpEXP2sq;
01613         fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
01614       }
01615     }
01616 
01617     #pragma omp parallel for default(shared) private(k,t,t2)
01618     for (k = 0; k < ths->M_total; k++)
01619     {
01620       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01621       R phi_prod[ths->d+1]; /* postfix product of PHI */
01622       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01623       INT l[ths->d]; /* multi index u<=l<=o */
01624       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01625       R fg_psi[ths->d][2*ths->m+2];
01626       R tmpEXP1, tmp1;
01627       INT l_fg,lj_fg;
01628       INT l_L;
01629       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01630 
01631       phi_prod[0] = K(1.0);
01632       ll_plain[0] = 0;
01633 
01634       MACRO_init_uo_l_lj_t;
01635 
01636       for (t2 = 0; t2 < ths->d; t2++)
01637       {
01638         fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
01639 
01640         tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
01641           /ths->b[t2]);
01642         tmp1 = K(1.0);
01643         for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
01644         {
01645           tmp1 *= tmpEXP1;
01646           fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
01647         }
01648       }
01649 
01650       for (l_L = 0; l_L < lprod; l_L++)
01651       {
01652         C *lhs;
01653         R *lhs_real;
01654         C val;
01655 
01656         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01657 
01658         lhs = ths->g + ll_plain[ths->d];
01659         lhs_real = (R*)lhs;
01660         val = phi_prod[ths->d] * ths->f[j];
01661 
01662         #pragma omp atomic
01663         lhs_real[0] += CREAL(val);
01664 
01665         #pragma omp atomic
01666         lhs_real[1] += CIMAG(val);
01667 
01668         MACRO_count_uo_l_lj_t;
01669       } /* for(l_L) */
01670     } /* for(j) */
01671     return;
01672   } /* if(FG_PSI) */
01673 
01674   if (ths->flags & PRE_LIN_PSI)
01675   {
01676     sort(ths);
01677 
01678     #pragma omp parallel for default(shared) private(k)
01679     for (k = 0; k<ths->M_total; k++)
01680     {
01681       INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01682       INT t, t2; /* index dimensions */
01683       INT l_L; /* index one row of B */
01684       INT l[ths->d]; /* multi index u<=l<=o */
01685       INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01686       INT ll_plain[ths->d+1]; /* postfix plain index in g */
01687       R phi_prod[ths->d+1]; /* postfix product of PHI */
01688       R y[ths->d];
01689       R fg_psi[ths->d][2*ths->m+2];
01690       INT l_fg,lj_fg;
01691       R ip_w;
01692       INT ip_u;
01693       INT ip_s = ths->K/(ths->m+2);
01694       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01695 
01696       phi_prod[0] = K(1.0);
01697       ll_plain[0] = 0;
01698 
01699       MACRO_init_uo_l_lj_t;
01700 
01701       for (t2 = 0; t2 < ths->d; t2++)
01702       {
01703         y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
01704           * ((R)ths->K))/(ths->m+2);
01705         ip_u  = LRINT(FLOOR(y[t2]));
01706         ip_w  = y[t2]-ip_u;
01707         for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
01708         {
01709           fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
01710             * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
01711             * (ip_w);
01712         }
01713       }
01714 
01715       for (l_L = 0; l_L < lprod; l_L++)
01716       {
01717         C *lhs;
01718         R *lhs_real;
01719         C val;
01720 
01721         MACRO_update_phi_prod_ll_plain(with_FG_PSI);
01722 
01723         lhs = ths->g + ll_plain[ths->d];
01724         lhs_real = (R*)lhs;
01725         val = phi_prod[ths->d] * ths->f[j];
01726 
01727         #pragma omp atomic
01728         lhs_real[0] += CREAL(val);
01729 
01730         #pragma omp atomic
01731         lhs_real[1] += CIMAG(val);
01732 
01733         MACRO_count_uo_l_lj_t;
01734       } /* for(l_L) */
01735     } /* for(j) */
01736     return;
01737   } /* if(PRE_LIN_PSI) */
01738 
01739   /* no precomputed psi at all */
01740   sort(ths);
01741 
01742   #pragma omp parallel for default(shared) private(k)
01743   for (k = 0; k < ths->M_total; k++)
01744   {
01745     INT u[ths->d], o[ths->d]; /* multi band with respect to x_j */
01746     INT t, t2; /* index dimensions */
01747     INT l_L; /* index one row of B */
01748     INT l[ths->d]; /* multi index u<=l<=o */
01749     INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
01750     INT ll_plain[ths->d+1]; /* postfix plain index in g */
01751     R phi_prod[ths->d+1]; /* postfix product of PHI */
01752     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01753 
01754     phi_prod[0] = K(1.0);
01755     ll_plain[0] = 0;
01756 
01757     MACRO_init_uo_l_lj_t;
01758 
01759     for (l_L = 0; l_L < lprod; l_L++)
01760     {
01761       C *lhs;
01762       R *lhs_real;
01763       C val;
01764 
01765       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
01766 
01767       lhs = ths->g + ll_plain[ths->d];
01768       lhs_real = (R*)lhs;
01769       val = phi_prod[ths->d] * ths->f[j];
01770 
01771       #pragma omp atomic
01772       lhs_real[0] += CREAL(val);
01773 
01774       #pragma omp atomic
01775       lhs_real[1] += CIMAG(val);
01776 
01777       MACRO_count_uo_l_lj_t;
01778     } /* for(l_L) */
01779   } /* for(j) */
01780 }
01781 #endif
01782 
01783 static void B_T(X(plan) *ths)
01784 {
01785 #ifdef _OPENMP
01786   B_openmp_T(ths);
01787 #else
01788   B_serial_T(ths);
01789 #endif
01790 }
01791 
01792 /* ## specialized version for d=1  ########################################### */
01793 
01794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
01795 {
01796   const INT tmp2 = 2*m+2;
01797   INT l;
01798   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
01799 
01800   fg_exp_b0 = EXP(K(-1.0)/b);
01801   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
01802   fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
01803 
01804   for (l = 1; l < tmp2; l++)
01805   {
01806     fg_exp_b2 = fg_exp_b1*fg_exp_b0;
01807     fg_exp_b1 *= fg_exp_b0_sq;
01808     fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
01809   }
01810 }
01811 
01812 
01813 static void nfft_trafo_1d_compute(C *fj, const C *g,const R *psij_const,
01814   const R *xj, const INT n, const INT m)
01815 {
01816   INT u, o, l;
01817   const C *gj;
01818   const R *psij;
01819   psij = psij_const;
01820 
01821   uo2(&u, &o, *xj, n, m);
01822 
01823   if (u < o)
01824   {
01825     for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
01826       (*fj) += (*psij++) * (*gj++);
01827   }
01828   else
01829   {
01830     for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
01831       (*fj) += (*psij++) * (*gj++);
01832     for (l = 0, gj = g; l <= o; l++)
01833       (*fj) += (*psij++) * (*gj++);
01834   }
01835 }
01836 
01837 #ifndef _OPENMP
01838 static void nfft_adjoint_1d_compute_serial(const C *fj, C *g,
01839     const R *psij_const, const R *xj, const INT n, const INT m)
01840 {
01841   INT u,o,l;
01842   C *gj;
01843   const R *psij;
01844   psij = psij_const;
01845 
01846   uo2(&u,&o,*xj, n, m);
01847 
01848   if (u < o)
01849   {
01850     for (l = 0, gj = g+u; l <= 2*m+1; l++)
01851       (*gj++) += (*psij++) * (*fj);
01852   }
01853   else
01854   {
01855     for (l = 0, gj = g+u; l < 2*m+1-o; l++)
01856       (*gj++) += (*psij++) * (*fj);
01857     for (l = 0, gj = g; l <= o; l++)
01858       (*gj++) += (*psij++) * (*fj);
01859   }
01860 }
01861 #endif
01862 
01863 #ifdef _OPENMP
01864 /* adjoint NFFT one-dimensional case with OpenMP atomic operations */
01865 static void nfft_adjoint_1d_compute_omp_atomic(const C f, C *g,
01866     const R *psij_const, const R *xj, const INT n, const INT m)
01867 {
01868   INT u,o,l;
01869   C *gj;
01870   INT index_temp[2*m+2];
01871 
01872   uo2(&u,&o,*xj, n, m);
01873 
01874   for (l=0; l<=2*m+1; l++)
01875     index_temp[l] = (l+u)%n;
01876 
01877   for (l = 0, gj = g+u; l <= 2*m+1; l++)
01878   {
01879     INT i = index_temp[l];
01880     C *lhs = g+i;
01881     R *lhs_real = (R*)lhs;
01882     C val = psij_const[l] * f;
01883     #pragma omp atomic
01884     lhs_real[0] += CREAL(val);
01885 
01886     #pragma omp atomic
01887     lhs_real[1] += CIMAG(val);
01888   }
01889 }
01890 #endif
01891 
01892 #ifdef _OPENMP
01893 
01908 static void nfft_adjoint_1d_compute_omp_blockwise(const C f, C *g,
01909     const R *psij_const, const R *xj, const INT n, const INT m,
01910     const INT my_u0, const INT my_o0)
01911 {
01912   INT ar_u,ar_o,l;
01913 
01914   uo2(&ar_u,&ar_o,*xj, n, m);
01915 
01916   if (ar_u < ar_o)
01917   {
01918     INT u = MAX(my_u0,ar_u);
01919     INT o = MIN(my_o0,ar_o);
01920     INT offset_psij = u-ar_u;
01921 #ifdef OMP_ASSERT
01922     assert(offset_psij >= 0);
01923     assert(o-u <= 2*m+1);
01924     assert(offset_psij+o-u <= 2*m+1);
01925 #endif
01926 
01927     for (l = 0; l <= o-u; l++)
01928       g[u+l] += psij_const[offset_psij+l] * f;
01929   }
01930   else
01931   {
01932     INT u = MAX(my_u0,ar_u);
01933     INT o = my_o0;
01934     INT offset_psij = u-ar_u;
01935 #ifdef OMP_ASSERT
01936     assert(offset_psij >= 0);
01937     assert(o-u <= 2*m+1);
01938     assert(offset_psij+o-u <= 2*m+1);
01939 #endif
01940 
01941     for (l = 0; l <= o-u; l++)
01942       g[u+l] += psij_const[offset_psij+l] * f;
01943 
01944     u = my_u0;
01945     o = MIN(my_o0,ar_o);
01946     offset_psij += my_u0-ar_u+n;
01947 
01948 #ifdef OMP_ASSERT
01949     if (u <= o)
01950     {
01951       assert(o-u <= 2*m+1);
01952       if (offset_psij+o-u > 2*m+1)
01953       {
01954         fprintf(stderr, "ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
01955       }
01956       assert(offset_psij+o-u <= 2*m+1);
01957     }
01958 #endif
01959     for (l = 0; l <= o-u; l++)
01960       g[u+l] += psij_const[offset_psij+l] * f;
01961   }
01962 }
01963 #endif
01964 
01965 static void nfft_trafo_1d_B(X(plan) *ths)
01966 {
01967   const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
01968   const C *g = (C*)ths->g;
01969 
01970   if (ths->flags & PRE_FULL_PSI)
01971   {
01972     INT k;
01973 #ifdef _OPENMP
01974     #pragma omp parallel for default(shared) private(k)
01975 #endif
01976     for (k = 0; k < M; k++)
01977     {
01978       INT l;
01979       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01980       ths->f[j] = K(0.0);
01981       for (l = 0; l < m2p2; l++)
01982         ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
01983     }
01984     return;
01985   } /* if(PRE_FULL_PSI) */
01986 
01987   if (ths->flags & PRE_PSI)
01988   {
01989     INT k;
01990 #ifdef _OPENMP
01991     #pragma omp parallel for default(shared) private(k)
01992 #endif
01993     for (k = 0; k < M; k++)
01994     {
01995       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
01996       nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
01997         &ths->x[j], n, m);
01998     }
01999     return;
02000   } /* if(PRE_PSI) */
02001 
02002   if (ths->flags & PRE_FG_PSI)
02003   {
02004     INT k;
02005     R fg_exp_l[m2p2];
02006 
02007     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02008 
02009 #ifdef _OPENMP
02010     #pragma omp parallel for default(shared) private(k)
02011 #endif
02012     for (k = 0; k < M; k++)
02013     {
02014       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02015       const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
02016       R fg_psij2 = K(1.0);
02017       R psij_const[m2p2];
02018       INT l;
02019 
02020       psij_const[0] = fg_psij0;
02021 
02022       for (l = 1; l < m2p2; l++)
02023       {
02024         fg_psij2 *= fg_psij1;
02025         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02026       }
02027 
02028       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02029     }
02030 
02031     return;
02032   } /* if(PRE_FG_PSI) */
02033 
02034   if (ths->flags & FG_PSI)
02035   {
02036     INT k;
02037     R fg_exp_l[m2p2];
02038 
02039     sort(ths);
02040 
02041     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02042 
02043 #ifdef _OPENMP
02044     #pragma omp parallel for default(shared) private(k)
02045 #endif
02046     for (k = 0; k < M; k++)
02047     {
02048       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02049       INT u, o, l;
02050       R fg_psij0, fg_psij1, fg_psij2;
02051       R psij_const[m2p2];
02052 
02053       uo(ths, (INT)j, &u, &o, (INT)0);
02054       fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
02055       fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
02056       fg_psij2  = K(1.0);
02057 
02058       psij_const[0] = fg_psij0;
02059 
02060       for (l = 1; l < m2p2; l++)
02061       {
02062         fg_psij2 *= fg_psij1;
02063         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02064       }
02065 
02066       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02067     }
02068     return;
02069   } /* if(FG_PSI) */
02070 
02071   if (ths->flags & PRE_LIN_PSI)
02072   {
02073     const INT K = ths->K, ip_s = K / (m + 2);
02074     INT k;
02075 
02076     sort(ths);
02077 
02078 #ifdef _OPENMP
02079     #pragma omp parallel for default(shared) private(k)
02080 #endif
02081     for (k = 0; k < M; k++)
02082     {
02083       INT u, o, l;
02084       R ip_y, ip_w;
02085       INT ip_u;
02086       R psij_const[m2p2];
02087       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02088 
02089       uo(ths, (INT)j, &u, &o, (INT)0);
02090 
02091       ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
02092       ip_u = (INT)(LRINT(FLOOR(ip_y)));
02093       ip_w = ip_y - (R)(ip_u);
02094 
02095       for (l = 0; l < m2p2; l++)
02096         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02097           + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02098 
02099       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02100     }
02101     return;
02102   } /* if(PRE_LIN_PSI) */
02103   else
02104   {
02105     /* no precomputed psi at all */
02106     INT k;
02107 
02108     sort(ths);
02109 
02110 #ifdef _OPENMP
02111     #pragma omp parallel for default(shared) private(k)
02112 #endif
02113     for (k = 0; k < M; k++)
02114     {
02115       R psij_const[m2p2];
02116       INT u, o, l;
02117       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02118 
02119       uo(ths, (INT)j, &u, &o, (INT)0);
02120 
02121       for (l = 0; l < m2p2; l++)
02122         psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
02123 
02124       nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
02125     }
02126   }
02127 }
02128 
02129 
02130 #ifdef OMP_ASSERT
02131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
02132 { \
02133           assert(ar_x[2*k] >= min_u_a || k == M-1); \
02134           if (k > 0) \
02135             assert(ar_x[2*k-2] < min_u_a); \
02136 }
02137 #else
02138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
02139 #endif
02140 
02141 #ifdef OMP_ASSERT
02142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
02143 { \
02144           assert(ar_x[2*k] >= min_u_b || k == M-1); \
02145           if (k > 0) \
02146             assert(ar_x[2*k-2] < min_u_b); \
02147 }
02148 #else
02149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
02150 #endif
02151 
02152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
02153 { \
02154             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
02155                 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
02156 }
02157 
02158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
02159 { \
02160             R psij_const[2 * m + 2]; \
02161             INT u, o, l; \
02162             R fg_psij0 = ths->psi[2 * j]; \
02163             R fg_psij1 = ths->psi[2 * j + 1]; \
02164             R fg_psij2 = K(1.0); \
02165  \
02166             psij_const[0] = fg_psij0; \
02167             for (l = 1; l <= 2 * m + 1; l++) \
02168             { \
02169               fg_psij2 *= fg_psij1; \
02170               psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
02171             } \
02172  \
02173             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02174                 ths->x + j, n, m, my_u0, my_o0); \
02175 }
02176 
02177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
02178 { \
02179             R psij_const[2 * m + 2]; \
02180             R fg_psij0, fg_psij1, fg_psij2; \
02181             INT u, o, l; \
02182  \
02183             uo(ths, j, &u, &o, (INT)0); \
02184             fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
02185             fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
02186             fg_psij2 = K(1.0); \
02187             psij_const[0] = fg_psij0; \
02188             for (l = 1; l <= 2 * m + 1; l++) \
02189             { \
02190               fg_psij2 *= fg_psij1; \
02191               psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
02192             } \
02193  \
02194             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02195                 ths->x + j, n, m, my_u0, my_o0); \
02196 }
02197 
02198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
02199 { \
02200             R psij_const[2 * m + 2]; \
02201             INT ip_u; \
02202             R ip_y, ip_w; \
02203             INT u, o, l; \
02204  \
02205             uo(ths, j, &u, &o, (INT)0); \
02206  \
02207             ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
02208             ip_u = LRINT(FLOOR(ip_y)); \
02209             ip_w = ip_y - ip_u; \
02210             for (l = 0; l < 2 * m + 2; l++) \
02211               psij_const[l] \
02212                   = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
02213                       + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
02214  \
02215             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02216                 ths->x + j, n, m, my_u0, my_o0); \
02217 }
02218 
02219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
02220 { \
02221             R psij_const[2 * m + 2]; \
02222             INT u, o, l; \
02223  \
02224             uo(ths, j, &u, &o, (INT)0); \
02225  \
02226             for (l = 0; l <= 2 * m + 1; l++) \
02227               psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
02228  \
02229             nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
02230                 ths->x + j, n, m, my_u0, my_o0); \
02231 }
02232 
02233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
02234 { \
02235     if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
02236     { \
02237       _Pragma("omp parallel private(k)") \
02238       { \
02239         INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
02240         INT *ar_x = ths->index_x; \
02241  \
02242         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
02243                                       &min_u_b, &max_u_b, 1, &n, m); \
02244  \
02245         if (min_u_a != -1) \
02246         { \
02247           k = index_x_binary_search(ar_x, M, min_u_a); \
02248  \
02249           MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
02250  \
02251           while (k < M) \
02252           { \
02253             INT u_prod = ar_x[2*k]; \
02254             INT j = ar_x[2*k+1]; \
02255  \
02256             if (u_prod < min_u_a || u_prod > max_u_a) \
02257               break; \
02258  \
02259             MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
02260  \
02261             k++; \
02262           } \
02263         } \
02264  \
02265         if (min_u_b != -1) \
02266         { \
02267           k = index_x_binary_search(ar_x, M, min_u_b); \
02268  \
02269           MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
02270  \
02271           while (k < M) \
02272           { \
02273             INT u_prod = ar_x[2*k]; \
02274             INT j = ar_x[2*k+1]; \
02275  \
02276             if (u_prod < min_u_b || u_prod > max_u_b) \
02277               break; \
02278  \
02279             MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
02280  \
02281             k++; \
02282           } \
02283         } \
02284       } /* omp parallel */ \
02285       return; \
02286     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
02287 }
02288 
02289 static void nfft_adjoint_1d_B(X(plan) *ths)
02290 {
02291   const INT n = ths->n[0], M = ths->M_total, m = ths->m;
02292   INT k;
02293   C *g = (C*)ths->g;
02294 
02295   memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
02296 
02297   if (ths->flags & PRE_FULL_PSI)
02298   {
02299     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
02300         (INT)1, ths->n, m, ths->flags, ths->index_x);
02301     return;
02302   } /* if(PRE_FULL_PSI) */
02303 
02304   if (ths->flags & PRE_PSI)
02305   {
02306 #ifdef _OPENMP
02307     MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
02308 #endif
02309 
02310 #ifdef _OPENMP
02311     #pragma omp parallel for default(shared) private(k)
02312 #endif
02313     for (k = 0; k < M; k++)
02314     {
02315       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02316 #ifdef _OPENMP
02317       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02318 #else
02319       nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
02320 #endif
02321     }
02322 
02323     return;
02324   } /* if(PRE_PSI) */
02325 
02326   if (ths->flags & PRE_FG_PSI)
02327   {
02328     R fg_exp_l[2 * m + 2];
02329 
02330     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02331 
02332 #ifdef _OPENMP
02333     MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
02334 #endif
02335 
02336 
02337 #ifdef _OPENMP
02338     #pragma omp parallel for default(shared) private(k)
02339 #endif
02340     for (k = 0; k < M; k++)
02341     {
02342       R psij_const[2 * m + 2];
02343       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02344       INT l;
02345       R fg_psij0 = ths->psi[2 * j];
02346       R fg_psij1 = ths->psi[2 * j + 1];
02347       R fg_psij2 = K(1.0);
02348 
02349       psij_const[0] = fg_psij0;
02350       for (l = 1; l <= 2 * m + 1; l++)
02351       {
02352         fg_psij2 *= fg_psij1;
02353         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02354       }
02355 
02356 #ifdef _OPENMP
02357       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02358 #else
02359       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02360 #endif
02361     }
02362 
02363     return;
02364   } /* if(PRE_FG_PSI) */
02365 
02366   if (ths->flags & FG_PSI)
02367   {
02368     R fg_exp_l[2 * m + 2];
02369 
02370     nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02371 
02372     sort(ths);
02373 
02374 #ifdef _OPENMP
02375     MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
02376 #endif
02377 
02378 #ifdef _OPENMP
02379     #pragma omp parallel for default(shared) private(k)
02380 #endif
02381     for (k = 0; k < M; k++)
02382     {
02383       INT u,o,l;
02384       R psij_const[2 * m + 2];
02385       R fg_psij0, fg_psij1, fg_psij2;
02386       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02387 
02388       uo(ths, j, &u, &o, (INT)0);
02389       fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
02390       fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
02391       fg_psij2 = K(1.0);
02392       psij_const[0] = fg_psij0;
02393       for (l = 1; l <= 2 * m + 1; l++)
02394       {
02395         fg_psij2 *= fg_psij1;
02396         psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
02397       }
02398 
02399 #ifdef _OPENMP
02400       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02401 #else
02402       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02403 #endif
02404     }
02405 
02406     return;
02407   } /* if(FG_PSI) */
02408 
02409   if (ths->flags & PRE_LIN_PSI)
02410   {
02411     const INT K = ths->K;
02412     const INT ip_s = K / (m + 2);
02413 
02414     sort(ths);
02415 
02416 #ifdef _OPENMP
02417     MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
02418 #endif
02419 
02420 #ifdef _OPENMP
02421     #pragma omp parallel for default(shared) private(k)
02422 #endif
02423     for (k = 0; k < M; k++)
02424     {
02425       INT u,o,l;
02426       INT ip_u;
02427       R ip_y, ip_w;
02428       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02429       R psij_const[2 * m + 2];
02430 
02431       uo(ths, j, &u, &o, (INT)0);
02432 
02433       ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
02434       ip_u = (INT)(LRINT(FLOOR(ip_y)));
02435       ip_w = ip_y - (R)(ip_u);
02436       for (l = 0; l < 2 * m + 2; l++)
02437         psij_const[l]
02438             = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
02439                 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
02440 
02441 #ifdef _OPENMP
02442       nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02443 #else
02444       nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02445 #endif
02446     }
02447     return;
02448   } /* if(PRE_LIN_PSI) */
02449 
02450   /* no precomputed psi at all */
02451   sort(ths);
02452 
02453 #ifdef _OPENMP
02454   MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
02455 #endif
02456 
02457 #ifdef _OPENMP
02458   #pragma omp parallel for default(shared) private(k)
02459 #endif
02460   for (k = 0; k < M; k++)
02461   {
02462     INT u,o,l;
02463     R psij_const[2 * m + 2];
02464     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02465 
02466     uo(ths, j, &u, &o, (INT)0);
02467 
02468     for (l = 0; l <= 2 * m + 1; l++)
02469       psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
02470 
02471 #ifdef _OPENMP
02472     nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
02473 #else
02474     nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
02475 #endif
02476   }
02477 }
02478 
02479 void X(trafo_1d)(X(plan) *ths)
02480 {
02481   const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
02482   C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
02483 
02484   ths->g_hat = ths->g1;
02485   ths->g = ths->g2;
02486 
02487   {
02488     C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
02489     R *c_phi_inv1, *c_phi_inv2;
02490 
02491     TIC(0)
02492 #ifdef _OPENMP
02493     {
02494       INT k;
02495       #pragma omp parallel for default(shared) private(k)
02496       for (k = 0; k < ths->n_total; k++)
02497         ths->g_hat[k] = 0.0;
02498     }
02499 #else
02500     memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
02501 #endif
02502     if(ths->flags & PRE_PHI_HUT)
02503     {
02504       INT k;
02505       c_phi_inv1 = ths->c_phi_inv[0];
02506       c_phi_inv2 = &ths->c_phi_inv[0][N2];
02507 
02508 #ifdef _OPENMP
02509       #pragma omp parallel for default(shared) private(k)
02510 #endif
02511       for (k = 0; k < N2; k++)
02512       {
02513         g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
02514         g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
02515       }
02516     }
02517     else
02518     {
02519       INT k;
02520 #ifdef _OPENMP
02521       #pragma omp parallel for default(shared) private(k)
02522 #endif
02523       for (k = 0; k < N2; k++)
02524       {
02525         g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
02526         g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
02527       }
02528     }
02529     TOC(0)
02530 
02531     TIC_FFTW(1)
02532     FFTW(execute)(ths->my_fftw_plan1);
02533     TOC_FFTW(1);
02534 
02535     TIC(2);
02536     nfft_trafo_1d_B(ths);
02537     TOC(2);
02538   }
02539 }
02540 
02541 void X(adjoint_1d)(X(plan) *ths)
02542 {
02543   INT n,N;
02544   C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
02545   R *c_phi_inv1, *c_phi_inv2;
02546 
02547   N=ths->N[0];
02548   n=ths->n[0];
02549 
02550   ths->g_hat=ths->g1;
02551   ths->g=ths->g2;
02552 
02553   f_hat1=(C*)ths->f_hat;
02554   f_hat2=(C*)&ths->f_hat[N/2];
02555   g_hat1=(C*)&ths->g_hat[n-N/2];
02556   g_hat2=(C*)ths->g_hat;
02557 
02558   TIC(2)
02559   nfft_adjoint_1d_B(ths);
02560   TOC(2)
02561 
02562   TIC_FFTW(1)
02563   FFTW(execute)(ths->my_fftw_plan2);
02564   TOC_FFTW(1);
02565 
02566   TIC(0)
02567   if(ths->flags & PRE_PHI_HUT)
02568   {
02569     INT k;
02570     c_phi_inv1=ths->c_phi_inv[0];
02571     c_phi_inv2=&ths->c_phi_inv[0][N/2];
02572 
02573 #ifdef _OPENMP
02574     #pragma omp parallel for default(shared) private(k)
02575 #endif
02576     for (k = 0; k < N/2; k++)
02577     {
02578       f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
02579       f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
02580     }
02581   }
02582   else
02583   {
02584     INT k;
02585 
02586 #ifdef _OPENMP
02587     #pragma omp parallel for default(shared) private(k)
02588 #endif
02589     for (k = 0; k < N/2; k++)
02590     {
02591       f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
02592       f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
02593     }
02594   }
02595   TOC(0)
02596 }
02597 
02598 
02599 /* ################################################ SPECIFIC VERSIONS FOR d=2 */
02600 
02601 static void nfft_2d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
02602 {
02603   INT l;
02604   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
02605 
02606   fg_exp_b0 = EXP(K(-1.0)/b);
02607   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
02608   fg_exp_b1 = K(1.0);
02609   fg_exp_b2 = K(1.0);
02610   fg_exp_l[0] = K(1.0);
02611   for(l=1; l <= 2*m+1; l++)
02612     {
02613       fg_exp_b2 = fg_exp_b1*fg_exp_b0;
02614       fg_exp_b1 *= fg_exp_b0_sq;
02615       fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
02616     }
02617 }
02618 
02619 static void nfft_trafo_2d_compute(C *fj, const C *g, const R *psij_const0,
02620     const R *psij_const1, const R *xj0, const R *xj1, const INT n0,
02621     const INT n1, const INT m)
02622 {
02623   INT u0,o0,l0,u1,o1,l1;
02624   const C *gj;
02625   const R *psij0,*psij1;
02626 
02627   psij0=psij_const0;
02628   psij1=psij_const1;
02629 
02630   uo2(&u0,&o0,*xj0, n0, m);
02631   uo2(&u1,&o1,*xj1, n1, m);
02632 
02633   *fj=0;
02634 
02635   if (u0 < o0)
02636       if(u1 < o1)
02637     for(l0=0; l0<=2*m+1; l0++,psij0++)
02638     {
02639         psij1=psij_const1;
02640         gj=g+(u0+l0)*n1+u1;
02641         for(l1=0; l1<=2*m+1; l1++)
02642       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02643     }
02644       else
02645     for(l0=0; l0<=2*m+1; l0++,psij0++)
02646     {
02647         psij1=psij_const1;
02648         gj=g+(u0+l0)*n1+u1;
02649         for(l1=0; l1<2*m+1-o1; l1++)
02650       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02651         gj=g+(u0+l0)*n1;
02652         for(l1=0; l1<=o1; l1++)
02653       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02654     }
02655   else
02656       if(u1<o1)
02657       {
02658     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02659     {
02660         psij1=psij_const1;
02661         gj=g+(u0+l0)*n1+u1;
02662         for(l1=0; l1<=2*m+1; l1++)
02663       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02664     }
02665     for(l0=0; l0<=o0; l0++,psij0++)
02666     {
02667         psij1=psij_const1;
02668         gj=g+l0*n1+u1;
02669         for(l1=0; l1<=2*m+1; l1++)
02670       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02671     }
02672       }
02673       else
02674       {
02675     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02676     {
02677         psij1=psij_const1;
02678         gj=g+(u0+l0)*n1+u1;
02679         for(l1=0; l1<2*m+1-o1; l1++)
02680       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02681         gj=g+(u0+l0)*n1;
02682         for(l1=0; l1<=o1; l1++)
02683       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02684     }
02685     for(l0=0; l0<=o0; l0++,psij0++)
02686     {
02687         psij1=psij_const1;
02688         gj=g+l0*n1+u1;
02689         for(l1=0; l1<2*m+1-o1; l1++)
02690       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02691         gj=g+l0*n1;
02692         for(l1=0; l1<=o1; l1++)
02693       (*fj) += (*psij0) * (*psij1++) * (*gj++);
02694     }
02695       }
02696 }
02697 
02698 #ifdef _OPENMP
02699 /* adjoint NFFT two-dimensional case with OpenMP atomic operations */
02700 static void nfft_adjoint_2d_compute_omp_atomic(const C f, C *g,
02701             const R *psij_const0, const R *psij_const1, const R *xj0,
02702             const R *xj1, const INT n0, const INT n1, const INT m)
02703 {
02704   INT u0,o0,l0,u1,o1,l1;
02705   const INT lprod = (2*m+2) * (2*m+2);
02706 
02707   INT index_temp0[2*m+2];
02708   INT index_temp1[2*m+2];
02709 
02710   uo2(&u0,&o0,*xj0, n0, m);
02711   uo2(&u1,&o1,*xj1, n1, m);
02712 
02713   for (l0=0; l0<=2*m+1; l0++)
02714     index_temp0[l0] = (u0+l0)%n0;
02715 
02716   for (l1=0; l1<=2*m+1; l1++)
02717     index_temp1[l1] = (u1+l1)%n1;
02718 
02719   for(l0=0; l0<=2*m+1; l0++)
02720   {
02721     for(l1=0; l1<=2*m+1; l1++)
02722     {
02723       INT i = index_temp0[l0] * n1 + index_temp1[l1];
02724       C *lhs = g+i;
02725       R *lhs_real = (R*)lhs;
02726       C val = psij_const0[l0] * psij_const1[l1] * f;
02727 
02728       #pragma omp atomic
02729       lhs_real[0] += CREAL(val);
02730 
02731       #pragma omp atomic
02732       lhs_real[1] += CIMAG(val);
02733     }
02734   }
02735 }
02736 #endif
02737 
02738 #ifdef _OPENMP
02739 
02757 static void nfft_adjoint_2d_compute_omp_blockwise(const C f, C *g,
02758             const R *psij_const0, const R *psij_const1, const R *xj0,
02759             const R *xj1, const INT n0, const INT n1, const INT m,
02760             const INT my_u0, const INT my_o0)
02761 {
02762   INT ar_u0,ar_o0,l0,u1,o1,l1;
02763   const INT lprod = (2*m+2) * (2*m+2);
02764   INT index_temp1[2*m+2];
02765 
02766   uo2(&ar_u0,&ar_o0,*xj0, n0, m);
02767   uo2(&u1,&o1,*xj1, n1, m);
02768 
02769   for (l1 = 0; l1 <= 2*m+1; l1++)
02770     index_temp1[l1] = (u1+l1)%n1;
02771 
02772   if(ar_u0 < ar_o0)
02773   {
02774     INT u0 = MAX(my_u0,ar_u0);
02775     INT o0 = MIN(my_o0,ar_o0);
02776     INT offset_psij = u0-ar_u0;
02777 #ifdef OMP_ASSERT
02778     assert(offset_psij >= 0);
02779     assert(o0-u0 <= 2*m+1);
02780     assert(offset_psij+o0-u0 <= 2*m+1);
02781 #endif
02782 
02783     for (l0 = 0; l0 <= o0-u0; l0++)
02784     {
02785       INT i0 = (u0+l0) * n1;
02786       const C val0 = psij_const0[offset_psij+l0];
02787 
02788       for(l1=0; l1<=2*m+1; l1++)
02789         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02790     }
02791   }
02792   else
02793   {
02794     INT u0 = MAX(my_u0,ar_u0);
02795     INT o0 = my_o0;
02796     INT offset_psij = u0-ar_u0;
02797 #ifdef OMP_ASSERT
02798     assert(offset_psij >= 0);
02799     assert(o0-u0 <= 2*m+1);
02800     assert(offset_psij+o0-u0 <= 2*m+1);
02801 #endif
02802 
02803     for (l0 = 0; l0 <= o0-u0; l0++)
02804     {
02805       INT i0 = (u0+l0) * n1;
02806       const C val0 = psij_const0[offset_psij+l0];
02807 
02808       for(l1=0; l1<=2*m+1; l1++)
02809         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02810     }
02811 
02812     u0 = my_u0;
02813     o0 = MIN(my_o0,ar_o0);
02814     offset_psij += my_u0-ar_u0+n0;
02815 
02816 #ifdef OMP_ASSERT
02817     if (u0<=o0)
02818     {
02819       assert(o0-u0 <= 2*m+1);
02820       assert(offset_psij+o0-u0 <= 2*m+1);
02821     }
02822 #endif
02823 
02824     for (l0 = 0; l0 <= o0-u0; l0++)
02825     {
02826       INT i0 = (u0+l0) * n1;
02827       const C val0 = psij_const0[offset_psij+l0];
02828 
02829       for(l1=0; l1<=2*m+1; l1++)
02830         g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
02831     }
02832   }
02833 }
02834 #endif
02835 
02836 #ifndef _OPENMP
02837 static void nfft_adjoint_2d_compute_serial(const C *fj, C *g,
02838             const R *psij_const0, const R *psij_const1, const R *xj0,
02839             const R *xj1, const INT n0, const INT n1, const INT m)
02840 {
02841   INT u0,o0,l0,u1,o1,l1;
02842   C *gj;
02843   const R *psij0,*psij1;
02844 
02845   psij0=psij_const0;
02846   psij1=psij_const1;
02847 
02848   uo2(&u0,&o0,*xj0, n0, m);
02849   uo2(&u1,&o1,*xj1, n1, m);
02850 
02851   if(u0<o0)
02852       if(u1<o1)
02853     for(l0=0; l0<=2*m+1; l0++,psij0++)
02854     {
02855         psij1=psij_const1;
02856         gj=g+(u0+l0)*n1+u1;
02857         for(l1=0; l1<=2*m+1; l1++)
02858     (*gj++) += (*psij0) * (*psij1++) * (*fj);
02859     }
02860       else
02861     for(l0=0; l0<=2*m+1; l0++,psij0++)
02862     {
02863         psij1=psij_const1;
02864         gj=g+(u0+l0)*n1+u1;
02865         for(l1=0; l1<2*m+1-o1; l1++)
02866       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02867         gj=g+(u0+l0)*n1;
02868         for(l1=0; l1<=o1; l1++)
02869       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02870     }
02871   else
02872       if(u1<o1)
02873       {
02874     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02875     {
02876         psij1=psij_const1;
02877         gj=g+(u0+l0)*n1+u1;
02878         for(l1=0; l1<=2*m+1; l1++)
02879       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02880     }
02881     for(l0=0; l0<=o0; l0++,psij0++)
02882     {
02883         psij1=psij_const1;
02884         gj=g+l0*n1+u1;
02885         for(l1=0; l1<=2*m+1; l1++)
02886       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02887     }
02888       }
02889       else
02890       {
02891     for(l0=0; l0<2*m+1-o0; l0++,psij0++)
02892     {
02893         psij1=psij_const1;
02894         gj=g+(u0+l0)*n1+u1;
02895         for(l1=0; l1<2*m+1-o1; l1++)
02896       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02897         gj=g+(u0+l0)*n1;
02898         for(l1=0; l1<=o1; l1++)
02899       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02900     }
02901     for(l0=0; l0<=o0; l0++,psij0++)
02902     {
02903         psij1=psij_const1;
02904         gj=g+l0*n1+u1;
02905         for(l1=0; l1<2*m+1-o1; l1++)
02906       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02907         gj=g+l0*n1;
02908         for(l1=0; l1<=o1; l1++)
02909       (*gj++) += (*psij0) * (*psij1++) * (*fj);
02910     }
02911       }
02912 }
02913 #endif
02914 
02915 static void nfft_trafo_2d_B(X(plan) *ths)
02916 {
02917   const C *g = (C*)ths->g;
02918   const INT n0 = ths->n[0];
02919   const INT n1 = ths->n[1];
02920   const INT M = ths->M_total;
02921   const INT m = ths->m;
02922 
02923   INT k;
02924 
02925   if(ths->flags & PRE_FULL_PSI)
02926   {
02927     const INT lprod = (2*m+2) * (2*m+2);
02928 #ifdef _OPENMP
02929     #pragma omp parallel for default(shared) private(k)
02930 #endif
02931     for (k = 0; k < M; k++)
02932     {
02933       INT l;
02934       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02935       ths->f[j] = K(0.0);
02936       for (l = 0; l < lprod; l++)
02937         ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
02938     }
02939     return;
02940   } /* if(PRE_FULL_PSI) */
02941 
02942   if(ths->flags & PRE_PSI)
02943   {
02944 #ifdef _OPENMP
02945     #pragma omp parallel for default(shared) private(k)
02946 #endif
02947     for (k = 0; k < M; k++)
02948     {
02949       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02950       nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
02951     }
02952 
02953       return;
02954   } /* if(PRE_PSI) */
02955 
02956   if(ths->flags & PRE_FG_PSI)
02957   {
02958     R fg_exp_l[2*(2*m+2)];
02959 
02960     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
02961     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
02962 
02963 #ifdef _OPENMP
02964     #pragma omp parallel for default(shared) private(k)
02965 #endif
02966     for (k = 0; k < M; k++)
02967     {
02968       R psij_const[2*(2*m+2)];
02969       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
02970       INT l;
02971       R fg_psij0 = ths->psi[2*j*2];
02972       R fg_psij1 = ths->psi[2*j*2+1];
02973       R fg_psij2 = K(1.0);
02974 
02975       psij_const[0] = fg_psij0;
02976       for (l = 1; l <= 2*m+1; l++)
02977       {
02978         fg_psij2 *= fg_psij1;
02979         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
02980       }
02981 
02982       fg_psij0 = ths->psi[2*(j*2+1)];
02983       fg_psij1 = ths->psi[2*(j*2+1)+1];
02984       fg_psij2 = K(1.0);
02985       psij_const[2*m+2] = fg_psij0;
02986       for (l = 1; l <= 2*m+1; l++)
02987       {
02988         fg_psij2 *= fg_psij1;
02989         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
02990       }
02991 
02992       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
02993     }
02994 
02995     return;
02996   } /* if(PRE_FG_PSI) */
02997 
02998   if(ths->flags & FG_PSI)
02999   {
03000     R fg_exp_l[2*(2*m+2)];
03001 
03002     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03003     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03004 
03005     sort(ths);
03006 
03007 #ifdef _OPENMP
03008     #pragma omp parallel for default(shared) private(k)
03009 #endif
03010     for (k = 0; k < M; k++)
03011     {
03012       INT u, o, l;
03013       R fg_psij0, fg_psij1, fg_psij2;
03014       R psij_const[2*(2*m+2)];
03015       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03016 
03017       uo(ths, j, &u, &o, (INT)0);
03018       fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
03019       fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
03020       fg_psij2 = K(1.0);
03021       psij_const[0] = fg_psij0;
03022       for (l = 1; l <= 2*m+1; l++)
03023       {
03024         fg_psij2 *= fg_psij1;
03025         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03026       }
03027 
03028       uo(ths,j,&u,&o, (INT)1);
03029       fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
03030       fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
03031       fg_psij2 = K(1.0);
03032       psij_const[2*m+2] = fg_psij0;
03033       for(l=1; l<=2*m+1; l++)
03034       {
03035         fg_psij2 *= fg_psij1;
03036         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03037       }
03038 
03039       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03040     }
03041 
03042     return;
03043   } /* if(FG_PSI) */
03044 
03045   if(ths->flags & PRE_LIN_PSI)
03046   {
03047     const INT K = ths->K, ip_s = K / (m + 2);
03048 
03049     sort(ths);
03050 
03051 #ifdef _OPENMP
03052     #pragma omp parallel for default(shared) private(k)
03053 #endif
03054     for (k = 0; k < M; k++)
03055     {
03056       INT u, o, l;
03057       R ip_y, ip_w;
03058       INT ip_u;
03059       R psij_const[2*(2*m+2)];
03060       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03061 
03062       uo(ths,j,&u,&o,(INT)0);
03063       ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
03064       ip_u = (INT)LRINT(FLOOR(ip_y));
03065       ip_w = ip_y - (R)(ip_u);
03066       for (l = 0; l < 2*m+2; l++)
03067         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03068 
03069       uo(ths,j,&u,&o,(INT)1);
03070       ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
03071       ip_u = (INT)(LRINT(FLOOR(ip_y)));
03072       ip_w = ip_y - (R)(ip_u);
03073       for (l = 0; l < 2*m+2; l++)
03074         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03075 
03076       nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03077     }
03078       return;
03079   } /* if(PRE_LIN_PSI) */
03080 
03081   /* no precomputed psi at all */
03082 
03083   sort(ths);
03084 
03085 #ifdef _OPENMP
03086   #pragma omp parallel for default(shared) private(k)
03087 #endif
03088   for (k = 0; k < M; k++)
03089   {
03090     R psij_const[2*(2*m+2)];
03091     INT u, o, l;
03092     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03093 
03094     uo(ths,j,&u,&o,(INT)0);
03095     for (l = 0; l <= 2*m+1; l++)
03096       psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
03097 
03098     uo(ths,j,&u,&o,(INT)1);
03099     for (l = 0; l <= 2*m+1; l++)
03100       psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
03101 
03102     nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03103   }
03104 }
03105 
03106 #ifdef OMP_ASSERT
03107 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
03108 { \
03109           assert(ar_x[2*k] >= min_u_a || k == M-1); \
03110           if (k > 0) \
03111             assert(ar_x[2*k-2] < min_u_a); \
03112 }
03113 #else
03114 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
03115 #endif
03116 
03117 #ifdef OMP_ASSERT
03118 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
03119 { \
03120           assert(ar_x[2*k] >= min_u_b || k == M-1); \
03121           if (k > 0) \
03122             assert(ar_x[2*k-2] < min_u_b); \
03123 }
03124 #else
03125 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
03126 #endif
03127 
03128 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
03129             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03130                 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
03131                 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
03132 
03133 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
03134 { \
03135             R psij_const[2*(2*m+2)]; \
03136             INT u, o, l; \
03137             R fg_psij0 = ths->psi[2*j*2]; \
03138             R fg_psij1 = ths->psi[2*j*2+1]; \
03139             R fg_psij2 = K(1.0); \
03140  \
03141             psij_const[0] = fg_psij0; \
03142             for(l=1; l<=2*m+1; l++) \
03143             { \
03144               fg_psij2 *= fg_psij1; \
03145               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
03146             } \
03147  \
03148             fg_psij0 = ths->psi[2*(j*2+1)]; \
03149             fg_psij1 = ths->psi[2*(j*2+1)+1]; \
03150             fg_psij2 = K(1.0); \
03151             psij_const[2*m+2] = fg_psij0; \
03152             for(l=1; l<=2*m+1; l++) \
03153             { \
03154               fg_psij2 *= fg_psij1; \
03155               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
03156             } \
03157  \
03158             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03159                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03160                 n0, n1, m, my_u0, my_o0); \
03161 }
03162 
03163 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
03164 { \
03165             R psij_const[2*(2*m+2)]; \
03166             R fg_psij0, fg_psij1, fg_psij2; \
03167             INT u, o, l; \
03168  \
03169             uo(ths,j,&u,&o,(INT)0); \
03170             fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
03171             fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
03172             fg_psij2 = K(1.0); \
03173             psij_const[0] = fg_psij0; \
03174             for(l=1; l<=2*m+1; l++) \
03175             { \
03176               fg_psij2 *= fg_psij1; \
03177               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
03178             } \
03179  \
03180             uo(ths,j,&u,&o,(INT)1); \
03181             fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
03182             fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
03183             fg_psij2 = K(1.0); \
03184             psij_const[2*m+2] = fg_psij0; \
03185             for(l=1; l<=2*m+1; l++) \
03186             { \
03187               fg_psij2 *= fg_psij1; \
03188               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
03189             } \
03190  \
03191             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03192                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03193                 n0, n1, m, my_u0, my_o0); \
03194 }
03195 
03196 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
03197 { \
03198             R psij_const[2*(2*m+2)]; \
03199             INT u, o, l; \
03200             INT ip_u; \
03201             R ip_y, ip_w; \
03202  \
03203             uo(ths,j,&u,&o,(INT)0); \
03204             ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
03205             ip_u = LRINT(FLOOR(ip_y)); \
03206             ip_w = ip_y-ip_u; \
03207             for(l=0; l < 2*m+2; l++) \
03208               psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
03209                 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
03210  \
03211             uo(ths,j,&u,&o,(INT)1); \
03212             ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
03213             ip_u = LRINT(FLOOR(ip_y)); \
03214             ip_w = ip_y-ip_u; \
03215             for(l=0; l < 2*m+2; l++) \
03216               psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
03217                 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
03218  \
03219             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03220                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03221                 n0, n1, m, my_u0, my_o0); \
03222 }
03223 
03224 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
03225 { \
03226             R psij_const[2*(2*m+2)]; \
03227             INT u, o, l; \
03228  \
03229             uo(ths,j,&u,&o,(INT)0); \
03230             for(l=0;l<=2*m+1;l++) \
03231               psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
03232  \
03233             uo(ths,j,&u,&o,(INT)1); \
03234             for(l=0;l<=2*m+1;l++) \
03235               psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
03236  \
03237             nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
03238                 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
03239                 n0, n1, m, my_u0, my_o0); \
03240 }
03241 
03242 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
03243 { \
03244     if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
03245     { \
03246       _Pragma("omp parallel private(k)") \
03247       { \
03248         INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
03249         INT *ar_x = ths->index_x; \
03250  \
03251         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
03252             &min_u_b, &max_u_b, 2, ths->n, m); \
03253  \
03254         if (min_u_a != -1) \
03255         { \
03256           k = index_x_binary_search(ar_x, M, min_u_a); \
03257  \
03258           MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
03259  \
03260           while (k < M) \
03261           { \
03262             INT u_prod = ar_x[2*k]; \
03263             INT j = ar_x[2*k+1]; \
03264  \
03265             if (u_prod < min_u_a || u_prod > max_u_a) \
03266               break; \
03267  \
03268             MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
03269  \
03270             k++; \
03271           } \
03272         } \
03273  \
03274         if (min_u_b != -1) \
03275         { \
03276           INT k = index_x_binary_search(ar_x, M, min_u_b); \
03277  \
03278           MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
03279  \
03280           while (k < M) \
03281           { \
03282             INT u_prod = ar_x[2*k]; \
03283             INT j = ar_x[2*k+1]; \
03284  \
03285             if (u_prod < min_u_b || u_prod > max_u_b) \
03286               break; \
03287  \
03288             MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
03289  \
03290             k++; \
03291           } \
03292         } \
03293       } /* omp parallel */ \
03294       return; \
03295     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
03296 }
03297 
03298 
03299 static void nfft_adjoint_2d_B(X(plan) *ths)
03300 {
03301   const INT n0 = ths->n[0];
03302   const INT n1 = ths->n[1];
03303   const INT M = ths->M_total;
03304   const INT m = ths->m;
03305   C* g = (C*) ths->g;
03306   INT k;
03307 
03308   memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
03309 
03310   if(ths->flags & PRE_FULL_PSI)
03311   {
03312     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
03313         (INT)2, ths->n, m, ths->flags, ths->index_x);
03314     return;
03315   } /* if(PRE_FULL_PSI) */
03316 
03317   if(ths->flags & PRE_PSI)
03318   {
03319 #ifdef _OPENMP
03320     MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
03321 #endif
03322 
03323 #ifdef _OPENMP
03324     #pragma omp parallel for default(shared) private(k)
03325 #endif
03326     for (k = 0; k < M; k++)
03327     {
03328       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03329 #ifdef _OPENMP
03330       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03331 #else
03332       nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03333 #endif
03334     }
03335     return;
03336   } /* if(PRE_PSI) */
03337 
03338   if(ths->flags & PRE_FG_PSI)
03339   {
03340     R fg_exp_l[2*(2*m+2)];
03341 
03342     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03343     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03344 
03345 #ifdef _OPENMP
03346     MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
03347 #endif
03348 
03349 
03350 #ifdef _OPENMP
03351     #pragma omp parallel for default(shared) private(k)
03352 #endif
03353     for (k = 0; k < M; k++)
03354     {
03355       R psij_const[2*(2*m+2)];
03356       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03357       INT l;
03358       R fg_psij0 = ths->psi[2*j*2];
03359       R fg_psij1 = ths->psi[2*j*2+1];
03360       R fg_psij2 = K(1.0);
03361 
03362       psij_const[0] = fg_psij0;
03363       for(l=1; l<=2*m+1; l++)
03364       {
03365         fg_psij2 *= fg_psij1;
03366         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03367       }
03368 
03369       fg_psij0 = ths->psi[2*(j*2+1)];
03370       fg_psij1 = ths->psi[2*(j*2+1)+1];
03371       fg_psij2 = K(1.0);
03372       psij_const[2*m+2] = fg_psij0;
03373       for(l=1; l<=2*m+1; l++)
03374       {
03375         fg_psij2 *= fg_psij1;
03376         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03377       }
03378 
03379 #ifdef _OPENMP
03380       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03381 #else
03382       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03383 #endif
03384     }
03385 
03386     return;
03387   } /* if(PRE_FG_PSI) */
03388 
03389   if(ths->flags & FG_PSI)
03390   {
03391     R fg_exp_l[2*(2*m+2)];
03392 
03393     nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
03394     nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
03395 
03396     sort(ths);
03397 
03398 #ifdef _OPENMP
03399     MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
03400 #endif
03401 
03402 #ifdef _OPENMP
03403     #pragma omp parallel for default(shared) private(k)
03404 #endif
03405     for (k = 0; k < M; k++)
03406     {
03407       INT u, o, l;
03408       R fg_psij0, fg_psij1, fg_psij2;
03409       R psij_const[2*(2*m+2)];
03410       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03411 
03412       uo(ths,j,&u,&o,(INT)0);
03413       fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
03414       fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
03415       fg_psij2 = K(1.0);
03416       psij_const[0] = fg_psij0;
03417       for(l=1; l<=2*m+1; l++)
03418       {
03419         fg_psij2 *= fg_psij1;
03420         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
03421       }
03422 
03423       uo(ths,j,&u,&o,(INT)1);
03424       fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
03425       fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
03426       fg_psij2 = K(1.0);
03427       psij_const[2*m+2] = fg_psij0;
03428       for(l=1; l<=2*m+1; l++)
03429       {
03430         fg_psij2 *= fg_psij1;
03431         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
03432       }
03433 
03434 #ifdef _OPENMP
03435       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03436 #else
03437       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03438 #endif
03439     }
03440 
03441     return;
03442   } /* if(FG_PSI) */
03443 
03444   if(ths->flags & PRE_LIN_PSI)
03445   {
03446     const INT K = ths->K;
03447     const INT ip_s = K / (m + 2);
03448 
03449     sort(ths);
03450 
03451 #ifdef _OPENMP
03452     MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
03453 #endif
03454 
03455 #ifdef _OPENMP
03456     #pragma omp parallel for default(shared) private(k)
03457 #endif
03458     for (k = 0; k < M; k++)
03459     {
03460       INT u,o,l;
03461       INT ip_u;
03462       R ip_y, ip_w;
03463       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03464       R psij_const[2*(2*m+2)];
03465 
03466       uo(ths,j,&u,&o,(INT)0);
03467       ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
03468       ip_u = (INT)(LRINT(FLOOR(ip_y)));
03469       ip_w = ip_y - (R)(ip_u);
03470       for(l=0; l < 2*m+2; l++)
03471         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03472           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
03473 
03474       uo(ths,j,&u,&o,(INT)1);
03475       ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
03476       ip_u = (INT)(LRINT(FLOOR(ip_y)));
03477       ip_w = ip_y - (R)(ip_u);
03478       for(l=0; l < 2*m+2; l++)
03479         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
03480           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
03481 
03482 #ifdef _OPENMP
03483       nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03484 #else
03485       nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03486 #endif
03487   }
03488       return;
03489     } /* if(PRE_LIN_PSI) */
03490 
03491   /* no precomputed psi at all */
03492   sort(ths);
03493 
03494 #ifdef _OPENMP
03495   MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
03496 #endif
03497 
03498 #ifdef _OPENMP
03499   #pragma omp parallel for default(shared) private(k)
03500 #endif
03501   for (k = 0; k < M; k++)
03502   {
03503     INT u,o,l;
03504     R psij_const[2*(2*m+2)];
03505     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
03506 
03507     uo(ths,j,&u,&o,(INT)0);
03508     for(l=0;l<=2*m+1;l++)
03509       psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
03510 
03511     uo(ths,j,&u,&o,(INT)1);
03512     for(l=0;l<=2*m+1;l++)
03513       psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
03514 
03515 #ifdef _OPENMP
03516     nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03517 #else
03518     nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
03519 #endif
03520   }
03521 }
03522 
03523 
03524 void X(trafo_2d)(X(plan) *ths)
03525 {
03526   INT k0,k1,n0,n1,N0,N1;
03527   C *g_hat,*f_hat;
03528   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03529   R ck01, ck02, ck11, ck12;
03530   C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03531 
03532   ths->g_hat=ths->g1;
03533   ths->g=ths->g2;
03534 
03535   N0=ths->N[0];
03536   N1=ths->N[1];
03537   n0=ths->n[0];
03538   n1=ths->n[1];
03539 
03540   f_hat=(C*)ths->f_hat;
03541   g_hat=(C*)ths->g_hat;
03542 
03543   TIC(0)
03544 #ifdef _OPENMP
03545   #pragma omp parallel for default(shared) private(k0)
03546   for (k0 = 0; k0 < ths->n_total; k0++)
03547     ths->g_hat[k0] = 0.0;
03548 #else
03549   memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
03550 #endif
03551   if(ths->flags & PRE_PHI_HUT)
03552     {
03553       c_phi_inv01=ths->c_phi_inv[0];
03554       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03555 
03556 #ifdef _OPENMP
03557       #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03558 #endif
03559       for(k0=0;k0<N0/2;k0++)
03560       {
03561         ck01=c_phi_inv01[k0];
03562         ck02=c_phi_inv02[k0];
03563 
03564         c_phi_inv11=ths->c_phi_inv[1];
03565         c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03566 
03567         g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03568         f_hat11=f_hat + k0*N1;
03569         g_hat21=g_hat + k0*n1+n1-(N1/2);
03570         f_hat21=f_hat + ((N0/2)+k0)*N1;
03571         g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03572         f_hat12=f_hat + k0*N1+(N1/2);
03573         g_hat22=g_hat + k0*n1;
03574         f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03575 
03576         for(k1=0;k1<N1/2;k1++)
03577         {
03578           ck11=c_phi_inv11[k1];
03579           ck12=c_phi_inv12[k1];
03580 
03581           g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
03582           g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
03583           g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
03584           g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
03585         }
03586       }
03587     }
03588   else
03589 #ifdef _OPENMP
03590     #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03591 #endif
03592     for(k0=0;k0<N0/2;k0++)
03593       {
03594   ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
03595   ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
03596   for(k1=0;k1<N1/2;k1++)
03597     {
03598       ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
03599       ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
03600       g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1]             * ck01 * ck11;
03601       g_hat[k0*n1+n1-N1/2+k1]           = f_hat[(N0/2+k0)*N1+k1]      * ck02 * ck11;
03602       g_hat[(n0-N0/2+k0)*n1+k1]         = f_hat[k0*N1+N1/2+k1]        * ck01 * ck12;
03603       g_hat[k0*n1+k1]                   = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
03604     }
03605       }
03606 
03607   TOC(0)
03608 
03609   TIC_FFTW(1)
03610   FFTW(execute)(ths->my_fftw_plan1);
03611   TOC_FFTW(1);
03612 
03613   TIC(2);
03614   nfft_trafo_2d_B(ths);
03615   TOC(2);
03616 }
03617 
03618 void X(adjoint_2d)(X(plan) *ths)
03619 {
03620   INT k0,k1,n0,n1,N0,N1;
03621   C *g_hat,*f_hat;
03622   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
03623   R ck01, ck02, ck11, ck12;
03624   C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
03625 
03626   ths->g_hat=ths->g1;
03627   ths->g=ths->g2;
03628 
03629   N0=ths->N[0];
03630   N1=ths->N[1];
03631   n0=ths->n[0];
03632   n1=ths->n[1];
03633 
03634   f_hat=(C*)ths->f_hat;
03635   g_hat=(C*)ths->g_hat;
03636 
03637   TIC(2);
03638   nfft_adjoint_2d_B(ths);
03639   TOC(2);
03640 
03641   TIC_FFTW(1)
03642   FFTW(execute)(ths->my_fftw_plan2);
03643   TOC_FFTW(1);
03644 
03645   TIC(0)
03646   if(ths->flags & PRE_PHI_HUT)
03647     {
03648       c_phi_inv01=ths->c_phi_inv[0];
03649       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
03650 
03651 #ifdef _OPENMP
03652       #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
03653 #endif
03654       for(k0=0;k0<N0/2;k0++)
03655       {
03656         ck01=c_phi_inv01[k0];
03657         ck02=c_phi_inv02[k0];
03658 
03659         c_phi_inv11=ths->c_phi_inv[1];
03660         c_phi_inv12=&ths->c_phi_inv[1][N1/2];
03661 
03662         g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
03663         f_hat11=f_hat + k0*N1;
03664         g_hat21=g_hat + k0*n1+n1-(N1/2);
03665         f_hat21=f_hat + ((N0/2)+k0)*N1;
03666         g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
03667         f_hat12=f_hat + k0*N1+(N1/2);
03668         g_hat22=g_hat + k0*n1;
03669         f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
03670 
03671         for(k1=0;k1<N1/2;k1++)
03672         {
03673           ck11=c_phi_inv11[k1];
03674           ck12=c_phi_inv12[k1];
03675 
03676           f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
03677           f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
03678           f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
03679           f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
03680         }
03681       }
03682     }
03683   else
03684 #ifdef _OPENMP
03685     #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
03686 #endif
03687     for(k0=0;k0<N0/2;k0++)
03688       {
03689   ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
03690   ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
03691   for(k1=0;k1<N1/2;k1++)
03692     {
03693       ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
03694       ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
03695       f_hat[k0*N1+k1]             = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
03696       f_hat[(N0/2+k0)*N1+k1]      = g_hat[k0*n1+n1-N1/2+k1]           * ck02 * ck11;
03697       f_hat[k0*N1+N1/2+k1]        = g_hat[(n0-N0/2+k0)*n1+k1]         * ck01 * ck12;
03698       f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1]                   * ck02 * ck12;
03699     }
03700       }
03701   TOC(0)
03702 }
03703 
03704 /* ################################################ SPECIFIC VERSIONS FOR d=3 */
03705 
03706 static void nfft_3d_init_fg_exp_l(R *fg_exp_l, const INT m, const R b)
03707 {
03708   INT l;
03709   R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
03710 
03711   fg_exp_b0 = EXP(-K(1.0) / b);
03712   fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
03713   fg_exp_b1 = K(1.0);
03714   fg_exp_b2 = K(1.0);
03715   fg_exp_l[0] = K(1.0);
03716   for(l=1; l <= 2*m+1; l++)
03717     {
03718       fg_exp_b2 = fg_exp_b1*fg_exp_b0;
03719       fg_exp_b1 *= fg_exp_b0_sq;
03720       fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
03721     }
03722 }
03723 
03724 static void nfft_trafo_3d_compute(C *fj, const C *g, const R *psij_const0,
03725     const R *psij_const1, const R *psij_const2, const R *xj0, const R *xj1,
03726     const R *xj2, const INT n0, const INT n1, const INT n2, const INT m)
03727 {
03728   INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
03729   const C *gj;
03730   const R *psij0, *psij1, *psij2;
03731 
03732   psij0 = psij_const0;
03733   psij1 = psij_const1;
03734   psij2 = psij_const2;
03735 
03736   uo2(&u0, &o0, *xj0, n0, m);
03737   uo2(&u1, &o1, *xj1, n1, m);
03738   uo2(&u2, &o2, *xj2, n2, m);
03739 
03740   *fj = 0;
03741 
03742   if (u0 < o0)
03743     if (u1 < o1)
03744       if (u2 < o2)
03745         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
03746         {
03747           psij1 = psij_const1;
03748           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03749           {
03750             psij2 = psij_const2;
03751             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03752             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03753               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03754           }
03755         }
03756       else
03757         /* asserts (u2>o2)*/
03758         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
03759         {
03760           psij1 = psij_const1;
03761           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03762           {
03763             psij2 = psij_const2;
03764             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03765             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03766               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03767             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
03768             for (l2 = 0; l2 <= o2; l2++)
03769               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03770           }
03771         }
03772     else /* asserts (u1>o1)*/
03773       if (u2 < o2)
03774         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
03775         {
03776           psij1 = psij_const1;
03777           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03778           {
03779             psij2 = psij_const2;
03780             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03781             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03782               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03783           }
03784           for (l1 = 0; l1 <= o1; l1++, psij1++)
03785           {
03786             psij2 = psij_const2;
03787             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
03788             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03789               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03790           }
03791         }
03792       else/* asserts (u2>o2) */
03793       {
03794         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
03795         {
03796           psij1 = psij_const1;
03797           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03798           {
03799             psij2 = psij_const2;
03800             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03801             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03802               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03803             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
03804             for (l2 = 0; l2 <= o2; l2++)
03805               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03806           }
03807           for (l1 = 0; l1 <= o1; l1++, psij1++)
03808           {
03809             psij2 = psij_const2;
03810             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
03811             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03812               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03813             gj = g + ((u0 + l0) * n1 + l1) * n2;
03814             for (l2 = 0; l2 <= o2; l2++)
03815               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03816           }
03817         }
03818       }
03819   else /* asserts (u0>o0) */
03820     if (u1 < o1)
03821       if (u2 < o2)
03822       {
03823         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
03824         {
03825           psij1 = psij_const1;
03826           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03827           {
03828             psij2 = psij_const2;
03829             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03830             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03831               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03832           }
03833         }
03834 
03835         for (l0 = 0; l0 <= o0; l0++, psij0++)
03836         {
03837           psij1 = psij_const1;
03838           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03839           {
03840             psij2 = psij_const2;
03841             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
03842             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03843               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03844           }
03845         }
03846       } else/* asserts (u2>o2) */
03847       {
03848         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
03849         {
03850           psij1 = psij_const1;
03851           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03852           {
03853             psij2 = psij_const2;
03854             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03855             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03856               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03857             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
03858             for (l2 = 0; l2 <= o2; l2++)
03859               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03860           }
03861         }
03862 
03863         for (l0 = 0; l0 <= o0; l0++, psij0++)
03864         {
03865           psij1 = psij_const1;
03866           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
03867           {
03868             psij2 = psij_const2;
03869             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
03870             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03871               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03872             gj = g + (l0 * n1 + (u1 + l1)) * n2;
03873             for (l2 = 0; l2 <= o2; l2++)
03874               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03875           }
03876         }
03877       }
03878     else /* asserts (u1>o1) */
03879       if (u2 < o2)
03880       {
03881         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
03882         {
03883           psij1 = psij_const1;
03884           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03885           {
03886             psij2 = psij_const2;
03887             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03888             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03889               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03890           }
03891           for (l1 = 0; l1 <= o1; l1++, psij1++)
03892           {
03893             psij2 = psij_const2;
03894             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
03895             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03896               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03897           }
03898         }
03899         for (l0 = 0; l0 <= o0; l0++, psij0++)
03900         {
03901           psij1 = psij_const1;
03902           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03903           {
03904             psij2 = psij_const2;
03905             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
03906             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03907               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03908           }
03909           for (l1 = 0; l1 <= o1; l1++, psij1++)
03910           {
03911             psij2 = psij_const2;
03912             gj = g + (l0 * n1 + l1) * n2 + u2;
03913             for (l2 = 0; l2 <= 2 * m + 1; l2++)
03914               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03915           }
03916         }
03917       } else/* asserts (u2>o2) */
03918       {
03919         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
03920         {
03921           psij1 = psij_const1;
03922           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03923           {
03924             psij2 = psij_const2;
03925             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
03926             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03927               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03928             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
03929             for (l2 = 0; l2 <= o2; l2++)
03930               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03931           }
03932           for (l1 = 0; l1 <= o1; l1++, psij1++)
03933           {
03934             psij2 = psij_const2;
03935             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
03936             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03937               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03938             gj = g + ((u0 + l0) * n1 + l1) * n2;
03939             for (l2 = 0; l2 <= o2; l2++)
03940               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03941           }
03942         }
03943 
03944         for (l0 = 0; l0 <= o0; l0++, psij0++)
03945         {
03946           psij1 = psij_const1;
03947           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
03948           {
03949             psij2 = psij_const2;
03950             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
03951             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03952               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03953             gj = g + (l0 * n1 + (u1 + l1)) * n2;
03954             for (l2 = 0; l2 <= o2; l2++)
03955               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03956           }
03957           for (l1 = 0; l1 <= o1; l1++, psij1++)
03958           {
03959             psij2 = psij_const2;
03960             gj = g + (l0 * n1 + l1) * n2 + u2;
03961             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
03962               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03963             gj = g + (l0 * n1 + l1) * n2;
03964             for (l2 = 0; l2 <= o2; l2++)
03965               (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
03966           }
03967         }
03968       }
03969 }
03970 
03971 #ifdef _OPENMP
03972 
03993 static void nfft_adjoint_3d_compute_omp_blockwise(const C f, C *g,
03994     const R *psij_const0, const R *psij_const1, const R *psij_const2,
03995     const R *xj0, const R *xj1, const R *xj2,
03996     const INT n0, const INT n1, const INT n2, const INT m,
03997     const INT my_u0, const INT my_o0)
03998 {
03999   INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
04000   const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
04001 
04002   INT index_temp1[2*m+2];
04003   INT index_temp2[2*m+2];
04004 
04005   uo2(&ar_u0,&ar_o0,*xj0, n0, m);
04006   uo2(&u1,&o1,*xj1, n1, m);
04007   uo2(&u2,&o2,*xj2, n2, m);
04008 
04009   for (l1=0; l1<=2*m+1; l1++)
04010     index_temp1[l1] = (u1+l1)%n1;
04011 
04012   for (l2=0; l2<=2*m+1; l2++)
04013     index_temp2[l2] = (u2+l2)%n2;
04014 
04015   if(ar_u0<ar_o0)
04016   {
04017     INT u0 = MAX(my_u0,ar_u0);
04018     INT o0 = MIN(my_o0,ar_o0);
04019     INT offset_psij = u0-ar_u0;
04020 #ifdef OMP_ASSERT
04021     assert(offset_psij >= 0);
04022     assert(o0-u0 <= 2*m+1);
04023     assert(offset_psij+o0-u0 <= 2*m+1);
04024 #endif
04025 
04026     for (l0 = 0; l0 <= o0-u0; l0++)
04027     {
04028       const INT i0 = (u0+l0) * n1;
04029       const C val0 = psij_const0[offset_psij+l0];
04030 
04031       for(l1=0; l1<=2*m+1; l1++)
04032       {
04033         const INT i1 = (i0 + index_temp1[l1]) * n2;
04034         const C val1 = psij_const1[l1];
04035 
04036         for(l2=0; l2<=2*m+1; l2++)
04037           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04038       }
04039     }
04040   }
04041   else
04042   {
04043     INT u0 = MAX(my_u0,ar_u0);
04044     INT o0 = my_o0;
04045     INT offset_psij = u0-ar_u0;
04046 #ifdef OMP_ASSERT
04047     assert(offset_psij >= 0);
04048     assert(o0-u0 <= 2*m+1);
04049     assert(offset_psij+o0-u0 <= 2*m+1);
04050 #endif
04051 
04052     for (l0 = 0; l0 <= o0-u0; l0++)
04053     {
04054       INT i0 = (u0+l0) * n1;
04055       const C val0 = psij_const0[offset_psij+l0];
04056 
04057       for(l1=0; l1<=2*m+1; l1++)
04058       {
04059         const INT i1 = (i0 + index_temp1[l1]) * n2;
04060         const C val1 = psij_const1[l1];
04061 
04062         for(l2=0; l2<=2*m+1; l2++)
04063           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04064       }
04065     }
04066 
04067     u0 = my_u0;
04068     o0 = MIN(my_o0,ar_o0);
04069     offset_psij += my_u0-ar_u0+n0;
04070 
04071 #ifdef OMP_ASSERT
04072     if (u0<=o0)
04073     {
04074       assert(o0-u0 <= 2*m+1);
04075       assert(offset_psij+o0-u0 <= 2*m+1);
04076     }
04077 #endif
04078     for (l0 = 0; l0 <= o0-u0; l0++)
04079     {
04080       INT i0 = (u0+l0) * n1;
04081       const C val0 = psij_const0[offset_psij+l0];
04082 
04083       for(l1=0; l1<=2*m+1; l1++)
04084       {
04085         const INT i1 = (i0 + index_temp1[l1]) * n2;
04086         const C val1 = psij_const1[l1];
04087 
04088         for(l2=0; l2<=2*m+1; l2++)
04089           g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
04090       }
04091     }
04092   }
04093 }
04094 #endif
04095 
04096 #ifdef _OPENMP
04097 /* adjoint NFFT three-dimensional case with OpenMP atomic operations */
04098 static void nfft_adjoint_3d_compute_omp_atomic(const C f, C *g,
04099     const R *psij_const0, const R *psij_const1, const R *psij_const2,
04100     const R *xj0, const R *xj1, const R *xj2,
04101     const INT n0, const INT n1, const INT n2, const INT m)
04102 {
04103   INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
04104   const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
04105 
04106   INT index_temp0[2*m+2];
04107   INT index_temp1[2*m+2];
04108   INT index_temp2[2*m+2];
04109 
04110   uo2(&u0,&o0,*xj0, n0, m);
04111   uo2(&u1,&o1,*xj1, n1, m);
04112   uo2(&u2,&o2,*xj2, n2, m);
04113 
04114   for (l0=0; l0<=2*m+1; l0++)
04115     index_temp0[l0] = (u0+l0)%n0;
04116 
04117   for (l1=0; l1<=2*m+1; l1++)
04118     index_temp1[l1] = (u1+l1)%n1;
04119 
04120   for (l2=0; l2<=2*m+1; l2++)
04121     index_temp2[l2] = (u2+l2)%n2;
04122 
04123   for(l0=0; l0<=2*m+1; l0++)
04124   {
04125     for(l1=0; l1<=2*m+1; l1++)
04126     {
04127       for(l2=0; l2<=2*m+1; l2++)
04128       {
04129         INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
04130         C *lhs = g+i;
04131         R *lhs_real = (R*)lhs;
04132         C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
04133 
04134 #pragma omp atomic
04135         lhs_real[0] += CREAL(val);
04136 
04137 #pragma omp atomic
04138         lhs_real[1] += CIMAG(val);
04139       }
04140     }
04141   }
04142 }
04143 #endif
04144 
04145 #ifndef _OPENMP
04146 static void nfft_adjoint_3d_compute_serial(const C *fj, C *g,
04147     const R *psij_const0, const R *psij_const1, const R *psij_const2, const R *xj0,
04148     const R *xj1, const R *xj2, const INT n0, const INT n1, const INT n2,
04149     const INT m)
04150 {
04151   INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
04152   C *gj;
04153   const R *psij0, *psij1, *psij2;
04154 
04155   psij0 = psij_const0;
04156   psij1 = psij_const1;
04157   psij2 = psij_const2;
04158 
04159   uo2(&u0, &o0, *xj0, n0, m);
04160   uo2(&u1, &o1, *xj1, n1, m);
04161   uo2(&u2, &o2, *xj2, n2, m);
04162 
04163   if (u0 < o0)
04164     if (u1 < o1)
04165       if (u2 < o2)
04166         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
04167         {
04168           psij1 = psij_const1;
04169           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04170           {
04171             psij2 = psij_const2;
04172             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04173             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04174               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04175           }
04176         }
04177       else
04178         /* asserts (u2>o2)*/
04179         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
04180         {
04181           psij1 = psij_const1;
04182           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04183           {
04184             psij2 = psij_const2;
04185             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04186             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04187               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04188             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
04189             for (l2 = 0; l2 <= o2; l2++)
04190               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04191           }
04192         }
04193     else /* asserts (u1>o1)*/
04194       if (u2 < o2)
04195         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
04196         {
04197           psij1 = psij_const1;
04198           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04199           {
04200             psij2 = psij_const2;
04201             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04202             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04203               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04204           }
04205           for (l1 = 0; l1 <= o1; l1++, psij1++)
04206           {
04207             psij2 = psij_const2;
04208             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
04209             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04210               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04211           }
04212         }
04213       else/* asserts (u2>o2) */
04214       {
04215         for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
04216         {
04217           psij1 = psij_const1;
04218           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04219           {
04220             psij2 = psij_const2;
04221             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04222             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04223               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04224             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
04225             for (l2 = 0; l2 <= o2; l2++)
04226               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04227           }
04228           for (l1 = 0; l1 <= o1; l1++, psij1++)
04229           {
04230             psij2 = psij_const2;
04231             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
04232             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04233               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04234             gj = g + ((u0 + l0) * n1 + l1) * n2;
04235             for (l2 = 0; l2 <= o2; l2++)
04236               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04237           }
04238         }
04239       }
04240   else /* asserts (u0>o0) */
04241     if (u1 < o1)
04242       if (u2 < o2)
04243       {
04244         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
04245         {
04246           psij1 = psij_const1;
04247           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04248           {
04249             psij2 = psij_const2;
04250             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04251             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04252               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04253           }
04254         }
04255 
04256         for (l0 = 0; l0 <= o0; l0++, psij0++)
04257         {
04258           psij1 = psij_const1;
04259           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04260           {
04261             psij2 = psij_const2;
04262             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
04263             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04264               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04265           }
04266         }
04267       } else/* asserts (u2>o2) */
04268       {
04269         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
04270         {
04271           psij1 = psij_const1;
04272           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04273           {
04274             psij2 = psij_const2;
04275             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04276             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04277               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04278             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
04279             for (l2 = 0; l2 <= o2; l2++)
04280               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04281           }
04282         }
04283 
04284         for (l0 = 0; l0 <= o0; l0++, psij0++)
04285         {
04286           psij1 = psij_const1;
04287           for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
04288           {
04289             psij2 = psij_const2;
04290             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
04291             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04292               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04293             gj = g + (l0 * n1 + (u1 + l1)) * n2;
04294             for (l2 = 0; l2 <= o2; l2++)
04295               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04296           }
04297         }
04298       }
04299     else /* asserts (u1>o1) */
04300       if (u2 < o2)
04301       {
04302         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
04303         {
04304           psij1 = psij_const1;
04305           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04306           {
04307             psij2 = psij_const2;
04308             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04309             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04310               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04311           }
04312           for (l1 = 0; l1 <= o1; l1++, psij1++)
04313           {
04314             psij2 = psij_const2;
04315             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
04316             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04317               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04318           }
04319         }
04320         for (l0 = 0; l0 <= o0; l0++, psij0++)
04321         {
04322           psij1 = psij_const1;
04323           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04324           {
04325             psij2 = psij_const2;
04326             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
04327             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04328               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04329           }
04330           for (l1 = 0; l1 <= o1; l1++, psij1++)
04331           {
04332             psij2 = psij_const2;
04333             gj = g + (l0 * n1 + l1) * n2 + u2;
04334             for (l2 = 0; l2 <= 2 * m + 1; l2++)
04335               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04336           }
04337         }
04338       } else/* asserts (u2>o2) */
04339       {
04340         for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
04341         {
04342           psij1 = psij_const1;
04343           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04344           {
04345             psij2 = psij_const2;
04346             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
04347             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04348               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04349             gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
04350             for (l2 = 0; l2 <= o2; l2++)
04351               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04352           }
04353           for (l1 = 0; l1 <= o1; l1++, psij1++)
04354           {
04355             psij2 = psij_const2;
04356             gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
04357             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04358               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04359             gj = g + ((u0 + l0) * n1 + l1) * n2;
04360             for (l2 = 0; l2 <= o2; l2++)
04361               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04362           }
04363         }
04364 
04365         for (l0 = 0; l0 <= o0; l0++, psij0++)
04366         {
04367           psij1 = psij_const1;
04368           for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
04369           {
04370             psij2 = psij_const2;
04371             gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
04372             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04373               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04374             gj = g + (l0 * n1 + (u1 + l1)) * n2;
04375             for (l2 = 0; l2 <= o2; l2++)
04376               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04377           }
04378           for (l1 = 0; l1 <= o1; l1++, psij1++)
04379           {
04380             psij2 = psij_const2;
04381             gj = g + (l0 * n1 + l1) * n2 + u2;
04382             for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
04383               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04384             gj = g + (l0 * n1 + l1) * n2;
04385             for (l2 = 0; l2 <= o2; l2++)
04386               (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
04387           }
04388         }
04389       }
04390 }
04391 #endif
04392 
04393 static void nfft_trafo_3d_B(X(plan) *ths)
04394 {
04395   const INT n0 = ths->n[0];
04396   const INT n1 = ths->n[1];
04397   const INT n2 = ths->n[2];
04398   const INT M = ths->M_total;
04399   const INT m = ths->m;
04400 
04401   const C* g = (C*) ths->g;
04402 
04403   INT k;
04404 
04405   if(ths->flags & PRE_FULL_PSI)
04406   {
04407     const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
04408 #ifdef _OPENMP
04409     #pragma omp parallel for default(shared) private(k)
04410 #endif
04411     for (k = 0; k < M; k++)
04412     {
04413       INT l;
04414       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04415       ths->f[j] = K(0.0);
04416       for (l = 0; l < lprod; l++)
04417         ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
04418     }
04419     return;
04420   } /* if(PRE_FULL_PSI) */
04421 
04422   if(ths->flags & PRE_PSI)
04423   {
04424 #ifdef _OPENMP
04425     #pragma omp parallel for default(shared) private(k)
04426 #endif
04427     for (k = 0; k < M; k++)
04428     {
04429       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04430       nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04431     }
04432     return;
04433   } /* if(PRE_PSI) */
04434 
04435   if(ths->flags & PRE_FG_PSI)
04436   {
04437     R fg_exp_l[3*(2*m+2)];
04438 
04439     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04440     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04441     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04442 
04443 #ifdef _OPENMP
04444     #pragma omp parallel for default(shared) private(k)
04445 #endif
04446     for (k = 0; k < M; k++)
04447     {
04448       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04449       INT l;
04450       R psij_const[3*(2*m+2)];
04451       R fg_psij0 = ths->psi[2*j*3];
04452       R fg_psij1 = ths->psi[2*j*3+1];
04453       R fg_psij2 = K(1.0);
04454 
04455       psij_const[0] = fg_psij0;
04456       for(l=1; l<=2*m+1; l++)
04457       {
04458         fg_psij2 *= fg_psij1;
04459         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04460       }
04461 
04462       fg_psij0 = ths->psi[2*(j*3+1)];
04463       fg_psij1 = ths->psi[2*(j*3+1)+1];
04464       fg_psij2 = K(1.0);
04465       psij_const[2*m+2] = fg_psij0;
04466       for(l=1; l<=2*m+1; l++)
04467       {
04468         fg_psij2 *= fg_psij1;
04469         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04470       }
04471 
04472       fg_psij0 = ths->psi[2*(j*3+2)];
04473       fg_psij1 = ths->psi[2*(j*3+2)+1];
04474       fg_psij2 = K(1.0);
04475       psij_const[2*(2*m+2)] = fg_psij0;
04476       for(l=1; l<=2*m+1; l++)
04477       {
04478         fg_psij2 *= fg_psij1;
04479         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04480       }
04481 
04482       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04483     }
04484 
04485     return;
04486   } /* if(PRE_FG_PSI) */
04487 
04488   if(ths->flags & FG_PSI)
04489   {
04490     R fg_exp_l[3*(2*m+2)];
04491 
04492     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04493     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04494     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04495 
04496     sort(ths);
04497 
04498 #ifdef _OPENMP
04499     #pragma omp parallel for default(shared) private(k)
04500 #endif
04501     for (k = 0; k < M; k++)
04502     {
04503       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04504       INT u, o, l;
04505       R psij_const[3*(2*m+2)];
04506       R fg_psij0, fg_psij1, fg_psij2;
04507 
04508       uo(ths,j,&u,&o,(INT)0);
04509       fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
04510       fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
04511       fg_psij2 = K(1.0);
04512       psij_const[0] = fg_psij0;
04513       for(l=1; l<=2*m+1; l++)
04514       {
04515         fg_psij2 *= fg_psij1;
04516         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04517       }
04518 
04519       uo(ths,j,&u,&o,(INT)1);
04520       fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
04521       fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
04522       fg_psij2 = K(1.0);
04523       psij_const[2*m+2] = fg_psij0;
04524       for(l=1; l<=2*m+1; l++)
04525       {
04526         fg_psij2 *= fg_psij1;
04527         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04528       }
04529 
04530       uo(ths,j,&u,&o,(INT)2);
04531       fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
04532       fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
04533       fg_psij2 = K(1.0);
04534       psij_const[2*(2*m+2)] = fg_psij0;
04535       for(l=1; l<=2*m+1; l++)
04536       {
04537         fg_psij2 *= fg_psij1;
04538         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04539       }
04540 
04541       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04542     }
04543 
04544     return;
04545   } /* if(FG_PSI) */
04546 
04547   if(ths->flags & PRE_LIN_PSI)
04548   {
04549     const INT K = ths->K, ip_s = K / (m + 2);
04550 
04551     sort(ths);
04552 
04553 #ifdef _OPENMP
04554     #pragma omp parallel for default(shared) private(k)
04555 #endif
04556     for (k = 0; k < M; k++)
04557     {
04558       INT u, o, l;
04559       R ip_y, ip_w;
04560       INT ip_u;
04561       R psij_const[3*(2*m+2)];
04562       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04563 
04564       uo(ths,j,&u,&o,(INT)0);
04565       ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
04566       ip_u = (INT)(LRINT(FLOOR(ip_y)));
04567       ip_w = ip_y - (R)(ip_u);
04568       for(l=0; l < 2*m+2; l++)
04569         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04570           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
04571 
04572       uo(ths,j,&u,&o,(INT)1);
04573       ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
04574       ip_u = (INT)(LRINT(FLOOR(ip_y)));
04575       ip_w = ip_y - (R)(ip_u);
04576       for(l=0; l < 2*m+2; l++)
04577         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04578           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04579 
04580       uo(ths,j,&u,&o,(INT)2);
04581       ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
04582       ip_u = (INT)(LRINT(FLOOR(ip_y)));
04583       ip_w = ip_y - (R)(ip_u);
04584       for(l=0; l < 2*m+2; l++)
04585         psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
04586           ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
04587 
04588       nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04589     }
04590     return;
04591   } /* if(PRE_LIN_PSI) */
04592 
04593   /* no precomputed psi at all */
04594 
04595   sort(ths);
04596 
04597 #ifdef _OPENMP
04598   #pragma omp parallel for default(shared) private(k)
04599 #endif
04600   for (k = 0; k < M; k++)
04601   {
04602     R psij_const[3*(2*m+2)];
04603     INT u, o, l;
04604     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04605 
04606     uo(ths,j,&u,&o,(INT)0);
04607     for(l=0;l<=2*m+1;l++)
04608       psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
04609 
04610     uo(ths,j,&u,&o,(INT)1);
04611     for(l=0;l<=2*m+1;l++)
04612       psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
04613 
04614     uo(ths,j,&u,&o,(INT)2);
04615     for(l=0;l<=2*m+1;l++)
04616       psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
04617 
04618     nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04619   }
04620 }
04621 
04622 #ifdef OMP_ASSERT
04623 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
04624 { \
04625           assert(ar_x[2*k] >= min_u_a || k == M-1); \
04626           if (k > 0) \
04627             assert(ar_x[2*k-2] < min_u_a); \
04628 }
04629 #else
04630 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
04631 #endif
04632 
04633 #ifdef OMP_ASSERT
04634 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
04635 { \
04636           assert(ar_x[2*k] >= min_u_b || k == M-1); \
04637           if (k > 0) \
04638             assert(ar_x[2*k-2] < min_u_b); \
04639 }
04640 #else
04641 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
04642 #endif
04643 
04644 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
04645             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04646                 ths->psi+j*3*(2*m+2), \
04647                 ths->psi+(j*3+1)*(2*m+2), \
04648                 ths->psi+(j*3+2)*(2*m+2), \
04649                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04650                 n0, n1, n2, m, my_u0, my_o0);
04651 
04652 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
04653 { \
04654             INT u, o, l; \
04655             R psij_const[3*(2*m+2)]; \
04656             R fg_psij0 = ths->psi[2*j*3]; \
04657             R fg_psij1 = ths->psi[2*j*3+1]; \
04658             R fg_psij2 = K(1.0); \
04659  \
04660             psij_const[0] = fg_psij0; \
04661             for(l=1; l<=2*m+1; l++) \
04662             { \
04663               fg_psij2 *= fg_psij1; \
04664               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
04665             } \
04666  \
04667             fg_psij0 = ths->psi[2*(j*3+1)]; \
04668             fg_psij1 = ths->psi[2*(j*3+1)+1]; \
04669             fg_psij2 = K(1.0); \
04670             psij_const[2*m+2] = fg_psij0; \
04671             for(l=1; l<=2*m+1; l++) \
04672             { \
04673               fg_psij2 *= fg_psij1; \
04674               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
04675             } \
04676  \
04677             fg_psij0 = ths->psi[2*(j*3+2)]; \
04678             fg_psij1 = ths->psi[2*(j*3+2)+1]; \
04679             fg_psij2 = K(1.0); \
04680             psij_const[2*(2*m+2)] = fg_psij0; \
04681             for(l=1; l<=2*m+1; l++) \
04682             { \
04683               fg_psij2 *= fg_psij1; \
04684               psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04685             } \
04686  \
04687             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04688                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04689                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04690                 n0, n1, n2, m, my_u0, my_o0); \
04691 }
04692 
04693 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
04694 { \
04695             INT u, o, l; \
04696             R psij_const[3*(2*m+2)]; \
04697             R fg_psij0, fg_psij1, fg_psij2; \
04698  \
04699             uo(ths,j,&u,&o,(INT)0); \
04700             fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
04701             fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
04702             fg_psij2 = K(1.0); \
04703             psij_const[0] = fg_psij0; \
04704             for(l=1; l<=2*m+1; l++) \
04705             { \
04706               fg_psij2 *= fg_psij1; \
04707               psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
04708             } \
04709  \
04710             uo(ths,j,&u,&o,(INT)1); \
04711             fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
04712             fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
04713             fg_psij2 = K(1.0); \
04714             psij_const[2*m+2] = fg_psij0; \
04715             for(l=1; l<=2*m+1; l++) \
04716             { \
04717               fg_psij2 *= fg_psij1; \
04718               psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
04719             } \
04720  \
04721             uo(ths,j,&u,&o,(INT)2); \
04722             fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
04723             fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
04724             fg_psij2 = K(1.0); \
04725             psij_const[2*(2*m+2)] = fg_psij0; \
04726             for(l=1; l<=2*m+1; l++) \
04727             { \
04728               fg_psij2 *= fg_psij1; \
04729               psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
04730             } \
04731  \
04732             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04733                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04734                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04735                 n0, n1, n2, m, my_u0, my_o0); \
04736 }
04737 
04738 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
04739 { \
04740             INT u, o, l; \
04741             R psij_const[3*(2*m+2)]; \
04742             INT ip_u; \
04743             R ip_y, ip_w; \
04744  \
04745             uo(ths,j,&u,&o,(INT)0); \
04746             ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
04747             ip_u = LRINT(FLOOR(ip_y)); \
04748             ip_w = ip_y-ip_u; \
04749             for(l=0; l < 2*m+2; l++) \
04750               psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04751                 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
04752  \
04753             uo(ths,j,&u,&o,(INT)1); \
04754             ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
04755             ip_u = LRINT(FLOOR(ip_y)); \
04756             ip_w = ip_y-ip_u; \
04757             for(l=0; l < 2*m+2; l++) \
04758               psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04759                 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
04760  \
04761             uo(ths,j,&u,&o,(INT)2); \
04762             ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
04763             ip_u = LRINT(FLOOR(ip_y)); \
04764             ip_w = ip_y-ip_u; \
04765             for(l=0; l < 2*m+2; l++) \
04766               psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
04767                 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
04768  \
04769             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04770                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04771                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04772                 n0, n1, n2, m, my_u0, my_o0); \
04773 }
04774 
04775 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
04776 { \
04777             INT u, o, l; \
04778             R psij_const[3*(2*m+2)]; \
04779  \
04780             uo(ths,j,&u,&o,(INT)0); \
04781             for(l=0;l<=2*m+1;l++) \
04782               psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
04783  \
04784             uo(ths,j,&u,&o,(INT)1); \
04785             for(l=0;l<=2*m+1;l++) \
04786               psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
04787  \
04788             uo(ths,j,&u,&o,(INT)2); \
04789             for(l=0;l<=2*m+1;l++) \
04790               psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
04791  \
04792             nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
04793                 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
04794                 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
04795                 n0, n1, n2, m, my_u0, my_o0); \
04796 }
04797 
04798 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
04799 { \
04800     if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
04801     { \
04802       _Pragma("omp parallel private(k)") \
04803       { \
04804         INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
04805         INT *ar_x = ths->index_x; \
04806  \
04807         nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
04808             &min_u_b, &max_u_b, 3, ths->n, m); \
04809  \
04810         if (min_u_a != -1) \
04811         { \
04812           k = index_x_binary_search(ar_x, M, min_u_a); \
04813  \
04814           MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
04815  \
04816           while (k < M) \
04817           { \
04818             INT u_prod = ar_x[2*k]; \
04819             INT j = ar_x[2*k+1]; \
04820  \
04821             if (u_prod < min_u_a || u_prod > max_u_a) \
04822               break; \
04823  \
04824             MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
04825  \
04826             k++; \
04827           } \
04828         } \
04829  \
04830         if (min_u_b != -1) \
04831         { \
04832           INT k = index_x_binary_search(ar_x, M, min_u_b); \
04833  \
04834           MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
04835  \
04836           while (k < M) \
04837           { \
04838             INT u_prod = ar_x[2*k]; \
04839             INT j = ar_x[2*k+1]; \
04840  \
04841             if (u_prod < min_u_b || u_prod > max_u_b) \
04842               break; \
04843  \
04844             MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
04845  \
04846             k++; \
04847           } \
04848         } \
04849       } /* omp parallel */ \
04850       return; \
04851     } /* if(NFFT_OMP_BLOCKWISE_ADJOINT) */ \
04852 }
04853 
04854 static void nfft_adjoint_3d_B(X(plan) *ths)
04855 {
04856   INT k;
04857   const INT n0 = ths->n[0];
04858   const INT n1 = ths->n[1];
04859   const INT n2 = ths->n[2];
04860   const INT M = ths->M_total;
04861   const INT m = ths->m;
04862 
04863   C* g = (C*) ths->g;
04864 
04865   memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
04866 
04867   if(ths->flags & PRE_FULL_PSI)
04868   {
04869     nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
04870         (INT)3, ths->n, m, ths->flags, ths->index_x);
04871     return;
04872   } /* if(PRE_FULL_PSI) */
04873 
04874   if(ths->flags & PRE_PSI)
04875   {
04876 #ifdef _OPENMP
04877     MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
04878 #endif
04879 
04880 #ifdef _OPENMP
04881     #pragma omp parallel for default(shared) private(k)
04882 #endif
04883     for (k = 0; k < M; k++)
04884     {
04885       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04886 #ifdef _OPENMP
04887       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04888 #else
04889       nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04890 #endif
04891     }
04892     return;
04893   } /* if(PRE_PSI) */
04894 
04895   if(ths->flags & PRE_FG_PSI)
04896   {
04897     R fg_exp_l[3*(2*m+2)];
04898 
04899     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04900     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04901     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04902 
04903 #ifdef _OPENMP
04904     MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
04905 #endif
04906 
04907 #ifdef _OPENMP
04908     #pragma omp parallel for default(shared) private(k)
04909 #endif
04910     for (k = 0; k < M; k++)
04911     {
04912       R psij_const[3*(2*m+2)];
04913       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04914       INT l;
04915       R fg_psij0 = ths->psi[2*j*3];
04916       R fg_psij1 = ths->psi[2*j*3+1];
04917       R fg_psij2 = K(1.0);
04918 
04919       psij_const[0] = fg_psij0;
04920       for(l=1; l<=2*m+1; l++)
04921       {
04922         fg_psij2 *= fg_psij1;
04923         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04924       }
04925 
04926       fg_psij0 = ths->psi[2*(j*3+1)];
04927       fg_psij1 = ths->psi[2*(j*3+1)+1];
04928       fg_psij2 = K(1.0);
04929       psij_const[2*m+2] = fg_psij0;
04930       for(l=1; l<=2*m+1; l++)
04931       {
04932         fg_psij2 *= fg_psij1;
04933         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
04934       }
04935 
04936       fg_psij0 = ths->psi[2*(j*3+2)];
04937       fg_psij1 = ths->psi[2*(j*3+2)+1];
04938       fg_psij2 = K(1.0);
04939       psij_const[2*(2*m+2)] = fg_psij0;
04940       for(l=1; l<=2*m+1; l++)
04941       {
04942         fg_psij2 *= fg_psij1;
04943         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
04944       }
04945 
04946 #ifdef _OPENMP
04947       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04948 #else
04949       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
04950 #endif
04951     }
04952 
04953     return;
04954   } /* if(PRE_FG_PSI) */
04955 
04956   if(ths->flags & FG_PSI)
04957   {
04958     R fg_exp_l[3*(2*m+2)];
04959 
04960     nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
04961     nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
04962     nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
04963 
04964     sort(ths);
04965 
04966 #ifdef _OPENMP
04967     MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
04968 #endif
04969 
04970 #ifdef _OPENMP
04971     #pragma omp parallel for default(shared) private(k)
04972 #endif
04973     for (k = 0; k < M; k++)
04974     {
04975       INT u,o,l;
04976       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
04977       R psij_const[3*(2*m+2)];
04978       R fg_psij0, fg_psij1, fg_psij2;
04979 
04980       uo(ths,j,&u,&o,(INT)0);
04981       fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
04982       fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
04983       fg_psij2 = K(1.0);
04984       psij_const[0] = fg_psij0;
04985       for(l=1; l<=2*m+1; l++)
04986       {
04987         fg_psij2 *= fg_psij1;
04988         psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
04989       }
04990 
04991       uo(ths,j,&u,&o,(INT)1);
04992       fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
04993       fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
04994       fg_psij2 = K(1.0);
04995       psij_const[2*m+2] = fg_psij0;
04996       for(l=1; l<=2*m+1; l++)
04997       {
04998         fg_psij2 *= fg_psij1;
04999         psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
05000       }
05001 
05002       uo(ths,j,&u,&o,(INT)2);
05003       fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
05004       fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
05005       fg_psij2 = K(1.0);
05006       psij_const[2*(2*m+2)] = fg_psij0;
05007       for(l=1; l<=2*m+1; l++)
05008       {
05009         fg_psij2 *= fg_psij1;
05010         psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
05011       }
05012 
05013 #ifdef _OPENMP
05014       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05015 #else
05016       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05017 #endif
05018     }
05019 
05020     return;
05021   } /* if(FG_PSI) */
05022 
05023   if(ths->flags & PRE_LIN_PSI)
05024   {
05025     const INT K = ths->K;
05026     const INT ip_s = K / (m + 2);
05027 
05028     sort(ths);
05029 
05030 #ifdef _OPENMP
05031     MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
05032 #endif
05033 
05034 #ifdef _OPENMP
05035     #pragma omp parallel for default(shared) private(k)
05036 #endif
05037     for (k = 0; k < M; k++)
05038     {
05039       INT u,o,l;
05040       INT ip_u;
05041       R ip_y, ip_w;
05042       INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05043       R psij_const[3*(2*m+2)];
05044 
05045       uo(ths,j,&u,&o,(INT)0);
05046       ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
05047       ip_u = (INT)(LRINT(FLOOR(ip_y)));
05048       ip_w = ip_y - (R)(ip_u);
05049       for(l=0; l < 2*m+2; l++)
05050         psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05051           ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
05052 
05053       uo(ths,j,&u,&o,(INT)1);
05054       ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
05055       ip_u = (INT)(LRINT(FLOOR(ip_y)));
05056       ip_w = ip_y - (R)(ip_u);
05057       for(l=0; l < 2*m+2; l++)
05058         psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05059           ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05060 
05061       uo(ths,j,&u,&o,(INT)2);
05062       ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
05063       ip_u = (INT)(LRINT(FLOOR(ip_y)));
05064       ip_w = ip_y - (R)(ip_u);
05065       for(l=0; l < 2*m+2; l++)
05066         psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
05067           ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
05068 
05069 #ifdef _OPENMP
05070       nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05071 #else
05072       nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05073 #endif
05074     }
05075     return;
05076   } /* if(PRE_LIN_PSI) */
05077 
05078   /* no precomputed psi at all */
05079   sort(ths);
05080 
05081 #ifdef _OPENMP
05082   MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
05083 #endif
05084 
05085 #ifdef _OPENMP
05086   #pragma omp parallel for default(shared) private(k)
05087 #endif
05088   for (k = 0; k < M; k++)
05089   {
05090     INT u,o,l;
05091     R psij_const[3*(2*m+2)];
05092     INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
05093 
05094     uo(ths,j,&u,&o,(INT)0);
05095     for(l=0;l<=2*m+1;l++)
05096       psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
05097 
05098     uo(ths,j,&u,&o,(INT)1);
05099     for(l=0;l<=2*m+1;l++)
05100       psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
05101 
05102     uo(ths,j,&u,&o,(INT)2);
05103     for(l=0;l<=2*m+1;l++)
05104       psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
05105 
05106 #ifdef _OPENMP
05107     nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05108 #else
05109     nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
05110 #endif
05111   }
05112 }
05113 
05114 
05115 void X(trafo_3d)(X(plan) *ths)
05116 {
05117   INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
05118   C *g_hat,*f_hat;
05119   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05120   R ck01, ck02, ck11, ck12, ck21, ck22;
05121   C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05122   C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05123 
05124   ths->g_hat=ths->g1;
05125   ths->g=ths->g2;
05126 
05127   N0=ths->N[0];
05128   N1=ths->N[1];
05129   N2=ths->N[2];
05130   n0=ths->n[0];
05131   n1=ths->n[1];
05132   n2=ths->n[2];
05133 
05134   f_hat=(C*)ths->f_hat;
05135   g_hat=(C*)ths->g_hat;
05136 
05137   TIC(0)
05138 #ifdef _OPENMP
05139   #pragma omp parallel for default(shared) private(k0)
05140   for (k0 = 0; k0 < ths->n_total; k0++)
05141     ths->g_hat[k0] = 0.0;
05142 #else
05143   memset(ths->g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
05144 #endif
05145 
05146   if(ths->flags & PRE_PHI_HUT)
05147     {
05148       c_phi_inv01=ths->c_phi_inv[0];
05149       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05150 
05151 #ifdef _OPENMP
05152       #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05153 #endif
05154       for(k0=0;k0<N0/2;k0++)
05155   {
05156     ck01=c_phi_inv01[k0];
05157     ck02=c_phi_inv02[k0];
05158     c_phi_inv11=ths->c_phi_inv[1];
05159     c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05160 
05161     for(k1=0;k1<N1/2;k1++)
05162       {
05163         ck11=c_phi_inv11[k1];
05164         ck12=c_phi_inv12[k1];
05165         c_phi_inv21=ths->c_phi_inv[2];
05166         c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05167 
05168         g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05169         f_hat111=f_hat + (k0*N1+k1)*N2;
05170         g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05171         f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05172         g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05173         f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05174         g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05175         f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05176 
05177         g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05178         f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05179         g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05180         f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05181         g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05182         f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
05183         g_hat222=g_hat + (k0*n1+k1)*n2;
05184         f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05185 
05186         for(k2=0;k2<N2/2;k2++)
05187     {
05188       ck21=c_phi_inv21[k2];
05189       ck22=c_phi_inv22[k2];
05190 
05191       g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
05192       g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
05193       g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
05194       g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
05195 
05196       g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
05197       g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
05198       g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
05199       g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
05200     }
05201       }
05202   }
05203     }
05204   else
05205 #ifdef _OPENMP
05206     #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05207 #endif
05208     for(k0=0;k0<N0/2;k0++)
05209       {
05210   ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
05211   ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
05212   for(k1=0;k1<N1/2;k1++)
05213     {
05214       ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
05215       ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
05216 
05217       for(k2=0;k2<N2/2;k2++)
05218         {
05219     ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
05220     ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
05221 
05222     g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2]                  * ck01 * ck11 * ck21;
05223     g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2]           = f_hat[((N0/2+k0)*N1+k1)*N2+k2]           * ck02 * ck11 * ck21;
05224     g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2]         = f_hat[(k0*N1+N1/2+k1)*N2+k2]             * ck01 * ck12 * ck21;
05225     g_hat[(k0*n1+k1)*n2+n2-N2/2+k2]                   = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2]      * ck02 * ck12 * ck21;
05226 
05227     g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2]         = f_hat[(k0*N1+k1)*N2+N2/2+k2]             * ck01 * ck11 * ck22;
05228     g_hat[(k0*n1+n1-N1/2+k1)*n2+k2]                   = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2]      * ck02 * ck11 * ck22;
05229     g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2]                 = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2]        * ck01 * ck12 * ck22;
05230     g_hat[(k0*n1+k1)*n2+k2]                           = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
05231         }
05232     }
05233       }
05234 
05235   TOC(0)
05236 
05237   TIC_FFTW(1)
05238   FFTW(execute)(ths->my_fftw_plan1);
05239   TOC_FFTW(1);
05240 
05241   TIC(2);
05242   nfft_trafo_3d_B(ths);
05243   TOC(2);
05244 }
05245 
05246 void X(adjoint_3d)(X(plan) *ths)
05247 {
05248   INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
05249   C *g_hat,*f_hat;
05250   R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
05251   R ck01, ck02, ck11, ck12, ck21, ck22;
05252   C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
05253   C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
05254 
05255   ths->g_hat=ths->g1;
05256   ths->g=ths->g2;
05257 
05258   N0=ths->N[0];
05259   N1=ths->N[1];
05260   N2=ths->N[2];
05261   n0=ths->n[0];
05262   n1=ths->n[1];
05263   n2=ths->n[2];
05264 
05265   f_hat=(C*)ths->f_hat;
05266   g_hat=(C*)ths->g_hat;
05267 
05268   TIC(2);
05269   nfft_adjoint_3d_B(ths);
05270   TOC(2);
05271 
05272   TIC_FFTW(1)
05273   FFTW(execute)(ths->my_fftw_plan2);
05274   TOC_FFTW(1);
05275 
05276   TIC(0)
05277   if(ths->flags & PRE_PHI_HUT)
05278     {
05279       c_phi_inv01=ths->c_phi_inv[0];
05280       c_phi_inv02=&ths->c_phi_inv[0][N0/2];
05281 
05282 #ifdef _OPENMP
05283       #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
05284 #endif
05285       for(k0=0;k0<N0/2;k0++)
05286   {
05287     ck01=c_phi_inv01[k0];
05288     ck02=c_phi_inv02[k0];
05289     c_phi_inv11=ths->c_phi_inv[1];
05290     c_phi_inv12=&ths->c_phi_inv[1][N1/2];
05291 
05292     for(k1=0;k1<N1/2;k1++)
05293       {
05294         ck11=c_phi_inv11[k1];
05295         ck12=c_phi_inv12[k1];
05296         c_phi_inv21=ths->c_phi_inv[2];
05297         c_phi_inv22=&ths->c_phi_inv[2][N2/2];
05298 
05299         g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05300         f_hat111=f_hat + (k0*N1+k1)*N2;
05301         g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
05302         f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
05303         g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
05304         f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
05305         g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
05306         f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
05307 
05308         g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
05309         f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
05310         g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
05311         f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
05312         g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
05313         f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
05314         g_hat222=g_hat + (k0*n1+k1)*n2;
05315         f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
05316 
05317         for(k2=0;k2<N2/2;k2++)
05318     {
05319       ck21=c_phi_inv21[k2];
05320       ck22=c_phi_inv22[k2];
05321 
05322       f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
05323       f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
05324       f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
05325       f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
05326 
05327       f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
05328       f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
05329       f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
05330       f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
05331     }
05332       }
05333   }
05334     }
05335   else
05336 #ifdef _OPENMP
05337     #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
05338 #endif
05339     for(k0=0;k0<N0/2;k0++)
05340       {
05341   ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
05342   ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
05343   for(k1=0;k1<N1/2;k1++)
05344     {
05345       ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
05346       ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
05347 
05348       for(k2=0;k2<N2/2;k2++)
05349         {
05350     ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
05351     ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
05352 
05353     f_hat[(k0*N1+k1)*N2+k2]                  = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
05354     f_hat[((N0/2+k0)*N1+k1)*N2+k2]           = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2]           * ck02 * ck11 * ck21;
05355     f_hat[(k0*N1+N1/2+k1)*N2+k2]             = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2]         * ck01 * ck12 * ck21;
05356     f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2]      = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2]                   * ck02 * ck12 * ck21;
05357 
05358     f_hat[(k0*N1+k1)*N2+N2/2+k2]             = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2]         * ck01 * ck11 * ck22;
05359     f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2]      = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2]                   * ck02 * ck11 * ck22;
05360     f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2]        = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2]                 * ck01 * ck12 * ck22;
05361     f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2]                           * ck02 * ck12 * ck22;
05362         }
05363     }
05364       }
05365 
05366   TOC(0)
05367 }
05368 
05371 void X(trafo)(X(plan) *ths)
05372 {
05373   switch(ths->d)
05374   {
05375     case 1: X(trafo_1d)(ths); break;
05376     case 2: X(trafo_2d)(ths); break;
05377     case 3: X(trafo_3d)(ths); break;
05378     default:
05379     {
05380       /* use ths->my_fftw_plan1 */
05381       ths->g_hat = ths->g1;
05382       ths->g = ths->g2;
05383 
05387       TIC(0)
05388       D_A(ths);
05389       TOC(0)
05390 
05395       TIC_FFTW(1)
05396       FFTW(execute)(ths->my_fftw_plan1);
05397       TOC_FFTW(1)
05398 
05402       TIC(2)
05403       B_A(ths);
05404       TOC(2)
05405     }
05406   }
05407 } /* nfft_trafo */
05408 
05409 void X(adjoint)(X(plan) *ths)
05410 {
05411   switch(ths->d)
05412   {
05413     case 1: X(adjoint_1d)(ths); break;
05414     case 2: X(adjoint_2d)(ths); break;
05415     case 3: X(adjoint_3d)(ths); break;
05416     default:
05417     {
05418       /* use ths->my_fftw_plan2 */
05419       ths->g_hat=ths->g1;
05420       ths->g=ths->g2;
05421       
05425       TIC(2)
05426       B_T(ths);
05427       TOC(2)
05428 
05433       TIC_FFTW(1)
05434       FFTW(execute)(ths->my_fftw_plan2);
05435       TOC_FFTW(1)
05436 
05440       TIC(0)
05441       D_T(ths);
05442       TOC(0)
05443     }
05444   }
05445 } /* nfft_adjoint */
05446 
05447 
05450 static void precompute_phi_hut(X(plan) *ths)
05451 {
05452   INT ks[ths->d]; /* index over all frequencies */
05453   INT t; /* index over all dimensions */
05454 
05455   ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) * sizeof(R*));
05456 
05457   for (t = 0; t < ths->d; t++)
05458   {
05459     ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) * sizeof(R));
05460 
05461     for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
05462     {
05463       ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
05464     }
05465   }
05466 } /* nfft_phi_hut */
05467 
05472 void X(precompute_lin_psi)(X(plan) *ths)
05473 {
05474   INT t;                                
05475   INT j;                                
05476   R step;                          
05478   for (t=0; t<ths->d; t++)
05479     {
05480       step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
05481       for(j = 0;j <= ths->K; j++)
05482   {
05483     ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
05484   } /* for(j) */
05485     } /* for(t) */
05486 }
05487 
05488 void X(precompute_fg_psi)(X(plan) *ths)
05489 {
05490   INT t;                                
05491   INT u, o;                             
05493   sort(ths);
05494 
05495   for (t=0; t<ths->d; t++)
05496   {
05497     INT j;
05498 #ifdef _OPENMP
05499     #pragma omp parallel for default(shared) private(j,u,o)
05500 #endif
05501     for (j = 0; j < ths->M_total; j++)
05502       {
05503   uo(ths,j,&u,&o,t);
05504 
05505         ths->psi[2*(j*ths->d+t)]=
05506             (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
05507 
05508         ths->psi[2*(j*ths->d+t)+1]=
05509             EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
05510       } /* for(j) */
05511   }
05512   /* for(t) */
05513 } /* nfft_precompute_fg_psi */
05514 
05515 void X(precompute_psi)(X(plan) *ths)
05516 {
05517   INT t; /* index over all dimensions */
05518   INT l; /* index u<=l<=o */
05519   INT lj; /* index 0<=lj<u+o+1 */
05520   INT u, o; /* depends on x_j */
05521 
05522   sort(ths);
05523 
05524   for (t=0; t<ths->d; t++)
05525   {
05526     INT j;
05527 #ifdef _OPENMP
05528     #pragma omp parallel for default(shared) private(j,l,lj,u,o)
05529 #endif
05530     for (j = 0; j < ths->M_total; j++)
05531     {
05532       uo(ths,j,&u,&o,t);
05533 
05534       for(l = u, lj = 0; l <= o; l++, lj++)
05535         ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
05536             (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
05537     } /* for(j) */
05538   }
05539   /* for(t) */
05540 } /* nfft_precompute_psi */
05541 
05542 #ifdef _OPENMP
05543 static void nfft_precompute_full_psi_omp(X(plan) *ths)
05544 {
05545   INT j;                                
05546   INT lprod;                            
05548   {
05549     INT t;
05550     for(t=0,lprod = 1; t<ths->d; t++)
05551         lprod *= 2*ths->m+2;
05552   }
05553 
05554   #pragma omp parallel for default(shared) private(j)
05555   for(j=0; j<ths->M_total; j++)
05556     {
05557       INT t,t2;                             
05558       INT l_L;                              
05559       INT l[ths->d];                        
05560       INT lj[ths->d];                       
05561       INT ll_plain[ths->d+1];               
05563       INT u[ths->d], o[ths->d];             
05565       R phi_prod[ths->d+1];
05566       INT ix = j*lprod;
05567 
05568       phi_prod[0]=1;
05569       ll_plain[0]=0;
05570 
05571       MACRO_init_uo_l_lj_t;
05572 
05573       for(l_L=0; l_L<lprod; l_L++, ix++)
05574       {
05575         MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05576 
05577         ths->psi_index_g[ix]=ll_plain[ths->d];
05578         ths->psi[ix]=phi_prod[ths->d];
05579 
05580         MACRO_count_uo_l_lj_t;
05581       } /* for(l_L) */
05582 
05583       ths->psi_index_f[j]=lprod;
05584     } /* for(j) */
05585 }
05586 #endif
05587 
05588 void X(precompute_full_psi)(X(plan) *ths)
05589 {
05590 #ifdef _OPENMP
05591   sort(ths);
05592 
05593   nfft_precompute_full_psi_omp(ths);
05594 #else
05595   INT t, t2; /* index over all dimensions */
05596   INT j; /* index over all nodes */
05597   INT l_L; /* plain index 0 <= l_L < lprod */
05598   INT l[ths->d]; /* multi index u<=l<=o */
05599   INT lj[ths->d]; /* multi index 0<=lj<u+o+1 */
05600   INT ll_plain[ths->d+1]; /* postfix plain index */
05601   INT lprod; /* 'bandwidth' of matrix B */
05602   INT u[ths->d], o[ths->d]; /* depends on x_j */
05603 
05604   R phi_prod[ths->d+1];
05605 
05606   INT ix, ix_old;
05607 
05608   sort(ths);
05609 
05610   phi_prod[0] = K(1.0);
05611   ll_plain[0] = 0;
05612 
05613   for (t = 0, lprod = 1; t < ths->d; t++)
05614     lprod *= 2 * ths->m + 2;
05615 
05616   for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
05617   {
05618     MACRO_init_uo_l_lj_t;
05619 
05620     for (l_L = 0; l_L < lprod; l_L++, ix++)
05621     {
05622       MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
05623 
05624       ths->psi_index_g[ix] = ll_plain[ths->d];
05625       ths->psi[ix] = phi_prod[ths->d];
05626 
05627       MACRO_count_uo_l_lj_t;
05628     } /* for(l_L) */
05629 
05630     ths->psi_index_f[j] = ix - ix_old;
05631     ix_old = ix;
05632   } /* for(j) */
05633 #endif
05634 }
05635 
05636 void X(precompute_one_psi)(X(plan) *ths)
05637 {
05638   if(ths->flags & PRE_LIN_PSI)
05639     X(precompute_lin_psi)(ths);
05640   if(ths->flags & PRE_FG_PSI)
05641     X(precompute_fg_psi)(ths);
05642   if(ths->flags & PRE_PSI)
05643     X(precompute_psi)(ths);
05644   if(ths->flags & PRE_FULL_PSI)
05645     X(precompute_full_psi)(ths);
05646 }
05647 
05648 static void init_help(X(plan) *ths)
05649 {
05650   INT t; /* index over all dimensions */
05651   INT lprod; /* 'bandwidth' of matrix B */
05652 
05653   if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
05654     ths->flags |= NFFT_SORT_NODES;
05655 
05656   ths->N_total = intprod(ths->N, 0, ths->d);
05657   ths->n_total = intprod(ths->n, 0, ths->d);
05658 
05659   ths->sigma = (R*) Y(malloc)((size_t)(ths->d) * sizeof(R));
05660 
05661   for(t = 0;t < ths->d; t++)
05662     ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
05663 
05664   WINDOW_HELP_INIT;
05665 
05666   if(ths->flags & MALLOC_X)
05667     ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) * sizeof(R));
05668 
05669   if(ths->flags & MALLOC_F_HAT)
05670     ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) * sizeof(C));
05671 
05672   if(ths->flags & MALLOC_F)
05673     ths->f = (C*)Y(malloc)((size_t)(ths->M_total) * sizeof(C));
05674 
05675   if(ths->flags & PRE_PHI_HUT)
05676     precompute_phi_hut(ths);
05677 
05678   if (ths->flags & PRE_LIN_PSI)
05679   {
05680       if (ths->K == 0)
05681       {
05682         ths->K = Y(m2K)(ths->m);
05683       }
05684       ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) * sizeof(R));
05685   }
05686 
05687   if(ths->flags & PRE_FG_PSI)
05688     ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) * sizeof(R));
05689 
05690   if(ths->flags & PRE_PSI)
05691     ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) * sizeof(R));
05692 
05693   if(ths->flags & PRE_FULL_PSI)
05694   {
05695       for (t = 0, lprod = 1; t < ths->d; t++)
05696         lprod *= 2 * ths->m + 2;
05697 
05698       ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(R));
05699 
05700       ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) * sizeof(INT));
05701       ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) * sizeof(INT));
05702   }
05703 
05704   if(ths->flags & FFTW_INIT)
05705   {
05706 #ifdef _OPENMP
05707     INT nthreads = Y(get_num_threads)();
05708 #endif
05709 
05710     ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) * sizeof(C));
05711 
05712     if(ths->flags & FFT_OUT_OF_PLACE)
05713       ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) * sizeof(C));
05714     else
05715       ths->g2 = ths->g1;
05716 
05717 #ifdef _OPENMP
05718 #pragma omp critical (nfft_omp_critical_fftw_plan)
05719 {
05720     FFTW(plan_with_nthreads)(nthreads);
05721 #endif
05722     {
05723       int *_n = Y(malloc)((size_t)(ths->d) * sizeof(int));
05724 
05725       for (t = 0; t < ths->d; t++)
05726         _n[t] = (int)(ths->n[t]);
05727 
05728       ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
05729       ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
05730       Y(free)(_n);
05731     }
05732 #ifdef _OPENMP
05733 }
05734 #endif
05735   }
05736 
05737   if(ths->flags & NFFT_SORT_NODES)
05738     ths->index_x = (INT*) Y(malloc)(sizeof(INT) * 2U * (size_t)(ths->M_total));
05739   else
05740     ths->index_x = NULL;
05741 
05742   ths->mv_trafo = (void (*) (void* ))X(trafo);
05743   ths->mv_adjoint = (void (*) (void* ))X(adjoint);
05744 }
05745 
05746 void X(init)(X(plan) *ths, int d, int *N, int M_total)
05747 {
05748   INT t; /* index over all dimensions */
05749 
05750   ths->d = (INT)d;
05751 
05752   ths->N = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
05753 
05754   for (t = 0; t < d; t++)
05755     ths->N[t] = (INT)N[t];
05756 
05757   ths->M_total = (INT)M_total;
05758 
05759   ths->n = (INT*) Y(malloc)((size_t)(d) * sizeof(INT));
05760 
05761   for (t = 0; t < d; t++)
05762     ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
05763 
05764   ths->m = WINDOW_HELP_ESTIMATE_m;
05765 
05766   if (d > 1)
05767   {
05768 #ifdef _OPENMP
05769     ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05770                       FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
05771                  NFFT_OMP_BLOCKWISE_ADJOINT;
05772 #else
05773     ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05774                       FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
05775 #endif
05776   }
05777   else
05778     ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
05779                       FFTW_INIT | FFT_OUT_OF_PLACE;
05780 
05781   ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
05782 
05783   ths->K = 0;
05784   init_help(ths);
05785 }
05786 
05787 void X(init_guru)(X(plan) *ths, int d, int *N, int M_total, int *n, int m,
05788   unsigned flags, unsigned fftw_flags)
05789 {
05790   INT t; /* index over all dimensions */
05791 
05792   ths->d = (INT)d;
05793   ths->M_total = (INT)M_total;
05794   ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
05795 
05796   for (t = 0; t < d; t++)
05797     ths->N[t] = (INT)N[t];
05798 
05799   ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
05800 
05801   for (t = 0; t < d; t++)
05802     ths->n[t] = (INT)n[t];
05803 
05804   ths->m = (INT)m;
05805 
05806   ths->flags = flags;
05807   ths->fftw_flags = fftw_flags;
05808 
05809   ths->K = 0;
05810   init_help(ths);
05811 }
05812 
05813 void X(init_lin)(X(plan) *ths, int d, int *N, int M_total, int *n, int m, int K,
05814   unsigned flags, unsigned fftw_flags)
05815 {
05816   INT t; /* index over all dimensions */
05817 
05818   ths->d = (INT)d;
05819   ths->M_total = (INT)M_total;
05820   ths->N = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
05821 
05822   for (t = 0; t < d; t++)
05823     ths->N[t] = (INT)N[t];
05824 
05825   ths->n = (INT*)Y(malloc)((size_t)(ths->d) * sizeof(INT));
05826 
05827   for (t = 0; t < d; t++)
05828     ths->n[t] = (INT)n[t];
05829 
05830   ths->m = (INT)m;
05831 
05832   ths->flags = flags;
05833   ths->fftw_flags = fftw_flags;
05834 
05835   ths->K = K;
05836   init_help(ths);
05837 }
05838 
05839 void X(init_1d)(X(plan) *ths, int N1, int M_total)
05840 {
05841   int N[1];
05842 
05843   N[0] = N1;
05844 
05845   X(init)(ths, 1, N, M_total);
05846 }
05847 
05848 void X(init_2d)(X(plan) *ths, int N1, int N2, int M_total)
05849 {
05850   int N[2];
05851 
05852   N[0] = N1;
05853   N[1] = N2;
05854   X(init)(ths, 2, N, M_total);
05855 }
05856 
05857 void X(init_3d)(X(plan) *ths, int N1, int N2, int N3, int M_total)
05858 {
05859   int N[3];
05860 
05861   N[0] = N1;
05862   N[1] = N2;
05863   N[2] = N3;
05864   X(init)(ths, 3, N, M_total);
05865 }
05866 
05867 const char* X(check)(X(plan) *ths)
05868 {
05869   INT j;
05870 
05871   if (!ths->f)
05872       return "Member f not initialized.";
05873 
05874   if (!ths->x)
05875       return "Member x not initialized.";
05876 
05877   if (!ths->f_hat)
05878       return "Member f_hat not initialized.";
05879 
05880   if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
05881     return "Number of nodes too small to use PRE_LIN_PSI.";
05882 
05883   for (j = 0; j < ths->M_total * ths->d; j++)
05884   {
05885     if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
05886     {
05887       return "ths->x out of range [-0.5,0.5)";
05888     }
05889   }
05890 
05891   for (j = 0; j < ths->d; j++)
05892   {
05893     if (ths->sigma[j] <= 1)
05894       return "Oversampling factor too small";
05895 
05896     if(ths->N[j] <= ths->m)
05897       return "Polynomial degree N is smaller than cut-off m";
05898 
05899     if(ths->N[j]%2 == 1)
05900       return "polynomial degree N has to be even";
05901   }
05902   return 0;
05903 }
05904 
05905 void X(finalize)(X(plan) *ths)
05906 {
05907   INT t; /* index over dimensions */
05908 
05909   if(ths->flags & NFFT_SORT_NODES)
05910     Y(free)(ths->index_x);
05911 
05912   if(ths->flags & FFTW_INIT)
05913   {
05914 #ifdef _OPENMP
05915     #pragma omp critical (nfft_omp_critical_fftw_plan)
05916 #endif
05917     FFTW(destroy_plan)(ths->my_fftw_plan2);
05918 #ifdef _OPENMP
05919     #pragma omp critical (nfft_omp_critical_fftw_plan)
05920 #endif
05921     FFTW(destroy_plan)(ths->my_fftw_plan1);
05922 
05923     if(ths->flags & FFT_OUT_OF_PLACE)
05924       Y(free)(ths->g2);
05925 
05926     Y(free)(ths->g1);
05927   }
05928 
05929   if(ths->flags & PRE_FULL_PSI)
05930   {
05931     Y(free)(ths->psi_index_g);
05932     Y(free)(ths->psi_index_f);
05933     Y(free)(ths->psi);
05934   }
05935 
05936   if(ths->flags & PRE_PSI)
05937     Y(free)(ths->psi);
05938 
05939   if(ths->flags & PRE_FG_PSI)
05940     Y(free)(ths->psi);
05941 
05942   if(ths->flags & PRE_LIN_PSI)
05943     Y(free)(ths->psi);
05944 
05945   if(ths->flags & PRE_PHI_HUT)
05946   {
05947     for (t = 0; t < ths->d; t++)
05948         Y(free)(ths->c_phi_inv[t]);
05949     Y(free)(ths->c_phi_inv);
05950   }
05951 
05952   if(ths->flags & MALLOC_F)
05953     Y(free)(ths->f);
05954 
05955   if(ths->flags & MALLOC_F_HAT)
05956     Y(free)(ths->f_hat);
05957 
05958   if(ths->flags & MALLOC_X)
05959     Y(free)(ths->x);
05960 
05961   WINDOW_HELP_FINALIZE;
05962 
05963   Y(free)(ths->sigma);
05964   Y(free)(ths->n);
05965   Y(free)(ths->N);
05966 }