Changes in / [1448db7a:25b82a1] in sasmodels


Ignore:
Location:
sasmodels
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/generate.py

    rf2f67a6 rae2b6b5  
    473473    dll_code = load_template('kernel_iq.c') 
    474474    ocl_code = load_template('kernel_iq.cl') 
     475    #ocl_code = load_template('kernel_iq_local.cl') 
    475476    user_code = [open(f).read() for f in model_sources(model_info)] 
    476477 
  • sasmodels/kernel_iq.c

    rf2f67a6 rae2b6b5  
    5252  // Storage for the current parameter values.  These will be updated as we 
    5353  // walk the polydispersity cube. 
    54   local ParameterBlock local_values;  // current parameter values 
     54  ParameterBlock local_values;  // current parameter values 
    5555  double *pvec = (double *)(&local_values);  // Alias named parameters with a vector 
    5656  double norm; 
     
    7474    norm = CALL_VOLUME(local_values); 
    7575 
    76     const double scale = values[0]; 
    77     const double background = values[1]; 
    78     // result[nq] = norm; // Total volume normalization 
     76    double scale, background; 
     77    scale = values[0]; 
     78    background = values[1]; 
    7979 
    8080    #ifdef USE_OPENMP 
    8181    #pragma omp parallel for 
    8282    #endif 
    83     for (int i=0; i < nq; i++) { 
    84       double scattering = CALL_IQ(q, i, local_values); 
    85       result[i] = (norm>0. ? scale*scattering/norm + background : background); 
     83    for (int q_index=0; q_index < nq; q_index++) { 
     84      double scattering = CALL_IQ(q, q_index, local_values); 
     85      result[q_index] = (norm>0. ? scale*scattering/norm + background : background); 
    8686    } 
    8787    return; 
     
    8989 
    9090#if MAX_PD > 0 
    91   // If it is the first round initialize the result to zero, otherwise 
    92   // assume that the previous result has been passed back. 
    93   // Note: doing this even in the monodisperse case in order to handle the 
    94   // rare case where the model parameters are invalid and zero is returned. 
    95   // So slightly increased cost for slightly smaller code size. 
     91 
     92  // need product of weights at every Iq calc, so keep product of 
     93  // weights from the outer loops so that weight = partial_weight * fast_weight 
     94  double partial_weight; // product of weight w4*w3*w2 but not w1 
     95  double spherical_correction; // cosine correction for latitude variation 
     96  double weight; // product of partial_weight*w1*spherical_correction 
     97 
     98  // Location in the polydispersity hypercube, one index per dimension. 
     99  int pd_index[MAX_PD]; 
     100 
     101  // Location of the coordinated parameters in their own sub-cubes. 
     102  int offset[NPARS]; 
     103 
     104  // Number of coordinated indices 
     105  const int num_coord = details->num_coord; 
     106 
     107  // Number of elements in the longest polydispersity loop 
     108  const int fast_length = details->pd_length[0]; 
     109 
     110  // Trigger the reset behaviour that happens at the end the fast loop 
     111  // by setting the initial index >= weight vector length. 
     112  pd_index[0] = fast_length; 
     113 
     114  // Default the spherical correction to 1.0 in case it is not otherwise set 
     115  spherical_correction = 1.0; 
     116 
     117  // Since we are no longer looping over the entire polydispersity hypercube 
     118  // for each q, we need to track the result and normalization values between 
     119  // calls.  This means initializing them to 0 at the start and accumulating 
     120  // them between calls. 
     121  norm = pd_start == 0 ? 0.0 : result[nq]; 
    96122  if (pd_start == 0) { 
    97123    #ifdef USE_OPENMP 
    98124    #pragma omp parallel for 
    99125    #endif 
    100     for (int i=0; i < nq+1; i++) { 
    101       result[i] = 0.0; 
    102     } 
    103     norm = 0.0; 
    104   } else { 
    105     norm = result[nq]; 
    106   } 
    107  
    108   // need product of weights at every Iq calc, so keep product of 
    109   // weights from the outer loops so that weight = partial_weight * fast_weight 
    110   double partial_weight = NAN; // product of weight w4*w3*w2 but not w1 
    111   double spherical_correction = 1.0;  // cosine correction for latitude variation 
    112  
    113   // Location in the polydispersity hypercube, one index per dimension. 
    114   local int pd_index[MAX_PD]; 
    115  
    116   // Location of the coordinated parameters in their own sub-cubes. 
    117   local int offset[NPARS]; 
    118  
    119   // Trigger the reset behaviour that happens at the end the fast loop 
    120   // by setting the initial index >= weight vector length. 
    121   const int fast_length = details->pd_length[0]; 
    122   pd_index[0] = fast_length; 
    123  
    124   // Number of coordinated indices 
    125   const int num_coord = details->num_coord; 
     126    for (int q_index=0; q_index < nq; q_index++) { 
     127      result[q_index] = 0.0; 
     128    } 
     129  } 
    126130 
    127131  // Loop over the weights then loop over q, accumulating values 
     
    172176    } 
    173177 
    174     // Increment fast index 
    175     const double wi = weights[details->pd_offset[0] + pd_index[0]++]; 
    176     double weight = partial_weight*wi; 
     178    // Update fast parameters 
    177179    //printf("fast %d: ", loop_index); 
    178180    for (int k=0; k < num_coord; k++) { 
     
    189191    //printf("\n"); 
    190192 
     193    // Increment fast index 
     194    const double wi = weights[details->pd_offset[0] + pd_index[0]]; 
     195    weight = partial_weight*wi; 
     196    pd_index[0]++; 
     197 
    191198    #ifdef INVALID 
    192199    if (INVALID(local_values)) continue; 
     
    204211      #pragma omp parallel for 
    205212      #endif 
    206       for (int i=0; i < nq; i++) { 
    207         const double scattering = CALL_IQ(q, i, local_values); 
    208         result[i] += weight*scattering; 
    209       } 
    210     } 
    211   } 
    212  
    213   // End of the PD loop we can normalize 
     213      for (int q_index=0; q_index < nq; q_index++) { 
     214        const double scattering = CALL_IQ(q, q_index, local_values); 
     215        result[q_index] += weight*scattering; 
     216      } 
     217    } 
     218  } 
     219 
    214220  if (pd_stop >= details->total_pd) { 
    215     const double scale = values[0]; 
    216     const double background = values[1]; 
     221    // End of the PD loop we can normalize 
     222    double scale, background; 
     223    scale = values[0]; 
     224    background = values[1]; 
    217225    #ifdef USE_OPENMP 
    218226    #pragma omp parallel for 
    219227    #endif 
    220     for (int i=0; i < nq; i++) { 
    221       result[i] = (norm>0. ? scale*result[i]/norm + background : background); 
     228    for (int q_index=0; q_index < nq; q_index++) { 
     229      result[q_index] = (norm>0. ? scale*result[q_index]/norm + background : background); 
    222230    } 
    223231  } 
  • sasmodels/kernel_iq.cl

    rf2f67a6 rae2b6b5  
    5050    ) 
    5151{ 
    52   double norm; 
    53  
    54   // who we are and what element we are working with 
    55   const int q_index = get_global_id(0); 
    56  
    57   // number of active loops 
    58   const int num_active = details->num_active; 
    59  
    6052  // Storage for the current parameter values.  These will be updated as we 
    6153  // walk the polydispersity cube. 
    6254  ParameterBlock local_values;  // current parameter values 
    6355  double *pvec = (double *)(&local_values);  // Alias named parameters with a vector 
     56  double norm; 
     57 
     58  // who we are and what element we are working with 
     59  const int q_index = get_global_id(0); 
     60 
     61  // number of active loops 
     62  const int num_active = details->num_active; 
     63 
    6464  // Fill in the initial variables 
    65   for (int k = 0; k < NPARS; k++) { 
     65  for (int k=0; k < NPARS; k++) { 
    6666    pvec[k] = values[details->par_offset[k]]; 
    6767  } 
     
    7272    if (INVALID(local_values)) { return; } 
    7373    #endif 
     74    norm = CALL_VOLUME(local_values); 
    7475 
    7576    double scale, background; 
    76     norm = CALL_VOLUME(local_values); 
    7777    scale = values[0]; 
    7878    background = values[1]; 
    79  
    80     // if (i==0) result[nq] = norm; // Total volume normalization 
    8179 
    8280    if (q_index < nq) { 
     
    8987#if MAX_PD > 0 
    9088 
    91   // If it is the first round initialize the result to zero, otherwise 
    92   // assume that the previous result has been passed back. 
    93   // Note: doing this even in the monodisperse case in order to handle the 
    94   // rare case where the model parameters are invalid and zero is returned. 
    95   // So slightly increased cost for slightly smaller code size. 
    9689  double this_result; 
    9790 
     
    10295  // weights from the outer loops so that weight = partial_weight * fast_weight 
    10396  double partial_weight; // product of weight w4*w3*w2 but not w1 
    104   double spherical_correction;  // cosine correction for latitude variation 
     97  double spherical_correction; // cosine correction for latitude variation 
     98  double weight; // product of partial_weight*w1*spherical_correction 
    10599 
    106100  // Location in the polydispersity hypercube, one index per dimension. 
     
    110104  int offset[NPARS]; 
    111105 
     106  // Number of coordinated indices 
     107  const int num_coord = details->num_coord; 
     108 
    112109  // Number of elements in the longest polydispersity loop 
    113110  const int fast_length = details->pd_length[0]; 
    114111 
    115   // Number of coordinated indices 
    116   const int num_coord = details->num_coord; 
    117  
    118   // We could in theory spread this work across different threads, but 
    119   // lets keep it simple; 
    120   norm = pd_start == 0 ? 0.0 : result[nq]; 
    121   spherical_correction = 1.0;  // the usual case. 
    122   // partial_weight = NAN; 
    123112  // Trigger the reset behaviour that happens at the end the fast loop 
    124113  // by setting the initial index >= weight vector length. 
    125114  pd_index[0] = fast_length; 
     115 
     116  // Default the spherical correction to 1.0 in case it is not otherwise set 
     117  spherical_correction = 1.0; 
    126118 
    127119  // Since we are no longer looping over the entire polydispersity hypercube 
     
    129121  // calls.  This means initializing them to 0 at the start and accumulating 
    130122  // them between calls. 
     123  norm = pd_start == 0 ? 0.0 : result[nq]; 
    131124  if (q_index < nq) { 
    132125    this_result = pd_start == 0 ? 0.0 : result[q_index]; 
     
    141134      // Compute position in polydispersity hypercube 
    142135      for (int k=0; k < num_active; k++) { 
    143           pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k]; 
    144           //printf("pd_index[%d] = %d\n",k,pd_index[k]); 
     136        pd_index[k] = (loop_index/details->pd_stride[k])%details->pd_length[k]; 
     137        //printf("pd_index[%d] = %d\n",k,pd_index[k]); 
    145138      } 
    146139 
     
    163156      //printf("slow %d: ", loop_index); 
    164157      for (int k=0; k < num_coord; k++) { 
    165         if (k < num_coord) { 
    166           int par = details->par_coord[k]; 
    167           int coord = details->pd_coord[k]; 
    168           int this_offset = details->par_offset[par]; 
    169           int block_size = 1; 
    170           for (int bit=0; coord != 0; bit++) { 
    171             if (coord&1) { 
    172                 this_offset += block_size * pd_index[bit]; 
    173                 block_size *= details->pd_length[bit]; 
    174             } 
    175             coord >>= 1; 
     158        int par = details->par_coord[k]; 
     159        int coord = details->pd_coord[k]; 
     160        int this_offset = details->par_offset[par]; 
     161        int block_size = 1; 
     162        for (int bit=0; coord != 0; bit++) { 
     163          if (coord&1) { 
     164              this_offset += block_size * pd_index[bit]; 
     165              block_size *= details->pd_length[bit]; 
    176166          } 
    177           offset[par] = this_offset; 
    178           pvec[par] = values[this_offset]; 
    179           //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]); 
    180           // if theta is not coordinated with fast index, precompute spherical correction 
    181           if (par == details->theta_par && !(details->par_coord[k]&1)) { 
    182             spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 
    183           } 
     167          coord >>= 1; 
    184168        } 
     169        offset[par] = this_offset; 
     170        pvec[par] = values[this_offset]; 
     171        //printf("par[%d]=v[%d]=%g \n", k, offset[k], pvec[k]); 
     172        // if theta is not coordinated with fast index, precompute spherical correction 
     173        if (par == details->theta_par && !(details->par_coord[k]&1)) { 
     174          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 
     175        } 
    185176      } 
    186177      //printf("\n"); 
    187178    } 
    188179 
    189     double weight; 
     180    // Update fast parameters 
     181    //printf("fast %d: ", loop_index); 
     182    for (int k=0; k < num_coord; k++) { 
     183      if (details->pd_coord[k]&1) { 
     184        const int par = details->par_coord[k]; 
     185        pvec[par] = values[offset[par]++]; 
     186        //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]); 
     187        // if theta is coordinated with fast index, compute spherical correction each time 
     188        if (par == details->theta_par) { 
     189          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 
     190        } 
     191      } 
     192    } 
     193    //printf("\n"); 
     194 
     195    // Increment fast index 
    190196    const double wi = weights[details->pd_offset[0] + pd_index[0]]; 
    191197    weight = partial_weight*wi; 
    192198    pd_index[0]++; 
    193  
    194     // Increment fast index 
    195     //printf("fast %d: ", loop_index); 
    196     for (int k=0; k < num_coord; k++) { 
    197       if (k < num_coord) { 
    198         if (details->pd_coord[k]&1) { 
    199           const int par = details->par_coord[k]; 
    200           pvec[par] = values[offset[par]++]; 
    201           //printf("p[%d]=v[%d]=%g ", par, offset[par]-1, pvec[par]); 
    202           // if theta is coordinated with fast index, compute spherical correction each time 
    203           if (par == details->theta_par) { 
    204             spherical_correction = fmax(fabs(cos(M_PI_180*pvec[details->theta_par])), 1.e-6); 
    205           } 
    206         } 
    207       } 
    208     } 
    209     //printf("\n"); 
    210199 
    211200    #ifdef INVALID 
     
    229218    if (pd_stop >= details->total_pd) { 
    230219      // End of the PD loop we can normalize 
    231       const double scale = values[0]; 
    232       const double background = values[1]; 
     220      double scale, background; 
     221      scale = values[0]; 
     222      background = values[1]; 
    233223      result[q_index] = (norm>0. ? scale*this_result/norm + background : background); 
    234224    } else { 
     
    236226      result[q_index] = this_result; 
    237227    } 
    238     // Accumulate norm. 
     228 
     229    // Remember the updated norm. 
    239230    if (q_index == 0) result[nq] = norm; 
    240231  } 
  • sasmodels/kernelcl.py

    rf2f67a6 rae2b6b5  
    511511                             hostbuf=values) 
    512512 
    513         start, stop = 0, call_details.total_pd 
    514         args = [ 
    515             np.uint32(self.q_input.nq), np.int32(start), np.int32(stop), 
    516             details_b, weights_b, values_b, self.q_input.q_b, self.result_b, 
    517             self.real(cutoff), 
    518         ] 
    519         self.kernel(self.queue, self.q_input.global_size, None, *args) 
     513        # Call kernel and retrieve results 
     514        step = 100 
     515        for start in range(0, call_details.total_pd, step): 
     516            stop = min(start+step, call_details.total_pd) 
     517            args = [ 
     518                np.uint32(self.q_input.nq), np.int32(start), np.int32(stop), 
     519                details_b, weights_b, values_b, self.q_input.q_b, self.result_b, 
     520                self.real(cutoff), 
     521            ] 
     522            self.kernel(self.queue, self.q_input.global_size, None, *args) 
    520523        cl.enqueue_copy(self.queue, self.result, self.result_b) 
     524 
     525        # Free buffers 
    521526        for v in (details_b, weights_b, values_b): 
    522527            if v is not None: v.release() 
Note: See TracChangeset for help on using the changeset viewer.