Changeset b9c12fe5 in sasmodels


Ignore:
Timestamp:
Jul 15, 2016 11:25:10 PM (7 years ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
master, core_shell_microgels, costrafo411, magnetic_model, release_v0.94, release_v0.95, ticket-1257-vesicle-product, ticket_1156, ticket_1265_superball, ticket_822_more_unit_tests
Children:
f3bd37f
Parents:
def2c1b
Message:

faster using private pvector

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernel_iq.cl

    ra738209 rb9c12fe5  
    4848  // Storage for the current parameter values.  These will be updated as we 
    4949  // walk the polydispersity cube.  local_values will be aliased to pvec. 
    50   local ParameterBlock local_values; 
     50  ParameterBlock local_values; 
     51  double *pvec = (double *)&local_values; 
    5152 
    5253  // who we are and what element we are working with 
    5354  const int q_index = get_global_id(0); 
    54   const int thread = get_local_id(0); 
    5555 
    5656  // Fill in the initial variables 
    57   event_t e = async_work_group_copy((local double *)&local_values, values+2, NPARS, 0); 
    58   wait_group_events(1, &e); 
     57  for (int i=0; i < NPARS; i++) { 
     58    pvec[i] = values[2+i]; 
     59  } 
    5960 
    6061  // Monodisperse computation 
     
    8283 
    8384  //printf("Entering polydispersity from %d to %d\n", pd_start, pd_stop); 
    84   // norm will be shared across all threads. 
    8585 
    86   // "values" is global and can't be assigned to a local, so even though only 
    87   // the alias is only needed for thread 0 it is allocated in all threads. 
    8886  global const double *pd_value = values+2+NPARS; 
    8987  global const double *pd_weight = pd_value+details->pd_sum; 
     
    9189  // need product of weights at every Iq calc, so keep product of 
    9290  // weights from the outer loops so that weight = partial_weight * fast_weight 
    93   local double pd_norm; 
    94   local double partial_weight; // product of weight w4*w3*w2 but not w1 
    95   local double spherical_correction; // cosine correction for latitude variation 
    96   local double weight; // product of partial_weight*w1*spherical_correction 
    97   local double *pvec; 
    98   local int p0_par; 
    99   local int p0_length; 
    100   local int p0_offset; 
    101   local int p0_is_theta; 
    102   local int p0_index; 
     91  double pd_norm; 
     92  double partial_weight; // product of weight w4*w3*w2 but not w1 
     93  double spherical_correction; // cosine correction for latitude variation 
     94  double weight; // product of partial_weight*w1*spherical_correction 
     95  int p0_par; 
     96  int p0_length; 
     97  int p0_offset; 
     98  int p0_is_theta; 
     99  int p0_index; 
    103100 
    104101  // Number of elements in the longest polydispersity loop 
    105   barrier(CLK_LOCAL_MEM_FENCE); 
    106   if (thread == 0) { 
    107     pvec = (local double *)(&local_values); 
     102  p0_par = details->pd_par[0]; 
     103  p0_length = details->pd_length[0]; 
     104  p0_offset = details->pd_offset[0]; 
     105  p0_is_theta = (p0_par == details->theta_par); 
    108106 
    109     // Number of elements in the longest polydispersity loop 
    110     p0_par = details->pd_par[0]; 
    111     p0_length = details->pd_length[0]; 
    112     p0_offset = details->pd_offset[0]; 
    113     p0_is_theta = (p0_par == details->theta_par); 
     107  // Trigger the reset behaviour that happens at the end the fast loop 
     108  // by setting the initial index >= weight vector length. 
     109  p0_index = p0_length; 
    114110 
    115     // Trigger the reset behaviour that happens at the end the fast loop 
    116     // by setting the initial index >= weight vector length. 
    117     p0_index = p0_length; 
     111  // Default the spherical correction to 1.0 in case it is not otherwise set 
     112  spherical_correction = 1.0; 
     113  weight=1.0; 
    118114 
    119     // Default the spherical correction to 1.0 in case it is not otherwise set 
    120     spherical_correction = 1.0; 
    121  
    122     // Since we are no longer looping over the entire polydispersity hypercube 
    123     // for each q, we need to track the result and normalization values between 
    124     // calls.  This means initializing them to 0 at the start and accumulating 
    125     // them between calls. 
    126     pd_norm = pd_start == 0 ? 0.0 : result[nq]; 
    127   } 
    128   barrier(CLK_LOCAL_MEM_FENCE); 
     115  // Since we are no longer looping over the entire polydispersity hypercube 
     116  // for each q, we need to track the result and normalization values between 
     117  // calls.  This means initializing them to 0 at the start and accumulating 
     118  // them between calls. 
     119  pd_norm = pd_start == 0 ? 0.0 : result[nq]; 
    129120 
    130121  if (q_index < nq) { 
     
    134125  // Loop over the weights then loop over q, accumulating values 
    135126  for (int loop_index=pd_start; loop_index < pd_stop; loop_index++) { 
    136     barrier(CLK_LOCAL_MEM_FENCE); 
    137     if (thread == 0) { 
    138       // check if fast loop needs to be reset 
    139       if (p0_index == p0_length) { 
    140         //printf("should be here with %d active\n", num_active); 
     127    // check if fast loop needs to be reset 
     128    if (p0_index == p0_length) { 
     129      //printf("should be here with %d active\n", num_active); 
    141130 
    142         // Compute position in polydispersity hypercube and partial weight 
    143         partial_weight = 1.0; 
    144         for (int k=1; k < details->num_active; k++) { 
    145           int pk = details->pd_par[k]; 
    146           int index = details->pd_offset[k] + (loop_index/details->pd_stride[k])%details->pd_length[k]; 
    147           pvec[pk] = pd_value[index]; 
    148           partial_weight *= pd_weight[index]; 
    149           //printf("index[%d] = %d\n",k,index); 
    150           if (pk == details->theta_par) { 
    151             spherical_correction = fmax(fabs(cos(M_PI_180*pvec[pk])), 1.e-6); 
    152           } 
     131      // Compute position in polydispersity hypercube and partial weight 
     132      partial_weight = 1.0; 
     133      for (int k=1; k < details->num_active; k++) { 
     134        int pk = details->pd_par[k]; 
     135        int index = details->pd_offset[k] + (loop_index/details->pd_stride[k])%details->pd_length[k]; 
     136        pvec[pk] = pd_value[index]; 
     137        partial_weight *= pd_weight[index]; 
     138        //printf("index[%d] = %d\n",k,index); 
     139        if (pk == details->theta_par) { 
     140          spherical_correction = fmax(fabs(cos(M_PI_180*pvec[pk])), 1.e-6); 
    153141        } 
    154         p0_index = loop_index%p0_length; 
    155         //printf("\n"); 
    156142      } 
     143      p0_index = loop_index%p0_length; 
     144      //printf("\n"); 
     145    } 
    157146 
    158       // Update parameter p0 
    159       weight = partial_weight*pd_weight[p0_offset + p0_index]; 
    160       pvec[p0_par] = pd_value[p0_offset + p0_index]; 
    161       if (p0_is_theta) { 
    162         spherical_correction = fmax(fabs(cos(M_PI_180*pvec[p0_par])), 1.e-6); 
    163       } 
    164       p0_index++; 
     147    // Update parameter p0 
     148    weight = partial_weight*pd_weight[p0_offset + p0_index]; 
     149    pvec[p0_par] = pd_value[p0_offset + p0_index]; 
     150    if (p0_is_theta) { 
     151      spherical_correction = fmax(fabs(cos(M_PI_180*pvec[p0_par])), 1.e-6); 
    165152    } 
    166     barrier(CLK_LOCAL_MEM_FENCE); 
     153    p0_index++; 
    167154    //printf("\n"); 
    168155 
Note: See TracChangeset for help on using the changeset viewer.