Changes in sasmodels/kernelcl.py [c036ddb:5399809] in sasmodels


Ignore:
File:
1 edited

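Legend:

Unmodified — the line shows both the old (rc036ddb) and the new (r5399809) line number, run together (e.g. "481481")
Added — the line shows only a new-revision (r5399809) line number
Removed — the line shows only an old-revision (rc036ddb) line number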
  • sasmodels/kernelcl.py

    rc036ddb r5399809  
    481481        # at this point, so instead using 32, which is good on the set of 
    482482        # architectures tested so far. 
     483        extra_q = 3  # total weight, weighted volume and weighted radius 
    483484        if self.is_2d: 
    484             # Note: 16 rather than 15 because result is 1 longer than input. 
    485             width = ((self.nq+16)//16)*16 
     485            width = ((self.nq+15+extra_q)//16)*16 
    486486            self.q = np.empty((width, 2), dtype=dtype) 
    487487            self.q[:self.nq, 0] = q_vectors[0] 
    488488            self.q[:self.nq, 1] = q_vectors[1] 
    489489        else: 
    490             # Note: 32 rather than 31 because result is 1 longer than input. 
    491             width = ((self.nq+32)//32)*32 
     490            width = ((self.nq+31+extra_q)//32)*32 
    492491            self.q = np.empty(width, dtype=dtype) 
    493492            self.q[:self.nq] = q_vectors[0] 
     
    539538        self.dim = '2d' if q_input.is_2d else '1d' 
    540539        # leave room for f1/f2 results in case we need to compute beta for 1d models 
    541         num_returns = 1 if self.dim == '2d' else 2  # 
    542         # plus 1 for the normalization value 
    543         self.result = np.empty((q_input.nq+1)*num_returns, dtype) 
     540        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
     541        # plus 3 weight, volume, radius 
     542        self.result = np.empty(q_input.nq*nout + 3, self.dtype) 
    544543 
    545544        # Inputs and outputs for each kernel call 
     
    549548 
    550549        self.result_b = cl.Buffer(self.queue.context, mf.READ_WRITE, 
    551                                   q_input.global_size[0] * num_returns * dtype.itemsize) 
     550                                  q_input.global_size[0] * nout * dtype.itemsize) 
    552551        self.q_input = q_input # allocated by GpuInput above 
    553552 
     
    558557                     else np.float32)  # will never get here, so use np.float32 
    559558 
    560     def Iq(self, call_details, values, cutoff, magnetic): 
    561         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    562         self._call_kernel(call_details, values, cutoff, magnetic) 
    563         #print("returned",self.q_input.q, self.result) 
    564         pd_norm = self.result[self.q_input.nq] 
    565         scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0) 
    566         background = values[1] 
    567         #print("scale",scale,background) 
    568         return scale*self.result[:self.q_input.nq] + background 
    569     __call__ = Iq 
    570  
    571     def beta(self, call_details, values, cutoff, magnetic): 
    572         # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    573         if self.dim == '2d': 
    574             raise NotImplementedError("beta not yet supported for 2D") 
    575         self._call_kernel(call_details, values, cutoff, magnetic) 
    576         w_norm = self.result[2*self.q_input.nq + 1] 
    577         pd_norm = self.result[self.q_input.nq] 
    578         if w_norm == 0.: 
    579             w_norm = 1. 
    580         F2 = self.result[:self.q_input.nq]/w_norm 
    581         F1 = self.result[self.q_input.nq+1:2*self.q_input.nq+1]/w_norm 
    582         volume_avg = pd_norm/w_norm 
    583         return F1, F2, volume_avg 
    584  
    585     def _call_kernel(self, call_details, values, cutoff, magnetic): 
     559    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    586560        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    587561        context = self.queue.context 
     
    597571            details_b, values_b, self.q_input.q_b, self.result_b, 
    598572            self.real(cutoff), 
     573            np.uint32(effective_radius_type), 
    599574        ] 
    600575        #print("Calling OpenCL") 
Note: See TracChangeset for help on using the changeset viewer.