Changeset 869fd7b in sasmodels


Ignore:
Timestamp:
Oct 26, 2018 11:29:56 AM (2 weeks ago)
Author:
Paul Kienzle <pkienzle@…>
Branches:
beta_approx, py3, ticket-1015-gpu-mem-error, ticket-1157, ticket-608-user-defined-weights, ticket_1156
Children:
12f4c19
Parents:
81751c2
Message:

update cuda kernel to support Fq interface

File:
1 edited

Legend:

Unmodified
Added
Removed
  • sasmodels/kernelcuda.py

    r8b31efa r869fd7b  
    431431    def __init__(self, kernel, dtype, model_info, q_vectors): 
    432432        # type: (cl.Kernel, np.dtype, ModelInfo, List[np.ndarray]) -> None 
    433         q_input = GpuInput(q_vectors, dtype) 
     433        self.q_input = GpuInput(q_vectors, dtype) 
    434434        self.kernel = kernel 
     435        self._as_dtype = (np.float32 if dtype == generate.F32 
     436                          else np.float64 if dtype == generate.F64 
     437                          else np.float16 if dtype == generate.F16 
     438                          else np.float32)  # will never get here, so use np.float32 
     439 
     440        # attributes accessed from the outside 
     441        self.dim = '2d' if self.q_input.is_2d else '1d' 
    435442        self.info = model_info 
    436443        self.dtype = dtype 
    437         self.dim = '2d' if q_input.is_2d else '1d' 
    438         # plus three for the normalization values 
    439         self.result = np.empty(q_input.nq+1, dtype) 
     444 
     445        # holding place for the returned value 
     446        nout = 2 if self.info.have_Fq and self.dim == '1d' else 1 
     447        extra_q = 4  # total weight, form volume, shell volume and R_eff 
     448        self.result = np.empty(self.q_input.nq*nout+extra_q, dtype) 
    440449 
    441450        # Inputs and outputs for each kernel call 
    442451        # Note: res may be shorter than res_b if global_size != nq 
    443         self.result_b = cuda.mem_alloc(q_input.global_size[0] * dtype.itemsize) 
    444         self.q_input = q_input # allocated by GpuInput above 
    445  
     452        width = ((self.result.size+31)//32)*32 * self.dtype.itemsize 
     453        self.result_b = cuda.mem_alloc(width) 
    446454        self._need_release = [self.result_b] 
    447         self.real = (np.float32 if dtype == generate.F32 
    448                      else np.float64 if dtype == generate.F64 
    449                      else np.float16 if dtype == generate.F16 
    450                      else np.float32)  # will never get here, so use np.float32 
    451  
    452     def __call__(self, call_details, values, cutoff, magnetic): 
     455 
     456    def _call_kernel(self, call_details, values, cutoff, magnetic, effective_radius_type): 
    453457        # type: (CallDetails, np.ndarray, np.ndarray, float, bool) -> np.ndarray 
    454458        # Arrange data transfer to card 
     
    460464            np.uint32(self.q_input.nq), None, None, 
    461465            details_b, values_b, self.q_input.q_b, self.result_b, 
    462             self.real(cutoff), 
     466            self._as_dtype(cutoff), 
     467            np.uint32(effective_radius_type), 
    463468        ] 
    464469        grid = partition(self.q_input.nq) 
     
    488493        values_b.free() 
    489494 
    490         pd_norm = self.result[self.q_input.nq] 
    491         scale = values[0]/(pd_norm if pd_norm != 0.0 else 1.0) 
    492         background = values[1] 
    493         #print("scale",scale,values[0],self.result[self.q_input.nq],background) 
    494         return scale*self.result[:self.q_input.nq] + background 
    495         # return self.result[:self.q_input.nq] 
    496  
    497495    def release(self): 
    498496        # type: () -> None 
Note: See TracChangeset for help on using the changeset viewer.