Ticket #508: load_otoko.py

File load_otoko.py, 5.9 KB (added by smk78, 8 years ago)

Peter Parker's Python routine for loading OTOKO format data files

Line 
1"""
2Here we handle loading of "OTOKO" data (for more info about this format see
3the comment in load_otoko_data).  Given the paths of header and data files, we
4aim to load the data into numpy arrays for use later.
5"""
6
7import itertools
8import os
9import struct
10import numpy as np
11
class CStyleStruct:
    """
    Lightweight record type: every keyword argument passed to the
    constructor becomes an attribute of the new instance.
    """
    def __init__(self, **fields):
        # Writing straight into the instance namespace stores all of the
        # supplied fields in one go.
        vars(self).update(fields)
16
class OtokoParsingError(Exception):
    """Raised when an OTOKO header or data file cannot be loaded."""
19
class OtokoData:
    """Holds the two axes (q axis and data axis) of a loaded data set."""
    def __init__(self, q_axis, data_axis):
        # Both axes are stored exactly as given; nothing is copied or checked.
        self.q_axis, self.data_axis = q_axis, data_axis
24
def load_otoko_data(q_axis_header_path, data_axis_header_path):
    """
    Load a complete "OTOKO" data set from its two axis header files.

    OTOKO stores each axis separately.  An axis is described by a "header"
    file, which in turn names one or more binary files holding the actual
    values.

    q_axis_header_path    -- path of the header file for the q axis.
    data_axis_header_path -- path of the header file for the data axis.

    Returns an OtokoData instance holding both loaded axes.  If either axis
    cannot be loaded, the exception raised while loading it propagates to
    the caller.

    For more information on the OTOKO file format, please see:
    http://www.diamond.ac.uk/Home/Beamlines/small-angle/SAXS-Software/CCP13/
    XOTOKO.html

    The BSL format, which is based on OTOKO, is also supported.  Find out more
    about the BSL format at http://www.diamond.ac.uk/Home/Beamlines/small-angle
    /SAXS-Software/CCP13/BSL.html.
    """
    # Arguments are evaluated left to right, so the q axis is loaded (and any
    # problem with it reported) before the data axis is touched.
    return OtokoData(
        _load_otoko_axis(q_axis_header_path),
        _load_otoko_axis(data_axis_header_path))
47
def _load_otoko_axis(header_path):
    """
    Loads an "OTOKO" axis, given the header file path.  Essentially, the
    header file contains information about the data in the form of integer
    "indicators", as well as the names of each of the binary files which are
    assumed to be in the same directory as the header.

    Returns a CStyleStruct with the following attributes:
        header_path           -- the path that was passed in.
        data                  -- numpy array of shape
                                 (total number of frames, number of channels).
        binary_file_info_list -- one CStyleStruct per binary file listed in
                                 the header, in header order.
        header_info           -- the first two lines of the header file,
                                 concatenated.

    Raises OtokoParsingError if the header or one of the binary files does
    not exist, if the header is malformed, if the binary files disagree on
    the number of channels, or if a binary file holds less data than its
    header entry promises.
    """
    header_info, binary_file_info_list = _parse_otoko_header(header_path)

    # Check that all binary files are listed in the header as having the same
    # number of channels, since I don't think CorFunc can handle ragged data.
    n_channels = binary_file_info_list[0].n_channels
    if any(f.n_channels != n_channels for f in binary_file_info_list):
        raise OtokoParsingError(
            "Expected all binary files listed in %s to have the same number of channels." % header_path)

    total_frames = sum(f.n_frames for f in binary_file_info_list)
    data = np.zeros(shape=(total_frames, n_channels))

    # The frames of each binary file are stacked one after another, in the
    # order in which the files are listed in the header.
    frames_so_far = 0
    for file_info in binary_file_info_list:
        next_frame = frames_so_far + file_info.n_frames
        data[frames_so_far:next_frame] = _read_otoko_binary_file(file_info)
        frames_so_far = next_frame

    return CStyleStruct(
        header_path = header_path,
        data = data,
        binary_file_info_list = binary_file_info_list,
        header_info = header_info
    )


def _parse_otoko_header(header_path):
    """
    Reads an OTOKO header file and returns a (header_info, file_info_list)
    tuple: the first two lines of the file joined into one string, and a list
    with one CStyleStruct for every (indicator line, file name line) pair
    that follows them.

    Raises OtokoParsingError if the file does not exist, has fewer than four
    lines, or contains an indicator line that is not made up of exactly ten
    non-negative integers.
    """
    if not os.path.exists(header_path):
        raise OtokoParsingError("The header file %s does not exist." % header_path)

    header_dir = os.path.dirname(os.path.abspath(header_path))

    with open(header_path, "r") as header_file:
        lines = header_file.readlines()

    if len(lines) < 4:
        raise OtokoParsingError("Expected more lines in %s." % header_path)

    header_info = lines[0] + lines[1]
    file_info_list = []

    # From the third line onwards the header alternates between a line of
    # indicators and a line naming a binary file.  zip() stops at the shorter
    # slice, so an unpaired trailing line is ignored.
    entries = zip(lines[2::2], lines[3::2])
    for entry_index, (indicator_line, filename) in enumerate(entries):
        # 1-based line number of this indicator line, for error messages.
        line_number = 3 + 2 * entry_index
        indicators = indicator_line.split()

        if len(indicators) != 10:
            raise OtokoParsingError(
                "Expected 10 integer indicators on line %d of %s."
                % (line_number, header_path))
        if not all(i.isdigit() for i in indicators):
            raise OtokoParsingError(
                "Expected all indicators on line %d of %s to be integers."
                % (line_number, header_path))

        file_info_list.append(CStyleStruct(
            # The indicators at indices 4 to 8 are always zero since they
            # have been reserved for future use by the format.  Also, the
            # "last_file" indicator seems to be there for legacy reasons,
            # as it doesn't appear to be something we have to bother
            # enforcing correct use of; we just define the last file as
            # being the last file in the list.
            file_path  = os.path.join(header_dir, filename.strip()),
            n_channels = int(indicators[0]),
            n_frames   = int(indicators[1]),
            dimensions = int(indicators[2]),
            swap_bytes = int(indicators[3]) == 0,
            last_file  = int(indicators[9]) == 0 # We don't use this.
        ))

    return header_info, file_info_list


def _read_otoko_binary_file(file_info):
    """
    Reads the binary file described by file_info (one of the structs built
    by _parse_otoko_header) and returns its contents as a numpy array of
    shape (n_frames, n_channels).

    Raises OtokoParsingError if the file does not exist or holds fewer
    values than n_frames * n_channels.
    """
    if not os.path.exists(file_info.file_path):
        raise OtokoParsingError(
            "The data file %s does not exist." % file_info.file_path)

    # Values are stored as 4-byte floats in native byte order, unless the
    # swap flag is set, in which case the bytes of each float occur in
    # reverse order.  A byte-swapped dtype lets numpy undo that while
    # decoding the whole file at once.
    float_type = np.dtype(np.float32)
    if file_info.swap_bytes:
        float_type = float_type.newbyteorder("S")

    shape = (file_info.n_frames, file_info.n_channels)
    n_values = file_info.n_frames * file_info.n_channels
    n_bytes = n_values * float_type.itemsize

    with open(file_info.file_path, "rb") as binary_file:
        raw = binary_file.read(n_bytes)

    if len(raw) != n_bytes:
        raise OtokoParsingError(
            "The data file %s contains fewer than the %d bytes expected "
            "from its header entry." % (file_info.file_path, n_bytes))

    if n_values == 0:
        # Nothing to decode; hand back an empty block of the right shape.
        return np.zeros(shape=shape)

    return np.frombuffer(raw, dtype=float_type).reshape(shape)