1 | """ |
---|
2 | Read a relaxed JSON file. |
---|
3 | |
---|
Relaxed JSON allows comments (introduced by // and going to the end
of the line, or enclosed in /* ... */), optional quotes on key names in
dictionaries and optional trailing commas at the ends of lists and
dictionaries.  It also strips the leading characters up to the first '{'.
Multiline strings can be formatted using "\\n\\" at the end of each line.
---|
9 | |
---|
10 | If the file contains e.g., "var _ = {...}", then it can be edited with |
---|
11 | a JavaScript aware editor such as VJET for eclipse, and it will be easier |
---|
12 | to locate errors and adjust formatting. |
---|
13 | """ |
---|
14 | import re |
---|
15 | import json |
---|
16 | from contextlib import contextmanager |
---|
17 | |
---|
18 | try: |
---|
19 | from collections import OrderedDict |
---|
20 | except: |
---|
21 | from ordered_dict import OrderedDict |
---|
22 | |
---|
23 | |
---|
# Patterns used to rewrite relaxed JSON into strict JSON.

# Everything from the start of the text through the first '{' (non-greedy;
# DOTALL lets '.' cross newlines), e.g. a leading "var _ = ".
_LEADING_TEXT = re.compile(r'^.*?[{]', re.DOTALL)
# A backslash followed by optional whitespace ending in a newline; removing
# it joins a continued line with the next one.
_LINE_CONTINUATION = re.compile(r'\\\s*\n')
# Optional whitespace, '//' and the rest of the line, including the newline.
# NOTE: the pattern knows nothing about string literals, so '//' inside a
# quoted string (e.g. "http://...") is matched as a comment too.
_TRAILING_COMMENT = re.compile(r'(?P<comment>\s*//.*?)\n')
# A /* ... */ comment, which may span several lines.
_MULTILINE_COMMENT = re.compile(r'(?P<comment>/\*.*?\*/)', re.DOTALL)
# A bare key name: preceded by ',' or '{', made of characters other than
# whitespace , { } : and ", and followed by ':'.
_UNQUOTED_FIELDNAME = re.compile(r'(?P<prefix>[,{]\s*)(?P<key>[^\s,{}:"]+)(?P<tail>\s*:)')
# A comma followed only by whitespace before a closing ']' or '}'.
_TRAILING_COMMA = re.compile(r',(?P<tail>\s*[]}])')
---|
30 | |
---|
def relaxed_load(path, **kw):
    """
    Read the file at *path* and parse its contents as relaxed JSON.

    Keyword arguments are passed through unchanged to relaxed_loads().
    """
    # Use a context manager so the file handle is closed as soon as the
    # text has been read, rather than whenever the object is collected.
    with open(path) as fid:
        text = fid.read()
    return relaxed_loads(text, **kw)
---|
33 | |
---|
def relaxed_loads(text, **kw):
    """
    Parse and return a relaxed JSON string.

    *text* is rewritten into strict JSON by a fixed sequence of regular
    expression substitutions (join continued lines, strip ``//`` and
    ``/* */`` comments, drop everything before the first ``{``, quote bare
    key names, remove trailing commas) and then passed to ``json.loads``.

    If the keyword *ordered* is true, ``object_pairs_hook`` is set to
    ``OrderedDict``.  Unless the caller supplies one, ``object_hook``
    defaults to ``decode_dict_as_str``.  All remaining keyword arguments
    are forwarded to ``json.loads``.

    Raises ``ValueError`` if the translated text is not valid JSON.  When
    the decoder reports a position, the message is extended with the
    offending line, its neighbours and a ``^`` marker under the column.
    Line and column numbers refer to the translated text, not the input.
    """
    ordered = kw.pop('ordered', False)
    if ordered:
        kw['object_pairs_hook'] = OrderedDict
    # Let an explicit object_hook from the caller win; passing the keyword
    # twice to json.loads would be a TypeError.
    kw.setdefault('object_hook', decode_dict_as_str)

    # TODO: need a little state machine that performs the translation so that
    # TODO: line and column numbers are preserved, and so that we can have
    # TODO: http:// in a string (instead of it being treated like a comment).
    text = _LINE_CONTINUATION.sub('', text)    # join continued lines
    text = _TRAILING_COMMENT.sub(r'\n', text)  # strip // comments
    text = _MULTILINE_COMMENT.sub(r'', text)   # strip /* */ comments
    text = _LEADING_TEXT.sub('{', text)        # drop text before first '{'
    text = _UNQUOTED_FIELDNAME.sub(r'\g<prefix>"\g<key>"\g<tail>', text)
    text = _TRAILING_COMMA.sub(r'\g<tail>', text)

    try:
        return json.loads(text, **kw)
    except ValueError as exc:
        msg = [str(exc)]
        where = re.search('line ([0-9]+) column ([0-9]+)', msg[0])
        if where:
            line, col = int(where.group(1)), int(where.group(2))
            lines = text.split("\n")
            # Show the line before, the failing line with a column marker,
            # and the line after, where they exist.
            if line >= 2:
                msg.append(lines[line-2])
            if line >= 1:
                msg.append(lines[line-1])
            msg.append(" "*(col-1) + "^")
            if line < len(lines):
                msg.append(lines[line])
        # Raise a plain ValueError: subclasses such as json.JSONDecodeError
        # cannot be constructed from a message alone, and callers catching
        # ValueError are unaffected.
        raise ValueError("\n".join(msg))
---|
70 | |
---|
71 | @contextmanager |
---|
72 | def float_format(formatstr='.15g'): |
---|
73 | """ |
---|
74 | Allow the float format to be changed for a json encoding action. |
---|
75 | |
---|
76 | This is a context manager, and should be used for example as:: |
---|
77 | |
---|
78 | >>> with float_format('.2g'): |
---|
79 | >>> print json.dumps(sqrt(2)) |
---|
80 | 1.41 |
---|
81 | """ |
---|
82 | formatter = json.encoder.FLOAT_REPR |
---|
83 | json.encoder.FLOAT_REPR = lambda o: format(o, formatstr) |
---|
84 | yield |
---|
85 | json.encoder.FLOAT_REPR = formatter |
---|
86 | |
---|
def numpy_encoder(o):
    """
    JSON encoder for numpy data.

    To automatically convert numpy data to lists when writing a datastream
    use ``json.dumps(object, default=numpy_encoder)``.

    Returns ``o.tolist()``.  Raises ``TypeError`` naming the type of *o*
    if it has no ``tolist`` attribute.
    """
    # Only the attribute lookup is guarded, so an AttributeError raised
    # inside tolist() itself is not mistaken for "cannot be encoded".
    try:
        tolist = o.tolist
    except AttributeError:
        raise TypeError("Object of type %s is not JSON serializable"
                        % type(o).__name__)
    return tolist()
---|
98 | |
---|
99 | |
---|
100 | def _decode_list(lst): |
---|
101 | newlist = [] |
---|
102 | for i in lst: |
---|
103 | if isinstance(i, unicode): |
---|
104 | i = i.encode('utf-8') |
---|
105 | elif isinstance(i, list): |
---|
106 | i = _decode_list(i) |
---|
107 | newlist.append(i) |
---|
108 | return newlist |
---|
109 | |
---|
def decode_dict_as_str(dct):
    """
    Return a copy of *dct* with text keys and values as the native ``str``.

    Intended as an ``object_hook`` for ``json.loads``.  On Python 2,
    ``unicode`` keys and values are encoded as UTF-8 byte strings.  On
    Python 3 text is already ``str`` and is left alone.  List values are
    converted with ``_decode_list``; all other values, including nested
    dicts, are copied through unchanged.
    """
    # type(u"") is unicode on Python 2 and str on Python 3; naming the
    # ``unicode`` builtin directly would be a NameError on Python 3.
    text_type = type(u"")
    needs_encoding = text_type is not str
    newdict = {}
    # items() exists on both Python 2 and 3, unlike iteritems().
    for key, value in dct.items():
        if needs_encoding and isinstance(key, text_type):
            key = key.encode('utf-8')
        if needs_encoding and isinstance(value, text_type):
            value = value.encode('utf-8')
        elif isinstance(value, list):
            value = _decode_list(value)
        newdict[key] = value
    return newdict
---|
121 | |
---|
122 | |
---|
def test():
    """
    Verify that the translation from pseudo-JSON to JSON works.

    Parses one well-formed relaxed JSON sample and checks a few values,
    then checks that a sample with a missing comma raises ValueError.
    """
    good = """\
// This is a source definition with no errors
var entry = {
field : { // A comment about the field
"field" : "te\\
x\\
t",
other$field : 56,
},
/*
multiline comment
*/
secondfield : {
content: ["string", "string"], /* a second comment */
content: [{name:"good", value:3, URL:"http:\\/\\/my.url.com"},]
},
}
"""
    broken = """\
// This is a source definition with a missing comma
{
field : { // A comment about the field
field : "te\\
x\\
t"
other$field : 56,
},
/*
multiline comment
*/
secondfield : {
content: ["string", "string"], /* a second comment */
content: [{name:"good", value:3},]
},
}
"""
    result = relaxed_loads(good)
    assert result['field']['field'] == "text"
    assert result['field']['other$field'] == 56
    assert result['secondfield']['content'][0]['name'] == 'good'
    # "except ValueError:" is valid on both Python 2 and 3, unlike the
    # comma form, and avoids binding a name that is never used.
    try:
        relaxed_loads(broken)
    except ValueError:
        pass
    else:
        raise Exception("No exception raised in broken")
---|
170 | |
---|
# Run the self-test when this module is executed as a script.
if __name__ == "__main__":
    test()
---|