xref: /illumos-kvm-cmd/json-lexer.c (revision 68396ea9)
1 /*
2  * JSON lexer
3  *
4  * Copyright IBM, Corp. 2009
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10  * See the COPYING.LIB file in the top-level directory.
11  *
12  */
13 
14 #include "qstring.h"
15 #include "qlist.h"
16 #include "qdict.h"
17 #include "qint.h"
18 #include "qemu-common.h"
19 #include "json-lexer.h"
20 
21 /*
22  * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
23  * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
24  * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
25  * [{}\[\],:]
26  * [a-z]+
27  *
28  */
29 
/*
 * States of the lexer's table-driven state machine.  Each constant is used
 * as a row index into json_lexer[].
 *
 * ERROR must remain 0: table entries that are not explicitly initialized
 * are zero, so any transition the table does not list lands in ERROR.
 */
enum json_lexer_state {
    ERROR = 0,

    /* inside a "..." string; UCODE0..UCODE3 walk the four hex digits of \u */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* inside a '...' string; same layout as the double-quoted states */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* bare lowercase words */
    IN_KEYWORD,

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,

    /* runs of blanks between tokens */
    IN_WHITESPACE,

    /* initial state, and the state re-entered after every token */
    IN_START,
};
62 
/* Row initializer: map every 7-bit ASCII input (0x00-0x7F) to NEXT.  Rows
   that use it list their per-character exceptions after it in the same
   initializer.  */
#define TERMINAL(next) [0 ... 0x7F] = (next)

/* Evaluate to non-zero when entry 0 of row FROM in json_lexer[] equals TO.
   For rows built with TERMINAL this means TO is that row's terminal state,
   i.e. the transition was triggered by a lookahead character that does not
   belong to the token being finished.  Rows that never set entry 0 hold
   ERROR (0) there, so this is also true for their transitions to ERROR.  */
#define TERMINAL_NEEDED_LOOKAHEAD(from, to) \
    (json_lexer[(from)][0] == (to))
70 
71 static const uint8_t json_lexer[][256] =  {
72     /* double quote string */
73     [IN_DQ_UCODE3] = {
74         ['0' ... '9'] = IN_DQ_STRING,
75         ['a' ... 'f'] = IN_DQ_STRING,
76         ['A' ... 'F'] = IN_DQ_STRING,
77     },
78     [IN_DQ_UCODE2] = {
79         ['0' ... '9'] = IN_DQ_UCODE3,
80         ['a' ... 'f'] = IN_DQ_UCODE3,
81         ['A' ... 'F'] = IN_DQ_UCODE3,
82     },
83     [IN_DQ_UCODE1] = {
84         ['0' ... '9'] = IN_DQ_UCODE2,
85         ['a' ... 'f'] = IN_DQ_UCODE2,
86         ['A' ... 'F'] = IN_DQ_UCODE2,
87     },
88     [IN_DQ_UCODE0] = {
89         ['0' ... '9'] = IN_DQ_UCODE1,
90         ['a' ... 'f'] = IN_DQ_UCODE1,
91         ['A' ... 'F'] = IN_DQ_UCODE1,
92     },
93     [IN_DQ_STRING_ESCAPE] = {
94         ['b'] = IN_DQ_STRING,
95         ['f'] =  IN_DQ_STRING,
96         ['n'] =  IN_DQ_STRING,
97         ['r'] =  IN_DQ_STRING,
98         ['t'] =  IN_DQ_STRING,
99         ['/'] = IN_DQ_STRING,
100         ['\\'] = IN_DQ_STRING,
101         ['\''] = IN_DQ_STRING,
102         ['\"'] = IN_DQ_STRING,
103         ['u'] = IN_DQ_UCODE0,
104     },
105     [IN_DQ_STRING] = {
106         [1 ... 0xFF] = IN_DQ_STRING,
107         ['\\'] = IN_DQ_STRING_ESCAPE,
108         ['"'] = JSON_STRING,
109     },
110 
111     /* single quote string */
112     [IN_SQ_UCODE3] = {
113         ['0' ... '9'] = IN_SQ_STRING,
114         ['a' ... 'f'] = IN_SQ_STRING,
115         ['A' ... 'F'] = IN_SQ_STRING,
116     },
117     [IN_SQ_UCODE2] = {
118         ['0' ... '9'] = IN_SQ_UCODE3,
119         ['a' ... 'f'] = IN_SQ_UCODE3,
120         ['A' ... 'F'] = IN_SQ_UCODE3,
121     },
122     [IN_SQ_UCODE1] = {
123         ['0' ... '9'] = IN_SQ_UCODE2,
124         ['a' ... 'f'] = IN_SQ_UCODE2,
125         ['A' ... 'F'] = IN_SQ_UCODE2,
126     },
127     [IN_SQ_UCODE0] = {
128         ['0' ... '9'] = IN_SQ_UCODE1,
129         ['a' ... 'f'] = IN_SQ_UCODE1,
130         ['A' ... 'F'] = IN_SQ_UCODE1,
131     },
132     [IN_SQ_STRING_ESCAPE] = {
133         ['b'] = IN_SQ_STRING,
134         ['f'] =  IN_SQ_STRING,
135         ['n'] =  IN_SQ_STRING,
136         ['r'] =  IN_SQ_STRING,
137         ['t'] =  IN_SQ_STRING,
138         ['/'] = IN_DQ_STRING,
139         ['\\'] = IN_DQ_STRING,
140         ['\''] = IN_SQ_STRING,
141         ['\"'] = IN_SQ_STRING,
142         ['u'] = IN_SQ_UCODE0,
143     },
144     [IN_SQ_STRING] = {
145         [1 ... 0xFF] = IN_SQ_STRING,
146         ['\\'] = IN_SQ_STRING_ESCAPE,
147         ['\''] = JSON_STRING,
148     },
149 
150     /* Zero */
151     [IN_ZERO] = {
152         TERMINAL(JSON_INTEGER),
153         ['0' ... '9'] = ERROR,
154         ['.'] = IN_MANTISSA,
155     },
156 
157     /* Float */
158     [IN_DIGITS] = {
159         TERMINAL(JSON_FLOAT),
160         ['0' ... '9'] = IN_DIGITS,
161     },
162 
163     [IN_DIGIT] = {
164         ['0' ... '9'] = IN_DIGITS,
165     },
166 
167     [IN_EXP_E] = {
168         ['-'] = IN_DIGIT,
169         ['+'] = IN_DIGIT,
170         ['0' ... '9'] = IN_DIGITS,
171     },
172 
173     [IN_MANTISSA_DIGITS] = {
174         TERMINAL(JSON_FLOAT),
175         ['0' ... '9'] = IN_MANTISSA_DIGITS,
176         ['e'] = IN_EXP_E,
177         ['E'] = IN_EXP_E,
178     },
179 
180     [IN_MANTISSA] = {
181         ['0' ... '9'] = IN_MANTISSA_DIGITS,
182     },
183 
184     /* Number */
185     [IN_NONZERO_NUMBER] = {
186         TERMINAL(JSON_INTEGER),
187         ['0' ... '9'] = IN_NONZERO_NUMBER,
188         ['e'] = IN_EXP_E,
189         ['E'] = IN_EXP_E,
190         ['.'] = IN_MANTISSA,
191     },
192 
193     [IN_NEG_NONZERO_NUMBER] = {
194         ['0'] = IN_ZERO,
195         ['1' ... '9'] = IN_NONZERO_NUMBER,
196     },
197 
198     /* keywords */
199     [IN_KEYWORD] = {
200         TERMINAL(JSON_KEYWORD),
201         ['a' ... 'z'] = IN_KEYWORD,
202     },
203 
204     /* whitespace */
205     [IN_WHITESPACE] = {
206         TERMINAL(JSON_SKIP),
207         [' '] = IN_WHITESPACE,
208         ['\t'] = IN_WHITESPACE,
209         ['\r'] = IN_WHITESPACE,
210         ['\n'] = IN_WHITESPACE,
211     },
212 
213     /* escape */
214     [IN_ESCAPE_LL] = {
215         ['d'] = JSON_ESCAPE,
216     },
217 
218     [IN_ESCAPE_L] = {
219         ['d'] = JSON_ESCAPE,
220         ['l'] = IN_ESCAPE_LL,
221     },
222 
223     [IN_ESCAPE_I64] = {
224         ['d'] = JSON_ESCAPE,
225     },
226 
227     [IN_ESCAPE_I6] = {
228         ['4'] = IN_ESCAPE_I64,
229     },
230 
231     [IN_ESCAPE_I] = {
232         ['6'] = IN_ESCAPE_I6,
233     },
234 
235     [IN_ESCAPE] = {
236         ['d'] = JSON_ESCAPE,
237         ['i'] = JSON_ESCAPE,
238         ['p'] = JSON_ESCAPE,
239         ['s'] = JSON_ESCAPE,
240         ['f'] = JSON_ESCAPE,
241         ['l'] = IN_ESCAPE_L,
242         ['I'] = IN_ESCAPE_I,
243     },
244 
245     /* top level rule */
246     [IN_START] = {
247         ['"'] = IN_DQ_STRING,
248         ['\''] = IN_SQ_STRING,
249         ['0'] = IN_ZERO,
250         ['1' ... '9'] = IN_NONZERO_NUMBER,
251         ['-'] = IN_NEG_NONZERO_NUMBER,
252         ['{'] = JSON_OPERATOR,
253         ['}'] = JSON_OPERATOR,
254         ['['] = JSON_OPERATOR,
255         [']'] = JSON_OPERATOR,
256         [','] = JSON_OPERATOR,
257         [':'] = JSON_OPERATOR,
258         ['a' ... 'z'] = IN_KEYWORD,
259         ['%'] = IN_ESCAPE,
260         [' '] = IN_WHITESPACE,
261         ['\t'] = IN_WHITESPACE,
262         ['\r'] = IN_WHITESPACE,
263         ['\n'] = IN_WHITESPACE,
264     },
265 };
266 
/*
 * Set up LEXER for a fresh input stream.
 *
 * The position counters are zeroed, an empty token buffer is allocated with
 * qstring_new(), the state machine is placed in IN_START, and FUNC is stored
 * as the callback invoked for each completed token.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->x = 0;
    lexer->y = 0;
    lexer->token = qstring_new();
    lexer->state = IN_START;
    lexer->emit = func;
}
274 
/*
 * Advance the state machine by one input character.
 *
 * lexer->x is incremented for every character; on '\n' it is reset to 0 and
 * lexer->y is incremented.  These values are passed to the emit callback.
 *
 * The character is looked up in json_lexer[] from the current state.  When
 * the result is a JSON_* token type the accumulated token is handed to
 * lexer->emit() (JSON_SKIP tokens are dropped silently), the token buffer
 * is replaced by a fresh one and the machine returns to IN_START.  If the
 * character only served as lookahead to end the token, it is not appended
 * and the loop runs again so that it is examined from IN_START.
 *
 * Returns 0 on success, or -EINVAL when the table yields ERROR.  On error
 * the partial token is discarded and the lexer is put back into IN_START,
 * so that one invalid character cannot leave the lexer wedged in a state
 * from which every subsequent character fails as well.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
{
    int char_consumed, new_state;

    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    do {
        /* Cast through uint8_t: plain char may be signed, and the table is
           indexed 0..255. */
        new_state = json_lexer[lexer->state][(uint8_t)ch];
        char_consumed = !TERMINAL_NEEDED_LOOKAHEAD(lexer->state, new_state);
        if (char_consumed) {
            qstring_append_chr(lexer->token, ch);
        }

        switch (new_state) {
        case JSON_OPERATOR:
        case JSON_ESCAPE:
        case JSON_INTEGER:
        case JSON_FLOAT:
        case JSON_KEYWORD:
        case JSON_STRING:
            lexer->emit(lexer, lexer->token, new_state, lexer->x, lexer->y);
            /* fall through */
        case JSON_SKIP:
            /* Token finished (or skipped): start a new, empty one. */
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            new_state = IN_START;
            break;
        case ERROR:
            /* Drop the partial token and restart from a clean state before
               reporting the error to the caller. */
            QDECREF(lexer->token);
            lexer->token = qstring_new();
            lexer->state = IN_START;
            return -EINVAL;
        default:
            break;
        }
        lexer->state = new_state;
    } while (!char_consumed);
    return 0;
}
314 
/*
 * Run SIZE bytes from BUFFER through the lexer, one character at a time.
 *
 * Stops at the first character that json_lexer_feed_char() rejects and
 * returns that negative error code; the remaining bytes are not examined.
 * Returns 0 when the whole buffer has been processed.
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    size_t pos = 0;

    while (pos < size) {
        int rc = json_lexer_feed_char(lexer, buffer[pos]);

        if (rc < 0) {
            return rc;
        }
        pos++;
    }

    return 0;
}
330 
/*
 * Signal end of input to the lexer.
 *
 * Nothing is pending when the state machine is in IN_START, so 0 is
 * returned right away.  Otherwise a NUL character is pushed through
 * json_lexer_feed_char() and its result is returned.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    if (lexer->state == IN_START) {
        return 0;
    }

    return json_lexer_feed_char(lexer, 0);
}
335 
/*
 * Release the resources held by LEXER.
 *
 * Drops the lexer's reference to its token buffer.  The JSONLexer structure
 * itself is not freed here.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
    /* Do not leave a dangling pointer behind in the caller-owned struct. */
    lexer->token = NULL;
}
340