 * be conservative in what you do, be liberal in what you accept from others
 *
 * <p>
 * Applied to JSON-like content from others, it will produce well-formed JSON
 * that should satisfy any parser you use.
 * <p>
 * Applied to your output before you send, it will coerce minor mistakes in
 * encoding and make it easier to embed your JSON in HTML and XML.
 *
 * <li><code>{foo:"bar"}</code> Unquoted property names are quoted.
 * <li><code>//comments</code> JS style line and block comments are removed.
 * <li><code>(...)</code> Grouping parentheses are removed.
 * <code>eval</code> builtin (after being wrapped in parentheses)
 * or by <code>JSON.parse</code>.
 * Specifically, the output will not contain any string literals with embedded
 * JS newlines (U+2028 Line separator or U+2029 Paragraph separator).
 * <code>eval</code> will
 * have no side-effects and no free variables, so is neither a code-injection
 * vector, nor a vector for exfiltration of secrets.
78 | *
 * <p>This library only ensures that the JSON string &rarr; Javascript object
 * phase has no side effects and resolves no free variables, and cannot control
 * how other client side code later interprets the resulting Javascript object.
 * So if client-side code takes a part of the parsed data that is controlled by
 * an attacker and passes it back through a powerful interpreter like
 * {@code eval} or {@code innerHTML} then that client-side code might suffer
 * unintended side-effects.
 *
 * <p>The sanitize method takes O(n) time where n is the length of the input
 * in UTF-16 code-units.
94 | */
95 | public final class JsonSanitizer {
96 |
97 | /** The default for the maximumNestingDepth constructor parameter. */
98 | public static final int DEFAULT_NESTING_DEPTH = 64;
99 |
100 | /** The maximum value for the maximumNestingDepth constructor parameter. */
101 | public static final int MAXIMUM_NESTING_DEPTH = 4096;
102 |
103 | /**
104 | * Given JSON-like content, produces a string of JSON that is safe to embed,
105 | * safe to pass to JavaScript's {@code eval} operator.
106 | *
107 | * @param jsonish JSON-like content.
108 | * @return embeddable JSON
109 | */
110 | public static String sanitize(String jsonish) {
111 | return sanitize(jsonish, DEFAULT_NESTING_DEPTH);
112 | }
113 |
114 | /**
115 | * Same as {@link JsonSanitizer#sanitize(String)}, but allows to set a custom
116 | * maximum nesting depth.
117 | *
118 | * @param jsonish JSON-like content.
119 | * @param maximumNestingDepth maximum nesting depth.
120 | * @return embeddable JSON
121 | */
122 | public static String sanitize(String jsonish, int maximumNestingDepth) {
123 | JsonSanitizer s = new JsonSanitizer(jsonish, maximumNestingDepth);
124 | s.sanitize();
125 | return s.toString();
126 | }
127 |
128 | /**
129 | * Describes where we are in a state machine that consists of transitions on
130 | * complete values, colons, commas, and brackets.
131 | */
132 | private enum State {
133 | /**
134 | * Immediately after '[' and
135 | * {@link #BEFORE_ELEMENT before the first element}.
136 | */
137 | START_ARRAY,
138 | /** Before a JSON value in an array or at the top level. */
139 | BEFORE_ELEMENT,
140 | /**
141 | * After a JSON value in an array or at the top level, and before any
142 | * following comma or close bracket.
143 | */
144 | AFTER_ELEMENT,
145 | /** Immediately after '{' and {@link #BEFORE_KEY before the first key}. */
146 | START_MAP,
147 | /** Before a key in a key-value map. */
148 | BEFORE_KEY,
149 | /** After a key in a key-value map but before the required colon. */
150 | AFTER_KEY,
151 | /** Before a value in a key-value map. */
152 | BEFORE_VALUE,
153 | /**
154 | * After a value in a key-value map but before any following comma or
155 | * close bracket.
156 | */
157 | AFTER_VALUE,
158 | ;
159 | }
160 |
161 | /**
162 | * The maximum nesting depth. According to RFC4627 it is implementation-specific.
163 | */
164 | private final int maximumNestingDepth;
165 |
166 | private final String jsonish;
167 |
168 | /**
169 | * The number of brackets that have been entered and not subsequently exited.
170 | * Also, the length of the used prefix of {@link #isMap}.
171 | */
172 | private int bracketDepth;
173 | /**
174 | * {@code isMap[i]} when {@code 0 <= i && i < bracketDepth} is true iff
175 | * the i-th open bracket was a '{'
, not a '['
.
176 | */
177 | private boolean[] isMap;
178 | /**
179 | * If non-null, then contains the sanitized form of
180 | * {@code jsonish.substring(0, cleaned)}.
181 | * If {@code null}, then no unclean constructs have been found in
182 | * {@code jsonish} yet.
183 | */
184 | private StringBuilder sanitizedJson;
185 | /**
186 | * The length of the prefix of {@link #jsonish} that has been written onto
187 | * {@link #sanitizedJson}.
188 | */
189 | private int cleaned;
190 |
191 | private static final boolean SUPER_VERBOSE_AND_SLOW_LOGGING = false;
192 |
/**
 * Creates a sanitizer for the given input that uses
 * {@link #DEFAULT_NESTING_DEPTH} as its nesting limit.
 *
 * @param jsonish JSON-like content; may be null.
 */
JsonSanitizer(String jsonish) {
  this(jsonish, DEFAULT_NESTING_DEPTH);
}
196 |
/**
 * Creates a sanitizer for the given input with a caller-chosen nesting limit.
 *
 * @param jsonish JSON-like content; a null input is treated as the text
 *     "null".
 * @param maximumNestingDepth requested nesting limit; values below 1 are
 *     raised to 1 and values above {@link #MAXIMUM_NESTING_DEPTH} are lowered
 *     to it.
 */
JsonSanitizer(String jsonish, int maximumNestingDepth) {
  // Clamp the requested depth into [1, MAXIMUM_NESTING_DEPTH].
  int clampedDepth = maximumNestingDepth;
  if (clampedDepth < 1) {
    clampedDepth = 1;
  }
  if (clampedDepth > MAXIMUM_NESTING_DEPTH) {
    clampedDepth = MAXIMUM_NESTING_DEPTH;
  }
  this.maximumNestingDepth = clampedDepth;

  if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
    System.err.println("\n" + jsonish + "\n========");
  }

  if (jsonish == null) {
    this.jsonish = "null";
  } else {
    this.jsonish = jsonish;
  }
}
204 |
205 | int getMaximumNestingDepth() {
206 | return this.maximumNestingDepth;
207 | }
208 |
209 | void sanitize() {
210 | // Return to consistent state.
211 | bracketDepth = cleaned = 0;
212 | sanitizedJson = null;
213 |
214 | State state = State.START_ARRAY;
215 | int n = jsonish.length();
216 |
217 | // Walk over each token and either validate it, by just advancing i and
218 | // computing the next state, or manipulate cleaned&sanitizedJson so that
219 | // sanitizedJson contains the sanitized equivalent of
220 | // jsonish.substring(0, cleaned).
221 | token_loop:
222 | for (int i = 0; i < n; ++i) {
223 | try {
224 | char ch = jsonish.charAt(i);
225 | if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
226 | String sanitizedJsonStr =
227 | (sanitizedJson == null ? "" : sanitizedJson)
228 | + jsonish.substring(cleaned, i);
229 | System.err.println("i=" + i + ", ch=" + ch + ", state=" + state
230 | + ", sanitized=" + sanitizedJsonStr);
231 | }
232 | switch (ch) {
233 | case '\t': case '\n': case '\r': case ' ':
234 | break;
235 |
236 | case '"': case '\'':
237 | state = requireValueState(i, state, true);
238 | int strEnd = endOfQuotedString(jsonish, i);
239 | sanitizeString(i, strEnd);
240 | i = strEnd - 1;
241 | break;
242 |
243 | case '(': case ')':
244 | // Often JSON-like content which is meant for use by eval is
245 | // wrapped in parentheses so that the JS parser treats contained
246 | // curly brackets as part of an object constructor instead of a
247 | // block statement.
248 | // We elide these grouping parentheses to ensure valid JSON.
249 | elide(i, i + 1);
250 | break;
251 |
252 | case '{': case '[':
253 | state = requireValueState(i, state, false);
254 | if (isMap == null) {
255 | isMap = new boolean[maximumNestingDepth];
256 | }
257 | boolean map = ch == '{';
258 | isMap[bracketDepth] = map;
259 | ++bracketDepth;
260 | state = map ? State.START_MAP : State.START_ARRAY;
261 | break;
262 |
263 | case '}': case ']':
264 | if (bracketDepth == 0) {
265 | elide(i, jsonish.length());
266 | break token_loop;
267 | }
268 |
269 | // Strip trailing comma to convert {"a":0,} -> {"a":0}
270 | // and [1,2,3,] -> [1,2,3,]
271 | switch (state) {
272 | case BEFORE_VALUE:
273 | insert(i, "null");
274 | break;
275 | case BEFORE_ELEMENT: case BEFORE_KEY:
276 | elideTrailingComma(i);
277 | break;
278 | case AFTER_KEY:
279 | insert(i, ":null");
280 | break;
281 | case START_MAP: case START_ARRAY:
282 | case AFTER_ELEMENT: case AFTER_VALUE: break;
283 | }
284 |
285 | --bracketDepth;
286 | char closeBracket = isMap[bracketDepth] ? '}' : ']';
287 | if (ch != closeBracket) {
288 | replace(i, i + 1, closeBracket);
289 | }
290 | state = bracketDepth == 0 || !isMap[bracketDepth - 1]
291 | ? State.AFTER_ELEMENT : State.AFTER_VALUE;
292 | break;
293 | case ',':
294 | if (bracketDepth == 0) { throw UNBRACKETED_COMMA; }
295 | // Convert comma elisions like [1,,3] to [1,null,3].
296 | // [1,,3] in JS is an array that has no element at index 1
297 | // according to the "in" operator so accessing index 1 will
298 | // yield the special value "undefined" which is equivalent to
299 | // JS's "null" value according to "==".
300 | switch (state) {
301 | // Normal
302 | case AFTER_ELEMENT:
303 | state = State.BEFORE_ELEMENT;
304 | break;
305 | case AFTER_VALUE:
306 | state = State.BEFORE_KEY;
307 | break;
308 | // Array elision.
309 | case START_ARRAY: case BEFORE_ELEMENT:
310 | insert(i, "null");
311 | state = State.BEFORE_ELEMENT;
312 | break;
313 | // Ignore
314 | case START_MAP: case BEFORE_KEY:
315 | case AFTER_KEY:
316 | elide(i, i + 1);
317 | break;
318 | // Supply missing value.
319 | case BEFORE_VALUE:
320 | insert(i, "null");
321 | state = State.BEFORE_KEY;
322 | break;
323 | }
324 | break;
325 |
326 | case ':':
327 | if (state == State.AFTER_KEY) {
328 | state = State.BEFORE_VALUE;
329 | } else {
330 | elide(i, i + 1);
331 | }
332 | break;
333 |
334 | case '/':
335 | // Skip over JS-style comments since people like inserting them into
336 | // data files and getting huffy with Crockford when he says no to
337 | // versioning JSON to allow ignorable tokens.
338 | int end = i + 1;
339 | if (i + 1 < n) {
340 | switch (jsonish.charAt(i + 1)) {
341 | case '/':
342 | end = n; // Worst case.
343 | for (int j = i + 2; j < n; ++j) {
344 | char cch = jsonish.charAt(j);
345 | if (cch == '\n' || cch == '\r'
346 | || cch == '\u2028' || cch == '\u2029') {
347 | end = j + 1;
348 | break;
349 | }
350 | }
351 | break;
352 | case '*':
353 | end = n;
354 | if (i + 3 < n) {
355 | for (int j = i + 2;
356 | (j = jsonish.indexOf('/', j + 1)) >= 0;) {
357 | if (jsonish.charAt(j - 1) == '*') {
358 | end = j + 1;
359 | break;
360 | }
361 | }
362 | }
363 | break;
364 | }
365 | }
366 | elide(i, end);
367 | i = end - 1;
368 | break;
369 |
370 | default:
371 | // Three kinds of other values can occur.
372 | // 1. Numbers
373 | // 2. Keyword values ("false", "null", "true")
374 | // 3. Unquoted JS property names as in the JS expression
375 | // ({ foo: "bar"})
376 | // which is equivalent to the JSON
377 | // { "foo": "bar" }
378 | // 4. Cruft tokens like BOMs.
379 |
380 | // Look for a run of '.', [0-9], [a-zA-Z_$], [+-] which subsumes
381 | // all the above without including any JSON special characters
382 | // outside keyword and number.
383 | int runEnd;
384 | for (runEnd = i; runEnd < n; ++runEnd) {
385 | char tch = jsonish.charAt(runEnd);
386 | if (('a' <= tch && tch <= 'z') || ('0' <= tch && tch <= '9')
387 | || tch == '+' || tch == '-' || tch == '.'
388 | || ('A' <= tch && tch <= 'Z') || tch == '_' || tch == '$') {
389 | continue;
390 | }
391 | break;
392 | }
393 |
394 | if (runEnd == i) {
395 | elide(i, i + 1);
396 | break;
397 | }
398 |
399 | state = requireValueState(i, state, true);
400 |
401 | boolean isNumber = ('0' <= ch && ch <= '9')
402 | || ch == '.' || ch == '+' || ch == '-';
403 | boolean isKeyword = !isNumber && isKeyword(i, runEnd);
404 |
405 | if (!(isNumber || isKeyword)) {
406 | // We're going to have to quote the output. Further expand to
407 | // include more of an unquoted token in a string.
408 | for (; runEnd < n; ++runEnd) {
409 | if (isJsonSpecialChar(runEnd)) {
410 | break;
411 | }
412 | }
413 | if (runEnd < n && jsonish.charAt(runEnd) == '"') {
414 | ++runEnd;
415 | }
416 | }
417 |
418 | if (state == State.AFTER_KEY) {
419 | // We need to quote whatever we have since it is used as a
420 | // property name in a map and only quoted strings can be used that
421 | // way in JSON.
422 | insert(i, '"');
423 | if (isNumber) {
424 | // By JS rules,
425 | // { .5e-1: "bar" }
426 | // is the same as
427 | // { "0.05": "bar" }
428 | // because a number literal is converted to its string form
429 | // before being used as a property name.
430 | canonicalizeNumber(i, runEnd);
431 | // We intentionally ignore the return value of canonicalize.
432 | // Uncanonicalizable numbers just get put straight through as
433 | // string values.
434 | insert(runEnd, '"');
435 | } else {
436 | sanitizeString(i, runEnd);
437 | }
438 | } else {
439 | if (isNumber) {
440 | // Convert hex and octal constants to decimal and ensure that
441 | // integer and fraction portions are not empty.
442 | normalizeNumber(i, runEnd);
443 | } else if (!isKeyword) {
444 | // Treat as an unquoted string literal.
445 | insert(i, '"');
446 | sanitizeString(i, runEnd);
447 | }
448 | }
449 | i = runEnd - 1;
450 | }
451 | } catch (@SuppressWarnings("unused") UnbracketedComma e) {
452 | elide(i, jsonish.length());
453 | break;
454 | }
455 | }
456 |
457 | if (state == State.START_ARRAY && bracketDepth == 0) {
458 | // No tokens. Only whitespace
459 | insert(n, "null");
460 | state = State.AFTER_ELEMENT;
461 | }
462 |
463 | if (SUPER_VERBOSE_AND_SLOW_LOGGING) {
464 | System.err.println(
465 | "state=" + state + ", sanitizedJson=" + sanitizedJson
466 | + ", cleaned=" + cleaned + ", bracketDepth=" + bracketDepth);
467 | }
468 |
469 | if ((sanitizedJson != null && sanitizedJson.length() != 0)
470 | || cleaned != 0 || bracketDepth != 0) {
471 | if (sanitizedJson == null) {
472 | sanitizedJson = new StringBuilder(n + bracketDepth);
473 | }
474 | sanitizedJson.append(jsonish, cleaned, n);
475 | cleaned = n;
476 |
477 | switch (state) {
478 | case BEFORE_ELEMENT: case BEFORE_KEY:
479 | elideTrailingComma(n);
480 | break;
481 | case AFTER_KEY:
482 | sanitizedJson.append(":null");
483 | break;
484 | case BEFORE_VALUE:
485 | sanitizedJson.append("null");
486 | break;
487 | default: break;
488 | }
489 |
490 | // Insert brackets to close unclosed content.
491 | while (bracketDepth != 0) {
492 | sanitizedJson.append(isMap[--bracketDepth] ? '}' : ']');
493 | }
494 | }
495 | }
496 |
497 | /**
498 | * Ensures that the output corresponding to {@code jsonish[start:end]} is a
499 | * valid JSON string that has the same meaning when parsed by Javascript
500 | * {@code eval}.
501 | *