JSON.cpp revision 360784
1//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===---------------------------------------------------------------------===//
8
9#include "llvm/Support/JSON.h"
10#include "llvm/Support/ConvertUTF.h"
11#include "llvm/Support/Format.h"
12#include <cctype>
13
14namespace llvm {
15namespace json {
16
17Value &Object::operator[](const ObjectKey &K) {
18  return try_emplace(K, nullptr).first->getSecond();
19}
20Value &Object::operator[](ObjectKey &&K) {
21  return try_emplace(std::move(K), nullptr).first->getSecond();
22}
23Value *Object::get(StringRef K) {
24  auto I = find(K);
25  if (I == end())
26    return nullptr;
27  return &I->second;
28}
29const Value *Object::get(StringRef K) const {
30  auto I = find(K);
31  if (I == end())
32    return nullptr;
33  return &I->second;
34}
35llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
36  if (auto *V = get(K))
37    return V->getAsNull();
38  return llvm::None;
39}
40llvm::Optional<bool> Object::getBoolean(StringRef K) const {
41  if (auto *V = get(K))
42    return V->getAsBoolean();
43  return llvm::None;
44}
45llvm::Optional<double> Object::getNumber(StringRef K) const {
46  if (auto *V = get(K))
47    return V->getAsNumber();
48  return llvm::None;
49}
50llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
51  if (auto *V = get(K))
52    return V->getAsInteger();
53  return llvm::None;
54}
55llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
56  if (auto *V = get(K))
57    return V->getAsString();
58  return llvm::None;
59}
60const json::Object *Object::getObject(StringRef K) const {
61  if (auto *V = get(K))
62    return V->getAsObject();
63  return nullptr;
64}
65json::Object *Object::getObject(StringRef K) {
66  if (auto *V = get(K))
67    return V->getAsObject();
68  return nullptr;
69}
70const json::Array *Object::getArray(StringRef K) const {
71  if (auto *V = get(K))
72    return V->getAsArray();
73  return nullptr;
74}
75json::Array *Object::getArray(StringRef K) {
76  if (auto *V = get(K))
77    return V->getAsArray();
78  return nullptr;
79}
80bool operator==(const Object &LHS, const Object &RHS) {
81  if (LHS.size() != RHS.size())
82    return false;
83  for (const auto &L : LHS) {
84    auto R = RHS.find(L.first);
85    if (R == RHS.end() || L.second != R->second)
86      return false;
87  }
88  return true;
89}
90
91Array::Array(std::initializer_list<Value> Elements) {
92  V.reserve(Elements.size());
93  for (const Value &V : Elements) {
94    emplace_back(nullptr);
95    back().moveFrom(std::move(V));
96  }
97}
98
99Value::Value(std::initializer_list<Value> Elements)
100    : Value(json::Array(Elements)) {}
101
102void Value::copyFrom(const Value &M) {
103  Type = M.Type;
104  switch (Type) {
105  case T_Null:
106  case T_Boolean:
107  case T_Double:
108  case T_Integer:
109    memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
110    break;
111  case T_StringRef:
112    create<StringRef>(M.as<StringRef>());
113    break;
114  case T_String:
115    create<std::string>(M.as<std::string>());
116    break;
117  case T_Object:
118    create<json::Object>(M.as<json::Object>());
119    break;
120  case T_Array:
121    create<json::Array>(M.as<json::Array>());
122    break;
123  }
124}
125
126void Value::moveFrom(const Value &&M) {
127  Type = M.Type;
128  switch (Type) {
129  case T_Null:
130  case T_Boolean:
131  case T_Double:
132  case T_Integer:
133    memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
134    break;
135  case T_StringRef:
136    create<StringRef>(M.as<StringRef>());
137    break;
138  case T_String:
139    create<std::string>(std::move(M.as<std::string>()));
140    M.Type = T_Null;
141    break;
142  case T_Object:
143    create<json::Object>(std::move(M.as<json::Object>()));
144    M.Type = T_Null;
145    break;
146  case T_Array:
147    create<json::Array>(std::move(M.as<json::Array>()));
148    M.Type = T_Null;
149    break;
150  }
151}
152
153void Value::destroy() {
154  switch (Type) {
155  case T_Null:
156  case T_Boolean:
157  case T_Double:
158  case T_Integer:
159    break;
160  case T_StringRef:
161    as<StringRef>().~StringRef();
162    break;
163  case T_String:
164    as<std::string>().~basic_string();
165    break;
166  case T_Object:
167    as<json::Object>().~Object();
168    break;
169  case T_Array:
170    as<json::Array>().~Array();
171    break;
172  }
173}
174
175bool operator==(const Value &L, const Value &R) {
176  if (L.kind() != R.kind())
177    return false;
178  switch (L.kind()) {
179  case Value::Null:
180    return *L.getAsNull() == *R.getAsNull();
181  case Value::Boolean:
182    return *L.getAsBoolean() == *R.getAsBoolean();
183  case Value::Number:
184    // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
185    // The same integer must convert to the same double, per the standard.
186    // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
187    // So we avoid floating point promotion for exact comparisons.
188    if (L.Type == Value::T_Integer || R.Type == Value::T_Integer)
189      return L.getAsInteger() == R.getAsInteger();
190    return *L.getAsNumber() == *R.getAsNumber();
191  case Value::String:
192    return *L.getAsString() == *R.getAsString();
193  case Value::Array:
194    return *L.getAsArray() == *R.getAsArray();
195  case Value::Object:
196    return *L.getAsObject() == *R.getAsObject();
197  }
198  llvm_unreachable("Unknown value kind");
199}
200
201namespace {
202// Simple recursive-descent JSON parser.
203class Parser {
204public:
205  Parser(StringRef JSON)
206      : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
207
208  bool checkUTF8() {
209    size_t ErrOffset;
210    if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
211      return true;
212    P = Start + ErrOffset; // For line/column calculation.
213    return parseError("Invalid UTF-8 sequence");
214  }
215
216  bool parseValue(Value &Out);
217
218  bool assertEnd() {
219    eatWhitespace();
220    if (P == End)
221      return true;
222    return parseError("Text after end of document");
223  }
224
225  Error takeError() {
226    assert(Err);
227    return std::move(*Err);
228  }
229
230private:
231  void eatWhitespace() {
232    while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
233      ++P;
234  }
235
236  // On invalid syntax, parseX() functions return false and set Err.
237  bool parseNumber(char First, Value &Out);
238  bool parseString(std::string &Out);
239  bool parseUnicode(std::string &Out);
240  bool parseError(const char *Msg); // always returns false
241
242  char next() { return P == End ? 0 : *P++; }
243  char peek() { return P == End ? 0 : *P; }
244  static bool isNumber(char C) {
245    return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
246           C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
247           C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
248  }
249
250  Optional<Error> Err;
251  const char *Start, *P, *End;
252};
253
254bool Parser::parseValue(Value &Out) {
255  eatWhitespace();
256  if (P == End)
257    return parseError("Unexpected EOF");
258  switch (char C = next()) {
259  // Bare null/true/false are easy - first char identifies them.
260  case 'n':
261    Out = nullptr;
262    return (next() == 'u' && next() == 'l' && next() == 'l') ||
263           parseError("Invalid JSON value (null?)");
264  case 't':
265    Out = true;
266    return (next() == 'r' && next() == 'u' && next() == 'e') ||
267           parseError("Invalid JSON value (true?)");
268  case 'f':
269    Out = false;
270    return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
271           parseError("Invalid JSON value (false?)");
272  case '"': {
273    std::string S;
274    if (parseString(S)) {
275      Out = std::move(S);
276      return true;
277    }
278    return false;
279  }
280  case '[': {
281    Out = Array{};
282    Array &A = *Out.getAsArray();
283    eatWhitespace();
284    if (peek() == ']') {
285      ++P;
286      return true;
287    }
288    for (;;) {
289      A.emplace_back(nullptr);
290      if (!parseValue(A.back()))
291        return false;
292      eatWhitespace();
293      switch (next()) {
294      case ',':
295        eatWhitespace();
296        continue;
297      case ']':
298        return true;
299      default:
300        return parseError("Expected , or ] after array element");
301      }
302    }
303  }
304  case '{': {
305    Out = Object{};
306    Object &O = *Out.getAsObject();
307    eatWhitespace();
308    if (peek() == '}') {
309      ++P;
310      return true;
311    }
312    for (;;) {
313      if (next() != '"')
314        return parseError("Expected object key");
315      std::string K;
316      if (!parseString(K))
317        return false;
318      eatWhitespace();
319      if (next() != ':')
320        return parseError("Expected : after object key");
321      eatWhitespace();
322      if (!parseValue(O[std::move(K)]))
323        return false;
324      eatWhitespace();
325      switch (next()) {
326      case ',':
327        eatWhitespace();
328        continue;
329      case '}':
330        return true;
331      default:
332        return parseError("Expected , or } after object property");
333      }
334    }
335  }
336  default:
337    if (isNumber(C))
338      return parseNumber(C, Out);
339    return parseError("Invalid JSON value");
340  }
341}
342
343bool Parser::parseNumber(char First, Value &Out) {
344  // Read the number into a string. (Must be null-terminated for strto*).
345  SmallString<24> S;
346  S.push_back(First);
347  while (isNumber(peek()))
348    S.push_back(next());
349  char *End;
350  // Try first to parse as integer, and if so preserve full 64 bits.
351  // strtoll returns long long >= 64 bits, so check it's in range too.
352  auto I = std::strtoll(S.c_str(), &End, 10);
353  if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
354      I <= std::numeric_limits<int64_t>::max()) {
355    Out = int64_t(I);
356    return true;
357  }
358  // If it's not an integer
359  Out = std::strtod(S.c_str(), &End);
360  return End == S.end() || parseError("Invalid JSON value (number?)");
361}
362
363bool Parser::parseString(std::string &Out) {
364  // leading quote was already consumed.
365  for (char C = next(); C != '"'; C = next()) {
366    if (LLVM_UNLIKELY(P == End))
367      return parseError("Unterminated string");
368    if (LLVM_UNLIKELY((C & 0x1f) == C))
369      return parseError("Control character in string");
370    if (LLVM_LIKELY(C != '\\')) {
371      Out.push_back(C);
372      continue;
373    }
374    // Handle escape sequence.
375    switch (C = next()) {
376    case '"':
377    case '\\':
378    case '/':
379      Out.push_back(C);
380      break;
381    case 'b':
382      Out.push_back('\b');
383      break;
384    case 'f':
385      Out.push_back('\f');
386      break;
387    case 'n':
388      Out.push_back('\n');
389      break;
390    case 'r':
391      Out.push_back('\r');
392      break;
393    case 't':
394      Out.push_back('\t');
395      break;
396    case 'u':
397      if (!parseUnicode(Out))
398        return false;
399      break;
400    default:
401      return parseError("Invalid escape sequence");
402    }
403  }
404  return true;
405}
406
407static void encodeUtf8(uint32_t Rune, std::string &Out) {
408  if (Rune < 0x80) {
409    Out.push_back(Rune & 0x7F);
410  } else if (Rune < 0x800) {
411    uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
412    uint8_t SecondByte = 0x80 | (Rune & 0x3F);
413    Out.push_back(FirstByte);
414    Out.push_back(SecondByte);
415  } else if (Rune < 0x10000) {
416    uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
417    uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
418    uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
419    Out.push_back(FirstByte);
420    Out.push_back(SecondByte);
421    Out.push_back(ThirdByte);
422  } else if (Rune < 0x110000) {
423    uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
424    uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
425    uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
426    uint8_t FourthByte = 0x80 | (Rune & 0x3F);
427    Out.push_back(FirstByte);
428    Out.push_back(SecondByte);
429    Out.push_back(ThirdByte);
430    Out.push_back(FourthByte);
431  } else {
432    llvm_unreachable("Invalid codepoint");
433  }
434}
435
436// Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
437// May parse several sequential escapes to ensure proper surrogate handling.
438// We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
439// These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
440bool Parser::parseUnicode(std::string &Out) {
441  // Invalid UTF is not a JSON error (RFC 8529��8.2). It gets replaced by U+FFFD.
442  auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
443  // Decodes 4 hex digits from the stream into Out, returns false on error.
444  auto Parse4Hex = [this](uint16_t &Out) -> bool {
445    Out = 0;
446    char Bytes[] = {next(), next(), next(), next()};
447    for (unsigned char C : Bytes) {
448      if (!std::isxdigit(C))
449        return parseError("Invalid \\u escape sequence");
450      Out <<= 4;
451      Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
452    }
453    return true;
454  };
455  uint16_t First; // UTF-16 code unit from the first \u escape.
456  if (!Parse4Hex(First))
457    return false;
458
459  // We loop to allow proper surrogate-pair error handling.
460  while (true) {
461    // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
462    if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
463      encodeUtf8(First, Out);
464      return true;
465    }
466
467    // Case 2: it's an (unpaired) trailing surrogate.
468    if (LLVM_UNLIKELY(First >= 0xDC00)) {
469      Invalid();
470      return true;
471    }
472
473    // Case 3: it's a leading surrogate. We expect a trailing one next.
474    // Case 3a: there's no trailing \u escape. Don't advance in the stream.
475    if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
476      Invalid(); // Leading surrogate was unpaired.
477      return true;
478    }
479    P += 2;
480    uint16_t Second;
481    if (!Parse4Hex(Second))
482      return false;
483    // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
484    if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
485      Invalid();      // Leading surrogate was unpaired.
486      First = Second; // Second escape still needs to be processed.
487      continue;
488    }
489    // Case 3c: a valid surrogate pair encoding an astral codepoint.
490    encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
491    return true;
492  }
493}
494
495bool Parser::parseError(const char *Msg) {
496  int Line = 1;
497  const char *StartOfLine = Start;
498  for (const char *X = Start; X < P; ++X) {
499    if (*X == 0x0A) {
500      ++Line;
501      StartOfLine = X + 1;
502    }
503  }
504  Err.emplace(
505      std::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
506  return false;
507}
508} // namespace
509
510Expected<Value> parse(StringRef JSON) {
511  Parser P(JSON);
512  Value E = nullptr;
513  if (P.checkUTF8())
514    if (P.parseValue(E))
515      if (P.assertEnd())
516        return std::move(E);
517  return P.takeError();
518}
519char ParseError::ID = 0;
520
521static std::vector<const Object::value_type *> sortedElements(const Object &O) {
522  std::vector<const Object::value_type *> Elements;
523  for (const auto &E : O)
524    Elements.push_back(&E);
525  llvm::sort(Elements,
526             [](const Object::value_type *L, const Object::value_type *R) {
527               return L->first < R->first;
528             });
529  return Elements;
530}
531
532bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
533  // Fast-path for ASCII, which is valid UTF-8.
534  if (LLVM_LIKELY(isASCII(S)))
535    return true;
536
537  const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
538  if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
539    return true;
540
541  if (ErrOffset)
542    *ErrOffset = Rest - Data;
543  return false;
544}
545
546std::string fixUTF8(llvm::StringRef S) {
547  // This isn't particularly efficient, but is only for error-recovery.
548  std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
549  const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
550  UTF32 *Out32 = Codepoints.data();
551  ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
552                     lenientConversion);
553  Codepoints.resize(Out32 - Codepoints.data());
554  std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
555  const UTF32 *In32 = Codepoints.data();
556  UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
557  ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
558                     strictConversion);
559  Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
560  return Res;
561}
562
563static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
564  OS << '\"';
565  for (unsigned char C : S) {
566    if (C == 0x22 || C == 0x5C)
567      OS << '\\';
568    if (C >= 0x20) {
569      OS << C;
570      continue;
571    }
572    OS << '\\';
573    switch (C) {
574    // A few characters are common enough to make short escapes worthwhile.
575    case '\t':
576      OS << 't';
577      break;
578    case '\n':
579      OS << 'n';
580      break;
581    case '\r':
582      OS << 'r';
583      break;
584    default:
585      OS << 'u';
586      llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
587      break;
588    }
589  }
590  OS << '\"';
591}
592
593void llvm::json::OStream::value(const Value &V) {
594  switch (V.kind()) {
595  case Value::Null:
596    valueBegin();
597    OS << "null";
598    return;
599  case Value::Boolean:
600    valueBegin();
601    OS << (*V.getAsBoolean() ? "true" : "false");
602    return;
603  case Value::Number:
604    valueBegin();
605    if (V.Type == Value::T_Integer)
606      OS << *V.getAsInteger();
607    else
608      OS << format("%.*g", std::numeric_limits<double>::max_digits10,
609                   *V.getAsNumber());
610    return;
611  case Value::String:
612    valueBegin();
613    quote(OS, *V.getAsString());
614    return;
615  case Value::Array:
616    return array([&] {
617      for (const Value &E : *V.getAsArray())
618        value(E);
619    });
620  case Value::Object:
621    return object([&] {
622      for (const Object::value_type *E : sortedElements(*V.getAsObject()))
623        attribute(E->first, E->second);
624    });
625  }
626}
627
628void llvm::json::OStream::valueBegin() {
629  assert(Stack.back().Ctx != Object && "Only attributes allowed here");
630  if (Stack.back().HasValue) {
631    assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
632    OS << ',';
633  }
634  if (Stack.back().Ctx == Array)
635    newline();
636  Stack.back().HasValue = true;
637}
638
639void llvm::json::OStream::newline() {
640  if (IndentSize) {
641    OS.write('\n');
642    OS.indent(Indent);
643  }
644}
645
646void llvm::json::OStream::arrayBegin() {
647  valueBegin();
648  Stack.emplace_back();
649  Stack.back().Ctx = Array;
650  Indent += IndentSize;
651  OS << '[';
652}
653
654void llvm::json::OStream::arrayEnd() {
655  assert(Stack.back().Ctx == Array);
656  Indent -= IndentSize;
657  if (Stack.back().HasValue)
658    newline();
659  OS << ']';
660  Stack.pop_back();
661  assert(!Stack.empty());
662}
663
664void llvm::json::OStream::objectBegin() {
665  valueBegin();
666  Stack.emplace_back();
667  Stack.back().Ctx = Object;
668  Indent += IndentSize;
669  OS << '{';
670}
671
672void llvm::json::OStream::objectEnd() {
673  assert(Stack.back().Ctx == Object);
674  Indent -= IndentSize;
675  if (Stack.back().HasValue)
676    newline();
677  OS << '}';
678  Stack.pop_back();
679  assert(!Stack.empty());
680}
681
682void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
683  assert(Stack.back().Ctx == Object);
684  if (Stack.back().HasValue)
685    OS << ',';
686  newline();
687  Stack.back().HasValue = true;
688  Stack.emplace_back();
689  Stack.back().Ctx = Singleton;
690  if (LLVM_LIKELY(isUTF8(Key))) {
691    quote(OS, Key);
692  } else {
693    assert(false && "Invalid UTF-8 in attribute key");
694    quote(OS, fixUTF8(Key));
695  }
696  OS.write(':');
697  if (IndentSize)
698    OS.write(' ');
699}
700
701void llvm::json::OStream::attributeEnd() {
702  assert(Stack.back().Ctx == Singleton);
703  assert(Stack.back().HasValue && "Attribute must have a value");
704  Stack.pop_back();
705  assert(Stack.back().Ctx == Object);
706}
707
708} // namespace json
709} // namespace llvm
710
711void llvm::format_provider<llvm::json::Value>::format(
712    const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
713  unsigned IndentAmount = 0;
714  if (!Options.empty() && Options.getAsInteger(/*Radix=*/10, IndentAmount))
715    llvm_unreachable("json::Value format options should be an integer");
716  json::OStream(OS, IndentAmount).value(E);
717}
718
719