Horizon
Loading...
Searching...
No Matches
lexer.hpp
1#pragma once
2
3#include <array> // array
4#include <clocale> // localeconv
5#include <cstddef> // size_t
6#include <cstdio> // snprintf
7#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
8#include <initializer_list> // initializer_list
9#include <string> // char_traits, string
10#include <utility> // move
11#include <vector> // vector
12
13#include <nlohmann/detail/input/input_adapters.hpp>
14#include <nlohmann/detail/input/position_t.hpp>
15#include <nlohmann/detail/macro_scope.hpp>
16
17namespace nlohmann
18{
19namespace detail
20{
22// lexer //
24
25template<typename BasicJsonType>
27{
28 public:
50
52 JSON_HEDLEY_RETURNS_NON_NULL
53 JSON_HEDLEY_CONST
54 static const char* token_type_name(const token_type t) noexcept
55 {
56 switch (t)
57 {
59 return "<uninitialized>";
61 return "true literal";
63 return "false literal";
65 return "null literal";
67 return "string literal";
71 return "number literal";
73 return "'['";
75 return "'{'";
77 return "']'";
79 return "'}'";
81 return "':'";
83 return "','";
85 return "<parse error>";
87 return "end of input";
89 return "'[', '{', or a literal";
90 // LCOV_EXCL_START
91 default: // catch non-enum values
92 return "unknown token";
93 // LCOV_EXCL_STOP
94 }
95 }
96};
102template<typename BasicJsonType, typename InputAdapterType>
103class lexer : public lexer_base<BasicJsonType>
104{
105 using number_integer_t = typename BasicJsonType::number_integer_t;
106 using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
107 using number_float_t = typename BasicJsonType::number_float_t;
108 using string_t = typename BasicJsonType::string_t;
109 using char_type = typename InputAdapterType::char_type;
110 using char_int_type = typename std::char_traits<char_type>::int_type;
111
112 public:
113 using token_type = typename lexer_base<BasicJsonType>::token_type;
114
115 explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
116 : ia(std::move(adapter))
117 , ignore_comments(ignore_comments_)
118 , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
119 {}
120
121 // delete because of pointer members
122 lexer(const lexer&) = delete;
123 lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
124 lexer& operator=(lexer&) = delete;
125 lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
126 ~lexer() = default;
127
128 private:
130 // locales
132
134 JSON_HEDLEY_PURE
135 static char get_decimal_point() noexcept
136 {
137 const auto* loc = localeconv();
138 JSON_ASSERT(loc != nullptr);
139 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
140 }
141
143 // scan functions
145
161 int get_codepoint()
162 {
163 // this function only makes sense after reading `\u`
164 JSON_ASSERT(current == 'u');
165 int codepoint = 0;
166
167 const auto factors = { 12u, 8u, 4u, 0u };
168 for (const auto factor : factors)
169 {
170 get();
171
172 if (current >= '0' && current <= '9')
173 {
174 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
175 }
176 else if (current >= 'A' && current <= 'F')
177 {
178 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
179 }
180 else if (current >= 'a' && current <= 'f')
181 {
182 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
183 }
184 else
185 {
186 return -1;
187 }
188 }
189
190 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
191 return codepoint;
192 }
193
209 bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
210 {
211 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
212 add(current);
213
214 for (auto range = ranges.begin(); range != ranges.end(); ++range)
215 {
216 get();
217 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
218 {
219 add(current);
220 }
221 else
222 {
223 error_message = "invalid string: ill-formed UTF-8 byte";
224 return false;
225 }
226 }
227
228 return true;
229 }
230
/*!
@brief scan a string literal

Must be called after the opening quote was read (asserted below). Reads
characters via get() until the closing quote and accumulates the decoded
content in token_buffer via add():
- simple escapes (`\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`) are
  replaced by the character they denote,
- `\uXXXX` escapes, including surrogate pairs, are re-encoded as UTF-8,
- unescaped bytes are copied after checking that multi-byte sequences fall
  into the byte ranges listed in the comments below.

@return token_type::value_string when the closing quote was reached;
        token_type::parse_error otherwise, with error_message set
*/
246 token_type scan_string()
247 {
248 // reset token_buffer (ignore opening quote)
249 reset();
250
251 // we entered the function by reading an open quote
252 JSON_ASSERT(current == '\"');
253
254 while (true)
255 {
256 // get next character
257 switch (get())
258 {
259 // end of file while parsing string
260 case std::char_traits<char_type>::eof():
261 {
262 error_message = "invalid string: missing closing quote";
263 return token_type::parse_error;
264 }
265
266 // closing quote
267 case '\"':
268 {
269 return token_type::value_string;
270 }
271
272 // escapes
273 case '\\':
274 {
275 switch (get())
276 {
277 // quotation mark
278 case '\"':
279 add('\"');
280 break;
281 // reverse solidus
282 case '\\':
283 add('\\');
284 break;
285 // solidus
286 case '/':
287 add('/');
288 break;
289 // backspace
290 case 'b':
291 add('\b');
292 break;
293 // form feed
294 case 'f':
295 add('\f');
296 break;
297 // line feed
298 case 'n':
299 add('\n');
300 break;
301 // carriage return
302 case 'r':
303 add('\r');
304 break;
305 // tab
306 case 't':
307 add('\t');
308 break;
309
310 // unicode escapes
311 case 'u':
312 {
313 const int codepoint1 = get_codepoint();
314 int codepoint = codepoint1; // start with codepoint1
315
316 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
317 {
318 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
319 return token_type::parse_error;
320 }
321
322 // check if code point is a high surrogate
323 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
324 {
325 // expect next \uxxxx entry
326 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
327 {
328 const int codepoint2 = get_codepoint();
329
330 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
331 {
332 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
333 return token_type::parse_error;
334 }
335
336 // check if codepoint2 is a low surrogate
337 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
338 {
339 // overwrite codepoint
340 codepoint = static_cast<int>(
341 // high surrogate supplies the upper 10 bits of the offset (hence the shift by 10)
342 (static_cast<unsigned int>(codepoint1) << 10u)
343 // low surrogate supplies the lower 10 bits of the offset
344 + static_cast<unsigned int>(codepoint2)
345 // there is still the 0xD800, 0xDC00 and 0x10000 noise
346 // in the result so we have to subtract with:
347 // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
348 - 0x35FDC00u);
349 }
350 else
351 {
352 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
353 return token_type::parse_error;
354 }
355 }
356 else
357 {
358 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
359 return token_type::parse_error;
360 }
361 }
362 else
363 {
// a low surrogate that was not preceded by a high surrogate is an error
364 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
365 {
366 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
367 return token_type::parse_error;
368 }
369 }
370
371 // result of the above calculation yields a proper codepoint
372 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
373
374 // translate codepoint into bytes
375 if (codepoint < 0x80)
376 {
377 // 1-byte characters: 0xxxxxxx (ASCII)
378 add(static_cast<char_int_type>(codepoint));
379 }
380 else if (codepoint <= 0x7FF)
381 {
382 // 2-byte characters: 110xxxxx 10xxxxxx
383 add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
384 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
385 }
386 else if (codepoint <= 0xFFFF)
387 {
388 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
389 add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
390 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
391 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
392 }
393 else
394 {
395 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
396 add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
397 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
398 add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
399 add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
400 }
401
402 break;
403 }
404
405 // other characters after escape
406 default:
407 error_message = "invalid string: forbidden character after backslash";
408 return token_type::parse_error;
409 }
410
411 break;
412 }
413
414 // invalid control characters (U+0000..U+001F); each gets its own message
415 case 0x00:
416 {
417 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
418 return token_type::parse_error;
419 }
420
421 case 0x01:
422 {
423 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
424 return token_type::parse_error;
425 }
426
427 case 0x02:
428 {
429 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
430 return token_type::parse_error;
431 }
432
433 case 0x03:
434 {
435 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
436 return token_type::parse_error;
437 }
438
439 case 0x04:
440 {
441 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
442 return token_type::parse_error;
443 }
444
445 case 0x05:
446 {
447 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
448 return token_type::parse_error;
449 }
450
451 case 0x06:
452 {
453 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
454 return token_type::parse_error;
455 }
456
457 case 0x07:
458 {
459 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
460 return token_type::parse_error;
461 }
462
463 case 0x08:
464 {
465 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
466 return token_type::parse_error;
467 }
468
469 case 0x09:
470 {
471 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
472 return token_type::parse_error;
473 }
474
475 case 0x0A:
476 {
477 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
478 return token_type::parse_error;
479 }
480
481 case 0x0B:
482 {
483 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
484 return token_type::parse_error;
485 }
486
487 case 0x0C:
488 {
489 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
490 return token_type::parse_error;
491 }
492
493 case 0x0D:
494 {
495 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
496 return token_type::parse_error;
497 }
498
499 case 0x0E:
500 {
501 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
502 return token_type::parse_error;
503 }
504
505 case 0x0F:
506 {
507 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
508 return token_type::parse_error;
509 }
510
511 case 0x10:
512 {
513 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
514 return token_type::parse_error;
515 }
516
517 case 0x11:
518 {
519 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
520 return token_type::parse_error;
521 }
522
523 case 0x12:
524 {
525 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
526 return token_type::parse_error;
527 }
528
529 case 0x13:
530 {
531 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
532 return token_type::parse_error;
533 }
534
535 case 0x14:
536 {
537 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
538 return token_type::parse_error;
539 }
540
541 case 0x15:
542 {
543 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
544 return token_type::parse_error;
545 }
546
547 case 0x16:
548 {
549 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
550 return token_type::parse_error;
551 }
552
553 case 0x17:
554 {
555 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
556 return token_type::parse_error;
557 }
558
559 case 0x18:
560 {
561 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
562 return token_type::parse_error;
563 }
564
565 case 0x19:
566 {
567 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
568 return token_type::parse_error;
569 }
570
571 case 0x1A:
572 {
573 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
574 return token_type::parse_error;
575 }
576
577 case 0x1B:
578 {
579 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
580 return token_type::parse_error;
581 }
582
583 case 0x1C:
584 {
585 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
586 return token_type::parse_error;
587 }
588
589 case 0x1D:
590 {
591 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
592 return token_type::parse_error;
593 }
594
595 case 0x1E:
596 {
597 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
598 return token_type::parse_error;
599 }
600
601 case 0x1F:
602 {
603 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
604 return token_type::parse_error;
605 }
606
607 // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash), handled above)
608 case 0x20:
609 case 0x21:
610 case 0x23:
611 case 0x24:
612 case 0x25:
613 case 0x26:
614 case 0x27:
615 case 0x28:
616 case 0x29:
617 case 0x2A:
618 case 0x2B:
619 case 0x2C:
620 case 0x2D:
621 case 0x2E:
622 case 0x2F:
623 case 0x30:
624 case 0x31:
625 case 0x32:
626 case 0x33:
627 case 0x34:
628 case 0x35:
629 case 0x36:
630 case 0x37:
631 case 0x38:
632 case 0x39:
633 case 0x3A:
634 case 0x3B:
635 case 0x3C:
636 case 0x3D:
637 case 0x3E:
638 case 0x3F:
639 case 0x40:
640 case 0x41:
641 case 0x42:
642 case 0x43:
643 case 0x44:
644 case 0x45:
645 case 0x46:
646 case 0x47:
647 case 0x48:
648 case 0x49:
649 case 0x4A:
650 case 0x4B:
651 case 0x4C:
652 case 0x4D:
653 case 0x4E:
654 case 0x4F:
655 case 0x50:
656 case 0x51:
657 case 0x52:
658 case 0x53:
659 case 0x54:
660 case 0x55:
661 case 0x56:
662 case 0x57:
663 case 0x58:
664 case 0x59:
665 case 0x5A:
666 case 0x5B:
667 case 0x5D:
668 case 0x5E:
669 case 0x5F:
670 case 0x60:
671 case 0x61:
672 case 0x62:
673 case 0x63:
674 case 0x64:
675 case 0x65:
676 case 0x66:
677 case 0x67:
678 case 0x68:
679 case 0x69:
680 case 0x6A:
681 case 0x6B:
682 case 0x6C:
683 case 0x6D:
684 case 0x6E:
685 case 0x6F:
686 case 0x70:
687 case 0x71:
688 case 0x72:
689 case 0x73:
690 case 0x74:
691 case 0x75:
692 case 0x76:
693 case 0x77:
694 case 0x78:
695 case 0x79:
696 case 0x7A:
697 case 0x7B:
698 case 0x7C:
699 case 0x7D:
700 case 0x7E:
701 case 0x7F:
702 {
703 add(current);
704 break;
705 }
706
707 // U+0080..U+07FF: bytes C2..DF 80..BF
708 case 0xC2:
709 case 0xC3:
710 case 0xC4:
711 case 0xC5:
712 case 0xC6:
713 case 0xC7:
714 case 0xC8:
715 case 0xC9:
716 case 0xCA:
717 case 0xCB:
718 case 0xCC:
719 case 0xCD:
720 case 0xCE:
721 case 0xCF:
722 case 0xD0:
723 case 0xD1:
724 case 0xD2:
725 case 0xD3:
726 case 0xD4:
727 case 0xD5:
728 case 0xD6:
729 case 0xD7:
730 case 0xD8:
731 case 0xD9:
732 case 0xDA:
733 case 0xDB:
734 case 0xDC:
735 case 0xDD:
736 case 0xDE:
737 case 0xDF:
738 {
// next_byte_in_range() adds the lead byte and the continuation byte itself
739 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
740 {
741 return token_type::parse_error;
742 }
743 break;
744 }
745
746 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
747 case 0xE0:
748 {
749 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
750 {
751 return token_type::parse_error;
752 }
753 break;
754 }
755
756 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
757 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
758 case 0xE1:
759 case 0xE2:
760 case 0xE3:
761 case 0xE4:
762 case 0xE5:
763 case 0xE6:
764 case 0xE7:
765 case 0xE8:
766 case 0xE9:
767 case 0xEA:
768 case 0xEB:
769 case 0xEC:
770 case 0xEE:
771 case 0xEF:
772 {
773 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
774 {
775 return token_type::parse_error;
776 }
777 break;
778 }
779
780 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
781 case 0xED:
782 {
783 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
784 {
785 return token_type::parse_error;
786 }
787 break;
788 }
789
790 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
791 case 0xF0:
792 {
793 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
794 {
795 return token_type::parse_error;
796 }
797 break;
798 }
799
800 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
801 case 0xF1:
802 case 0xF2:
803 case 0xF3:
804 {
805 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
806 {
807 return token_type::parse_error;
808 }
809 break;
810 }
811
812 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
813 case 0xF4:
814 {
815 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
816 {
817 return token_type::parse_error;
818 }
819 break;
820 }
821
822 // remaining bytes (80..C1 and F5..FF) are ill-formed
823 default:
824 {
825 error_message = "invalid string: ill-formed UTF-8 byte";
826 return token_type::parse_error;
827 }
828 }
829 }
830 }
831
836 bool scan_comment()
837 {
838 switch (get())
839 {
840 // single-line comments skip input until a newline or EOF is read
841 case '/':
842 {
843 while (true)
844 {
845 switch (get())
846 {
847 case '\n':
848 case '\r':
849 case std::char_traits<char_type>::eof():
850 case '\0':
851 return true;
852
853 default:
854 break;
855 }
856 }
857 }
858
859 // multi-line comments skip input until */ is read
860 case '*':
861 {
862 while (true)
863 {
864 switch (get())
865 {
866 case std::char_traits<char_type>::eof():
867 case '\0':
868 {
869 error_message = "invalid comment; missing closing '*/'";
870 return false;
871 }
872
873 case '*':
874 {
875 switch (get())
876 {
877 case '/':
878 return true;
879
880 default:
881 {
882 unget();
883 continue;
884 }
885 }
886 }
887
888 default:
889 continue;
890 }
891 }
892 }
893
894 // unexpected character after reading '/'
895 default:
896 {
897 error_message = "invalid comment; expecting '/' or '*' after '/'";
898 return false;
899 }
900 }
901 }
902
903 JSON_HEDLEY_NON_NULL(2)
904 static void strtof(float& f, const char* str, char** endptr) noexcept
905 {
906 f = std::strtof(str, endptr);
907 }
908
909 JSON_HEDLEY_NON_NULL(2)
910 static void strtof(double& f, const char* str, char** endptr) noexcept
911 {
912 f = std::strtod(str, endptr);
913 }
914
915 JSON_HEDLEY_NON_NULL(2)
916 static void strtof(long double& f, const char* str, char** endptr) noexcept
917 {
918 f = std::strtold(str, endptr);
919 }
920
/*!
@brief scan a number literal

Must be called with the first character of the number (`-` or a digit) in
`current`. The function is a goto-based state machine: each label below is
one state, and each state reads one more character via get(). Accepted
characters are appended to token_buffer; the first character that cannot
continue the number is pushed back with unget().

After the syntax check, the collected text is converted:
- no minus sign, decimal point, or exponent: std::strtoull into
  value_unsigned,
- minus sign but no decimal point or exponent: std::strtoll into
  value_integer,
- otherwise, or if the integer conversion reported an error or did not
  round-trip through the target type: strtof() into value_float.

@return token_type::value_unsigned, token_type::value_integer, or
        token_type::value_float on success; token_type::parse_error if the
        text is not a valid number, with error_message set
*/
961 token_type scan_number() // lgtm [cpp/use-of-goto]
962 {
963 // reset token_buffer to store the number's bytes
964 reset();
965
966 // the type of the parsed number; initially set to unsigned; will be
967 // changed if minus sign, decimal point or exponent is read
968 token_type number_type = token_type::value_unsigned;
969
970 // state (init): we just found out we need to scan a number
971 switch (current)
972 {
973 case '-':
974 {
975 add(current);
976 goto scan_number_minus;
977 }
978
979 case '0':
980 {
981 add(current);
982 goto scan_number_zero;
983 }
984
985 case '1':
986 case '2':
987 case '3':
988 case '4':
989 case '5':
990 case '6':
991 case '7':
992 case '8':
993 case '9':
994 {
995 add(current);
996 goto scan_number_any1;
997 }
998
999 // all other characters are rejected outside scan_number()
1000 default: // LCOV_EXCL_LINE
1001 JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
1002 }
1003
1004scan_number_minus:
1005 // state: we just parsed a leading minus sign
1006 number_type = token_type::value_integer;
1007 switch (get())
1008 {
1009 case '0':
1010 {
1011 add(current);
1012 goto scan_number_zero;
1013 }
1014
1015 case '1':
1016 case '2':
1017 case '3':
1018 case '4':
1019 case '5':
1020 case '6':
1021 case '7':
1022 case '8':
1023 case '9':
1024 {
1025 add(current);
1026 goto scan_number_any1;
1027 }
1028
1029 default:
1030 {
1031 error_message = "invalid number; expected digit after '-'";
1032 return token_type::parse_error;
1033 }
1034 }
1035
1036scan_number_zero:
1037 // state: we just parsed a zero (maybe with a leading minus sign);
// no further digit may follow directly, so leading zeros end the number here
1038 switch (get())
1039 {
1040 case '.':
1041 {
// store the locale's decimal point rather than '.', so that the
// conversion at the end of this function accepts the buffer
1042 add(decimal_point_char);
1043 goto scan_number_decimal1;
1044 }
1045
1046 case 'e':
1047 case 'E':
1048 {
1049 add(current);
1050 goto scan_number_exponent;
1051 }
1052
1053 default:
1054 goto scan_number_done;
1055 }
1056
1057scan_number_any1:
1058 // state: we just parsed a number 0-9 (maybe with a leading minus sign)
1059 switch (get())
1060 {
1061 case '0':
1062 case '1':
1063 case '2':
1064 case '3':
1065 case '4':
1066 case '5':
1067 case '6':
1068 case '7':
1069 case '8':
1070 case '9':
1071 {
1072 add(current);
1073 goto scan_number_any1;
1074 }
1075
1076 case '.':
1077 {
// see scan_number_zero: the locale's decimal point is stored
1078 add(decimal_point_char);
1079 goto scan_number_decimal1;
1080 }
1081
1082 case 'e':
1083 case 'E':
1084 {
1085 add(current);
1086 goto scan_number_exponent;
1087 }
1088
1089 default:
1090 goto scan_number_done;
1091 }
1092
1093scan_number_decimal1:
1094 // state: we just parsed a decimal point; at least one digit must follow
1095 number_type = token_type::value_float;
1096 switch (get())
1097 {
1098 case '0':
1099 case '1':
1100 case '2':
1101 case '3':
1102 case '4':
1103 case '5':
1104 case '6':
1105 case '7':
1106 case '8':
1107 case '9':
1108 {
1109 add(current);
1110 goto scan_number_decimal2;
1111 }
1112
1113 default:
1114 {
1115 error_message = "invalid number; expected digit after '.'";
1116 return token_type::parse_error;
1117 }
1118 }
1119
1120scan_number_decimal2:
1121 // we just parsed at least one number after a decimal point
1122 switch (get())
1123 {
1124 case '0':
1125 case '1':
1126 case '2':
1127 case '3':
1128 case '4':
1129 case '5':
1130 case '6':
1131 case '7':
1132 case '8':
1133 case '9':
1134 {
1135 add(current);
1136 goto scan_number_decimal2;
1137 }
1138
1139 case 'e':
1140 case 'E':
1141 {
1142 add(current);
1143 goto scan_number_exponent;
1144 }
1145
1146 default:
1147 goto scan_number_done;
1148 }
1149
1150scan_number_exponent:
1151 // we just parsed an exponent
1152 number_type = token_type::value_float;
1153 switch (get())
1154 {
1155 case '+':
1156 case '-':
1157 {
1158 add(current);
1159 goto scan_number_sign;
1160 }
1161
1162 case '0':
1163 case '1':
1164 case '2':
1165 case '3':
1166 case '4':
1167 case '5':
1168 case '6':
1169 case '7':
1170 case '8':
1171 case '9':
1172 {
1173 add(current);
1174 goto scan_number_any2;
1175 }
1176
1177 default:
1178 {
1179 error_message =
1180 "invalid number; expected '+', '-', or digit after exponent";
1181 return token_type::parse_error;
1182 }
1183 }
1184
1185scan_number_sign:
1186 // we just parsed an exponent sign
1187 switch (get())
1188 {
1189 case '0':
1190 case '1':
1191 case '2':
1192 case '3':
1193 case '4':
1194 case '5':
1195 case '6':
1196 case '7':
1197 case '8':
1198 case '9':
1199 {
1200 add(current);
1201 goto scan_number_any2;
1202 }
1203
1204 default:
1205 {
1206 error_message = "invalid number; expected digit after exponent sign";
1207 return token_type::parse_error;
1208 }
1209 }
1210
1211scan_number_any2:
1212 // we just parsed a number after the exponent or exponent sign
1213 switch (get())
1214 {
1215 case '0':
1216 case '1':
1217 case '2':
1218 case '3':
1219 case '4':
1220 case '5':
1221 case '6':
1222 case '7':
1223 case '8':
1224 case '9':
1225 {
1226 add(current);
1227 goto scan_number_any2;
1228 }
1229
1230 default:
1231 goto scan_number_done;
1232 }
1233
1234scan_number_done:
1235 // unget the character after the number (we only read it to know that
1236 // we are done scanning a number)
1237 unget();
1238
1239 char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
// clear errno so that a non-zero value after the conversion can be
// attributed to the conversion itself
// NOTE(review): errno is declared in <cerrno>, which is not among the
// includes visible at the top of this file - presumably provided transitively
1240 errno = 0;
1241
1242 // try to parse integers first and fall back to floats
1243 if (number_type == token_type::value_unsigned)
1244 {
1245 const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
1246
1247 // we checked the number format before
1248 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1249
1250 if (errno == 0)
1251 {
// accept only if the value survives the conversion to number_unsigned_t
1252 value_unsigned = static_cast<number_unsigned_t>(x);
1253 if (value_unsigned == x)
1254 {
1255 return token_type::value_unsigned;
1256 }
1257 }
1258 }
1259 else if (number_type == token_type::value_integer)
1260 {
1261 const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
1262
1263 // we checked the number format before
1264 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1265
1266 if (errno == 0)
1267 {
// accept only if the value survives the conversion to number_integer_t
1268 value_integer = static_cast<number_integer_t>(x);
1269 if (value_integer == x)
1270 {
1271 return token_type::value_integer;
1272 }
1273 }
1274 }
1275
1276 // this code is reached if we parse a floating-point number or if an
1277 // integer conversion above failed
1278 strtof(value_float, token_buffer.data(), &endptr);
1279
1280 // we checked the number format before
1281 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1282
1283 return token_type::value_float;
1284 }
1285
1291 JSON_HEDLEY_NON_NULL(2)
1292 token_type scan_literal(const char_type* literal_text, const std::size_t length,
1293 token_type return_type)
1294 {
1295 JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
1296 for (std::size_t i = 1; i < length; ++i)
1297 {
1298 if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1299 {
1300 error_message = "invalid literal";
1301 return token_type::parse_error;
1302 }
1303 }
1304 return return_type;
1305 }
1306
1308 // input management
1310
1312 void reset() noexcept
1313 {
1314 token_buffer.clear();
1315 token_string.clear();
1316 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1317 }
1318
1319 /*
1320 @brief get next character from the input
1321
1322 This function provides the interface to the used input adapter. It does
1323 not throw in case the input reached EOF, but returns a
1324 `std::char_traits<char>::eof()` in that case. Stores the scanned characters
1325 for use in error messages.
1326
1327 @return character read from the input
1328 */
1329 char_int_type get()
1330 {
1331 ++position.chars_read_total;
1332 ++position.chars_read_current_line;
1333
1334 if (next_unget)
1335 {
1336 // just reset the next_unget variable and work with current
1337 next_unget = false;
1338 }
1339 else
1340 {
1341 current = ia.get_character();
1342 }
1343
1344 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1345 {
1346 token_string.push_back(std::char_traits<char_type>::to_char_type(current));
1347 }
1348
1349 if (current == '\n')
1350 {
1351 ++position.lines_read;
1352 position.chars_read_current_line = 0;
1353 }
1354
1355 return current;
1356 }
1357
1366 void unget()
1367 {
1368 next_unget = true;
1369
1370 --position.chars_read_total;
1371
1372 // in case we "unget" a newline, we have to also decrement the lines_read
1373 if (position.chars_read_current_line == 0)
1374 {
1375 if (position.lines_read > 0)
1376 {
1377 --position.lines_read;
1378 }
1379 }
1380 else
1381 {
1382 --position.chars_read_current_line;
1383 }
1384
1385 if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
1386 {
1387 JSON_ASSERT(!token_string.empty());
1388 token_string.pop_back();
1389 }
1390 }
1391
1393 void add(char_int_type c)
1394 {
1395 token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1396 }
1397
1398 public:
1400 // value getters
1402
/// @brief return the signed integer value stored by the last scan_number()
///        call that yielded token_type::value_integer
1404 constexpr number_integer_t get_number_integer() const noexcept
1405 {
1406 return value_integer;
1407 }
1408
/// @brief return the unsigned integer value stored by the last scan_number()
///        call that yielded token_type::value_unsigned
1410 constexpr number_unsigned_t get_number_unsigned() const noexcept
1411 {
1412 return value_unsigned;
1413 }
1414
/// @brief return the floating-point value stored by the last scan_number()
///        call that yielded token_type::value_float
1416 constexpr number_float_t get_number_float() const noexcept
1417 {
1418 return value_float;
1419 }
1420
/// @brief return the token buffer holding the current string value
/// @return mutable reference to token_buffer; the buffer is cleared by the
///         next reset(), i.e. when the next string or number is scanned
// NOTE(review): the non-const reference presumably lets the caller move the
// string out of the lexer - confirm against the callers
1422 string_t& get_string()
1423 {
1424 return token_buffer;
1425 }
1426
1428 // diagnostics
1430
/// @brief return a copy of the current position counters (total characters
///        read, characters read in the current line, lines read)
1432 constexpr position_t get_position() const noexcept
1433 {
1434 return position;
1435 }
1436
1440 std::string get_token_string() const
1441 {
1442 // escape control characters
1443 std::string result;
1444 for (const auto c : token_string)
1445 {
1446 if (static_cast<unsigned char>(c) <= '\x1F')
1447 {
1448 // escape control characters
1449 std::array<char, 9> cs{{}};
1450 (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1451 result += cs.data();
1452 }
1453 else
1454 {
1455 // add character as is
1456 result.push_back(static_cast<std::string::value_type>(c));
1457 }
1458 }
1459
1460 return result;
1461 }
1462
/// @brief return the message describing the most recent scanning error
/// @return pointer to a string literal; the empty string if no error
///         message has been set yet
1464 JSON_HEDLEY_RETURNS_NON_NULL
1465 constexpr const char* get_error_message() const noexcept
1466 {
1467 return error_message;
1468 }
1469
1471 // actual scanner
1473
1479 {
1480 if (get() == 0xEF)
1481 {
1482 // check if we completely parse the BOM
1483 return get() == 0xBB && get() == 0xBF;
1484 }
1485
1486 // the first character is not the beginning of the BOM; unget it to
1487 // process is later
1488 unget();
1489 return true;
1490 }
1491
1492 void skip_whitespace()
1493 {
1494 do
1495 {
1496 get();
1497 }
1498 while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1499 }
1500
1501 token_type scan()
1502 {
1503 // initially, skip the BOM
1504 if (position.chars_read_total == 0 && !skip_bom())
1505 {
1506 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1507 return token_type::parse_error;
1508 }
1509
1510 // read next character and ignore whitespace
1511 skip_whitespace();
1512
1513 // ignore comments
1514 while (ignore_comments && current == '/')
1515 {
1516 if (!scan_comment())
1517 {
1518 return token_type::parse_error;
1519 }
1520
1521 // skip following whitespace
1522 skip_whitespace();
1523 }
1524
1525 switch (current)
1526 {
1527 // structural characters
1528 case '[':
1529 return token_type::begin_array;
1530 case ']':
1531 return token_type::end_array;
1532 case '{':
1533 return token_type::begin_object;
1534 case '}':
1535 return token_type::end_object;
1536 case ':':
1537 return token_type::name_separator;
1538 case ',':
1539 return token_type::value_separator;
1540
1541 // literals
1542 case 't':
1543 {
1544 std::array<char_type, 4> true_literal = {{char_type('t'), char_type('r'), char_type('u'), char_type('e')}};
1545 return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
1546 }
1547 case 'f':
1548 {
1549 std::array<char_type, 5> false_literal = {{char_type('f'), char_type('a'), char_type('l'), char_type('s'), char_type('e')}};
1550 return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
1551 }
1552 case 'n':
1553 {
1554 std::array<char_type, 4> null_literal = {{char_type('n'), char_type('u'), char_type('l'), char_type('l')}};
1555 return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
1556 }
1557
1558 // string
1559 case '\"':
1560 return scan_string();
1561
1562 // number
1563 case '-':
1564 case '0':
1565 case '1':
1566 case '2':
1567 case '3':
1568 case '4':
1569 case '5':
1570 case '6':
1571 case '7':
1572 case '8':
1573 case '9':
1574 return scan_number();
1575
1576 // end of input (the null byte is needed when parsing from
1577 // string literals)
1578 case '\0':
1579 case std::char_traits<char_type>::eof():
1580 return token_type::end_of_input;
1581
1582 // error
1583 default:
1584 error_message = "invalid literal";
1585 return token_type::parse_error;
1586 }
1587 }
1588
1589 private:
// input adapter the characters are read from (see get())
1591 InputAdapterType ia;
1592
// whether scan() skips comments
1594 const bool ignore_comments = false;
1595
// the character most recently read by get(); EOF until the first read
1597 char_int_type current = std::char_traits<char_type>::eof();
1598
// set by unget(); makes the next get() replay `current` instead of reading
1600 bool next_unget = false;
1601
// counters for characters and lines read so far
1603 position_t position {};
1604
// raw characters of the current token, kept for error messages
// (see get_token_string())
1606 std::vector<char_type> token_string {};
1607
// decoded content of the current string, or the text of the current number
1609 string_t token_buffer {};
1610
// message for the most recent error; empty string if none was set
1612 const char* error_message = "";
1613
1614 // number values (written by scan_number())
1615 number_integer_t value_integer = 0;
1616 number_unsigned_t value_unsigned = 0;
1617 number_float_t value_float = 0;
1618
// decimal point character of the C locale, captured in the constructor;
// scan_number() stores it in token_buffer in place of '.'
1620 const char_int_type decimal_point_char = '.';
1621};
1622} // namespace detail
1623} // namespace nlohmann
Definition lexer.hpp:27
JSON_HEDLEY_RETURNS_NON_NULL static JSON_HEDLEY_CONST const char * token_type_name(const token_type t) noexcept
return name of values of type token_type (only used for errors)
Definition lexer.hpp:54
token_type
token types for the parser
Definition lexer.hpp:31
@ value_float
a floating-point number – use get_number_float() for actual value
@ begin_array
the character for array begin [
@ value_string
a string – use get_string() for actual value
@ end_array
the character for array end ]
@ uninitialized
indicating the scanner is uninitialized
@ parse_error
indicating a parse error
@ value_integer
a signed integer – use get_number_integer() for actual value
@ value_separator
the value separator ,
@ end_object
the character for object end }
@ begin_object
the character for object begin {
@ value_unsigned
an unsigned integer – use get_number_unsigned() for actual value
@ end_of_input
indicating the end of the input buffer
@ literal_or_value
a literal or the begin of a value (only for diagnostics)
lexical analysis
Definition lexer.hpp:104
string_t & get_string()
return current string value (implicitly resets the token; useful only once)
Definition lexer.hpp:1422
bool skip_bom()
skip the UTF-8 byte order mark
Definition lexer.hpp:1478
constexpr position_t get_position() const noexcept
return position of last read token
Definition lexer.hpp:1432
constexpr number_integer_t get_number_integer() const noexcept
return integer value
Definition lexer.hpp:1404
constexpr number_unsigned_t get_number_unsigned() const noexcept
return unsigned integer value
Definition lexer.hpp:1410
constexpr number_float_t get_number_float() const noexcept
return floating-point value
Definition lexer.hpp:1416
std::string get_token_string() const
return the last read token (for errors only).
Definition lexer.hpp:1440
JSON_HEDLEY_RETURNS_NON_NULL constexpr const char * get_error_message() const noexcept
return syntax error message
Definition lexer.hpp:1465
namespace for Niels Lohmann
Definition adl_serializer.hpp:12
struct to capture the start position of the current token
Definition position_t.hpp:11
std::size_t lines_read
the number of lines read
Definition position_t.hpp:17
std::size_t chars_read_current_line
the number of characters read in the current line
Definition position_t.hpp:15
std::size_t chars_read_total
the total number of characters read
Definition position_t.hpp:13