Skip to content

Commit

Permalink
reverse constants on big endian
Browse files Browse the repository at this point in the history
  • Loading branch information
grisumbras committed Jun 12, 2022
1 parent 43a87fa commit 4cc1484
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 35 deletions.
91 changes: 71 additions & 20 deletions include/boost/json/detail/utf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,22 @@
BOOST_JSON_NS_BEGIN
namespace detail {

// Helpers for spelling hexadecimal constants so that their value matches the
// host byte order. Arguments are hex digit groups written in the order they
// appear in the little-endian literal (most significant group first); on big
// endian hosts the groups are pasted in reverse.
//
// BOOST_JSON_MK_NUM(kind, length)
//     16-bit classification value; `kind` is a single hex digit and
//     `length` is two hex digits (e.g. (1, 02) is 0x102 on little endian
//     and 0x0201 on big endian).
// BOOST_JSON_MK_NUM2 / MK_NUM3 / MK_NUM4
//     masks for a 32-bit word that had 2, 3 or 4 bytes copied into it with
//     memcpy; every argument is a two-digit byte.
// BOOST_JSON_UTF8_KIND(b) / BOOST_JSON_UTF8_LENGTH(b)
//     extract the two halves of a value built with BOOST_JSON_MK_NUM.
#ifdef BOOST_JSON_BIG_ENDIAN
// `kind` is a single digit, so pad it to a full byte; without the padding
// (1, 02) would paste to 0x021 instead of 0x0201.
# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b2 ## 0 ## b1
# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b2 ## b1 ## 0000
# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b3 ## b2 ## b1 ## 00
# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b4 ## b3 ## b2 ## b1
// parenthesized so the macros compose safely with ==, switch(), etc.
# define BOOST_JSON_UTF8_KIND(b) ((b) & 0xFF)
# define BOOST_JSON_UTF8_LENGTH(b) ((b) >> 8)
#else
# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b1 ## b2
# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b1 ## b2
# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b1 ## b2 ## b3
# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b1 ## b2 ## b3 ## b4
// parenthesized so the macros compose safely with ==, switch(), etc.
# define BOOST_JSON_UTF8_KIND(b) ((b) >> 8)
# define BOOST_JSON_UTF8_LENGTH(b) ((b) & 0xFF)
#endif

template<int N>
std::uint32_t
load_little_endian(void const* p)
Expand All @@ -38,6 +54,7 @@ inline
uint16_t
classify_utf8(char c)
{
// for little endian
// 0x000 = invalid
// 0x102 = 2 bytes, second byte [80, BF]
// 0x203 = 3 bytes, second byte [A0, BF]
Expand All @@ -46,6 +63,7 @@ classify_utf8(char c)
// 0x504 = 4 bytes, second byte [90, BF]
// 0x604 = 4 bytes, second byte [80, BF]
// 0x704 = 4 bytes, second byte [80, 8F]
// for big endian the bytes are reversed
static constexpr uint16_t first[128]
{
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
Expand All @@ -57,13 +75,41 @@ classify_utf8(char c)
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
BOOST_JSON_MK_NUM(0, 00), BOOST_JSON_MK_NUM(0, 00),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),

BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),

BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),

BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),
BOOST_JSON_MK_NUM(1, 02), BOOST_JSON_MK_NUM(1, 02),

BOOST_JSON_MK_NUM(2, 03), BOOST_JSON_MK_NUM(3, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),

BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(4, 03),
BOOST_JSON_MK_NUM(3, 03), BOOST_JSON_MK_NUM(3, 03),

BOOST_JSON_MK_NUM(5, 04), BOOST_JSON_MK_NUM(6, 04),
BOOST_JSON_MK_NUM(6, 04), BOOST_JSON_MK_NUM(6, 04),
BOOST_JSON_MK_NUM(7, 04), BOOST_JSON_MK_NUM(0, 00),
BOOST_JSON_MK_NUM(0, 00), BOOST_JSON_MK_NUM(0, 00),

0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
};
return first[static_cast<unsigned char>(c & 0x7F)];
Expand All @@ -74,30 +120,33 @@ bool
is_valid_utf8(const char* p, uint16_t first)
{
uint32_t v;
switch(first >> 8)
switch(BOOST_JSON_UTF8_KIND(first))
{
default:
return false;

// 2 bytes, second byte [80, BF]
case 1:
v = load_little_endian<2>(p);
return (v & 0xC000) == 0x8000;
std::memcpy(&v, p, 2);
return (v & BOOST_JSON_MK_NUM2(C0,00)) == BOOST_JSON_MK_NUM2(80,00);

// 3 bytes, second byte [A0, BF]
case 2:
v = load_little_endian<3>(p);
return (v & 0xC0E000) == 0x80A000;
std::memcpy(&v, p, 3);
return (v & BOOST_JSON_MK_NUM3(C0,E0,00))
== BOOST_JSON_MK_NUM3(80,A0,00);

// 3 bytes, second byte [80, BF]
case 3:
v = load_little_endian<3>(p);
return (v & 0xC0C000) == 0x808000;
std::memcpy(&v, p, 3);
return (v & BOOST_JSON_MK_NUM3(C0,C0,00))
== BOOST_JSON_MK_NUM3(80,80,00);

// 3 bytes, second byte [80, 9F]
case 4:
v = load_little_endian<3>(p);
return (v & 0xC0E000) == 0x808000;
std::memcpy(&v, p, 3);
return (v & BOOST_JSON_MK_NUM3(C0,E0,00))
== BOOST_JSON_MK_NUM3(80,80,00);

// 4 bytes, second byte [90, BF]
case 5:
Expand All @@ -106,13 +155,15 @@ is_valid_utf8(const char* p, uint16_t first)

// 4 bytes, second byte [80, BF]
case 6:
v = load_little_endian<4>(p);
return (v & 0xC0C0C000) == 0x80808000;
std::memcpy(&v, p, 4);
return (v & BOOST_JSON_MK_NUM4(C0,C0,C0,00))
== BOOST_JSON_MK_NUM4(80,80,80,00);

// 4 bytes, second byte [80, 8F]
case 7:
v = load_little_endian<4>(p);
return (v & 0xC0C0F000) == 0x80808000;
std::memcpy(&v, p, 4);
return (v & BOOST_JSON_MK_NUM4(C0,C0,F0,00))
== BOOST_JSON_MK_NUM4(80,80,80,00);
}
}

Expand Down
37 changes: 22 additions & 15 deletions test/utf8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,25 @@ class utf8_test
void
testClassifyUtf8()
{
// Exercises detail::classify_utf8 on representative UTF-8 lead bytes.
// The expected values are spelled with BOOST_JSON_MK_NUM(kind, length).
BOOST_TEST((detail::classify_utf8('\x00') & 0xFF) == 0);
BOOST_TEST(detail::classify_utf8('\x00') == 0);
// from code point U+0080 (0xC280 in UTF-8)
BOOST_TEST((detail::classify_utf8('\xC2') & 0xFF) == 2);
BOOST_TEST(detail::classify_utf8('\xC2') == BOOST_JSON_MK_NUM(1, 02));
// from code point U+07FF (0xDFBF in UTF-8)
BOOST_TEST((detail::classify_utf8('\xDF') & 0xFF) == 2);
BOOST_TEST(detail::classify_utf8('\xDF') == BOOST_JSON_MK_NUM(1, 02));
// from code point U+0800 (0xE0A080 in UTF-8)
BOOST_TEST((detail::classify_utf8('\xE0') & 0xFF) == 3);
// from code point U+0FFFF (0xEFBFBF in UTF-8)
BOOST_TEST((detail::classify_utf8('\xEF') & 0xFF) == 3);
// from code point U+010000 (0xF0908080 in UTF-8)
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
// from code point U+010000 (0xF0908080 in UTF-8)
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
// from code point U+010FFFF (0xF48FBFBF in UTF-8)
BOOST_TEST((detail::classify_utf8('\xF4') & 0xFF) == 4);
BOOST_TEST(detail::classify_utf8('\xE0') == BOOST_JSON_MK_NUM(2, 03));
// from code point U+D7B0 (0xED9EB0 in UTF-8)
BOOST_TEST(detail::classify_utf8('\xED') == BOOST_JSON_MK_NUM(4, 03));
// from code point U+FFFF (0xEFBFBF in UTF-8)
BOOST_TEST(detail::classify_utf8('\xEF') == BOOST_JSON_MK_NUM(3, 03));
// from code point U+10000 (0xF0908080 in UTF-8)
BOOST_TEST(detail::classify_utf8('\xF0') == BOOST_JSON_MK_NUM(5, 04));
// from code point U+40000 (0xF1808080 in UTF-8)
BOOST_TEST(detail::classify_utf8('\xF1') == BOOST_JSON_MK_NUM(6, 04));
// from code point U+C0000 (0xF3808080 in UTF-8)
BOOST_TEST(detail::classify_utf8('\xF3') == BOOST_JSON_MK_NUM(6, 04));
// from code point U+10FFFF (0xF48FBFBF in UTF-8)
BOOST_TEST(detail::classify_utf8('\xF4') == BOOST_JSON_MK_NUM(7, 04));
}

void
Expand All @@ -78,9 +82,12 @@ class utf8_test
BOOST_TEST(is_valid_utf8("\xC2\x80")); // code point U+0080
BOOST_TEST(is_valid_utf8("\xDF\xBF")); // code point U+07FF
BOOST_TEST(is_valid_utf8("\xE0\xA0\x80")); // code point U+0800
BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+0FFFF
BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+010000
BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+010FFFF
BOOST_TEST(is_valid_utf8("\xED\x9E\xB0")); // code point U+D7B0
BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+FFFF
BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+10000
BOOST_TEST(is_valid_utf8("\xF1\x80\x80\x80")); // code point U+40000
BOOST_TEST(is_valid_utf8("\xF3\x80\x80\x80")); // code point U+C0000
BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+10FFFF

BOOST_TEST(! is_valid_utf8("\x80"));
BOOST_TEST(! is_valid_utf8("\xBF"));
Expand Down

0 comments on commit 4cc1484

Please sign in to comment.