FE 0.9.0
Header-only C++ frontend library
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1#pragma once
2
3#include <cctype>
4
5#include <istream>
6#include <ostream>
7
8#include "fe/assert.h"
9
10/// UTF-8 helpers for decoding byte streams, encoding `char32_t` values, and running
11/// ASCII-style character classification on `char32_t`.
12///
13/// The central entry points are @ref decode and @ref encode. Decoding returns
14/// sentinel values such as @ref EoF and @ref Invalid instead of throwing.
15namespace fe::utf8 {
16
// `inline constexpr` (rather than `static constexpr`) gives the whole program a single definition of
// each constant instead of one internal-linkage copy per translation unit that includes this header.

/// Maximum number of `char8_t`s in a single UTF-8 byte sequence.
inline constexpr size_t Max = 4;
/// [Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8).
inline constexpr char32_t BOM = 0xfeff;
/// End of stream sentinel returned by @ref decode: the stream traits' `eof()` value converted to `char32_t`.
inline constexpr char32_t EoF = static_cast<char32_t>(std::istream::traits_type::eof());
/// U+0000 NULL returned unchanged by @ref decode.
inline constexpr char32_t Null = 0;
/// Sentinel returned by @ref decode for malformed UTF-8; one past the largest code point U+10FFFF.
inline constexpr char32_t Invalid = 0x110000;
23
/// Classifies the first byte @p c of a UTF-8 byte sequence.
/// @returns the total number of bytes (1 to 4) announced by the leading bit pattern of @p c,
/// or @c 0 if @p c matches none of the four lead-byte patterns (e.g. a continuation byte `10xxxxxx`).
constexpr size_t num_bytes(char8_t c) noexcept {
    const unsigned lead = c;
    if ((lead >> 7) == 0b0u) return 1;     // 0xxxxxxx
    if ((lead >> 5) == 0b110u) return 2;   // 110xxxxx
    if ((lead >> 4) == 0b1110u) return 3;  // 1110xxxx
    if ((lead >> 3) == 0b11110u) return 4; // 11110xxx
    return 0;
}
33
/// Extends the partially converted value @p c by one more UTF-8 byte @p b:
/// the bits of @p c move up by six positions and the low six bits of @p b fill the gap.
constexpr char32_t append(char32_t c, char8_t b) noexcept {
    const char32_t payload = static_cast<char32_t>(b) & 0x3f;
    return (c << 6) | payload;
}
36
/// Extracts the payload bits of the first UTF-8 byte @p c of a @em multi-byte sequence of @p num bytes.
/// The mask keeps the low 5 bits for @p num == 2, 4 bits for @p num == 3, and 3 bits for @p num == 4.
/// @note @p num must be at least 2: it is unsigned, so `num - 2` wraps around for smaller values and
/// the shift is no longer meaningful.
constexpr char32_t first(char32_t c, char32_t num) noexcept {
    const char32_t shift = num - 2;
    const auto mask      = 0b00011111 >> shift;
    return c & mask;
}
39
/// Smallest value that is representable in a UTF-8 sequence of exactly @p num bytes.
/// @returns the lower bound for @p num in 1..4; for every other @p num the value `0x110000`.
constexpr char32_t min_code_point(size_t num) noexcept {
    if (num == 1) return 0x000000;
    if (num == 2) return 0x000080;
    if (num == 3) return 0x000800;
    if (num == 4) return 0x010000;
    return 0x110000; // not a supported sequence length
}
50
/// Is @p c a valid Unicode scalar value, i.e. at most U+10FFFF and not in the range U+D800..U+DFFF?
constexpr bool is_scalar_value(char32_t c) noexcept {
    if (c > 0x10ffff) return false;
    const bool in_d800_dfff = c >= 0xd800 && c <= 0xdfff;
    return !in_d800_dfff;
}
53
/// Checks whether @p c has the form `10xxxxxx` required of the 2nd, 3rd, or 4th byte of a UTF-8 byte sequence.
/// @returns the six payload bits of @p c if so, or `char8_t(-1)` if not.
constexpr char8_t is_valid234(char8_t c) noexcept {
    const bool has_10_prefix = (c >> 6) == 0b10;
    if (!has_10_prefix) return char8_t(-1);
    return char8_t(c & 0b00111111);
}
59
60/// Decodes the next UTF-8 sequence from @p is into a single `char32_t`.
61///
62/// Returns @ref EoF when the stream is exhausted and @ref Invalid for malformed,
63/// overlong, surrogate, or otherwise non-scalar encodings.
64inline char32_t decode(std::istream& is) {
65 char32_t result = is.get();
66 if (result == EoF) return result;
67
68 switch (auto n = utf8::num_bytes(char8_t(result))) {
69 case 0: return Invalid;
70 case 1: return result;
71 default:
72 result = utf8::first(result, n);
73
74 for (size_t i = 1; i != n; ++i)
75 if (auto x = is_valid234(is.get()); x != char8_t(-1))
76 result = utf8::append(result, x);
77 else
78 return Invalid;
79
80 if (result < utf8::min_code_point(n) || !utf8::is_scalar_value(result)) return Invalid;
81 }
82
83 return result;
84}
85
/// "And, or": writes a single byte to @p os, built from the bits of @p c32 selected by the mask @p a
/// and combined with the marker bits @p o.
/// With the default arguments this is `10xxxxxx` carrying the low six bits of @p c32.
/// @returns @p os, so that calls can be nested to emit several bytes in order.
///
/// Defined as a plain `inline` function instead of inside an unnamed namespace: an unnamed namespace
/// in a header gives every translation unit its own internal-linkage copy, which an `inline` function
/// with external linkage such as encode() must not depend on.
inline std::ostream& ao(std::ostream& os, char32_t c32, char32_t a = 0b00111111, char32_t o = 0b10000000) {
    return os << static_cast<char>((c32 & a) | o);
}
92
93/// Encodes @p c32 as UTF-8 and writes the resulting bytes to @p os.
94///
95/// Returns `false` when @p c32 is outside the encodable range.
96inline bool encode(std::ostream& os, char32_t c32) {
97 // clang-format off
98 if (c32 <= 0x00007f) { ao(os, c32 , 0b11111111, 0b00000000); return true; }
99 if (c32 <= 0x0007ff) { ao(ao(os, c32 >> 6, 0b00011111, 0b11000000), c32); return true; }
100 if (c32 <= 0x00ffff) { ao(ao(ao(os, c32 >> 12, 0b00001111, 0b11100000), c32 >> 6), c32); return true; }
101 if (c32 <= 0x10ffff) { ao(ao(ao(ao(os, c32 >> 18, 0b00000111, 0b11110000), c32 >> 12), c32 >> 6), c32); return true; }
102 // clang-format on
103 return false;
104}
105/// Wrapper for `char32_t` with an `operator<<` that writes UTF-8.
106struct Char32 {
107 Char32(char32_t c)
108 : c(c) {}
109
110 friend std::ostream& operator<<(std::ostream& os, Char32 c) {
111 auto res = utf8::encode(os, c.c);
112 assert_unused(res);
113 return os;
114 }
115
116 char32_t c;
117};
118
119/// @name Wrappers
120/// Safe `char32_t`-style wrappers for <[ctype](https://en.cppreference.com/w/cpp/header/cctype)> functions:
121/// > Like all other functions from `<cctype>`, the behavior of `std::isalnum` is undefined if the argument's value is
122/// neither representable as `unsigned char` nor equal to `EOF`.
123///@{
124// clang-format off
// Each classifier forwards to its <cctype> counterpart only when `c` fits into 8 bits (c <= 0xFF);
// every larger value is classified as `false` and left unchanged by the case conversions.
inline bool isalnum(char32_t c) noexcept { return c <= 0xFF && std::isalnum(static_cast<int>(c)) != 0; }
inline bool isalpha(char32_t c) noexcept { return c <= 0xFF && std::isalpha(static_cast<int>(c)) != 0; }
inline bool isblank(char32_t c) noexcept { return c <= 0xFF && std::isblank(static_cast<int>(c)) != 0; }
inline bool iscntrl(char32_t c) noexcept { return c <= 0xFF && std::iscntrl(static_cast<int>(c)) != 0; }
inline bool isdigit(char32_t c) noexcept { return c <= 0xFF && std::isdigit(static_cast<int>(c)) != 0; }
inline bool isgraph(char32_t c) noexcept { return c <= 0xFF && std::isgraph(static_cast<int>(c)) != 0; }
inline bool islower(char32_t c) noexcept { return c <= 0xFF && std::islower(static_cast<int>(c)) != 0; }
inline bool isprint(char32_t c) noexcept { return c <= 0xFF && std::isprint(static_cast<int>(c)) != 0; }
inline bool ispunct(char32_t c) noexcept { return c <= 0xFF && std::ispunct(static_cast<int>(c)) != 0; }
inline bool isspace(char32_t c) noexcept { return c <= 0xFF && std::isspace(static_cast<int>(c)) != 0; }
inline bool isupper(char32_t c) noexcept { return c <= 0xFF && std::isupper(static_cast<int>(c)) != 0; }
inline bool isxdigit(char32_t c) noexcept { return c <= 0xFF && std::isxdigit(static_cast<int>(c)) != 0; }
/// Is @p c below 0x80?
inline bool isascii(char32_t c) noexcept { return c < 0x80; }
inline char32_t tolower(char32_t c) noexcept {
    return c <= 0xFF ? static_cast<char32_t>(std::tolower(static_cast<int>(c))) : c;
}
inline char32_t toupper(char32_t c) noexcept {
    return c <= 0xFF ? static_cast<char32_t>(std::toupper(static_cast<int>(c))) : c;
}
140
/// Is @p c within the closed interval [@p begin, @p finis]?
constexpr bool isrange(char32_t c, char32_t begin, char32_t finis) noexcept { return !(c < begin || finis < c); }
/// Builds a predicate that tests its argument against the closed interval [@p begin, @p finis].
constexpr auto isrange(char32_t begin, char32_t finis) noexcept {
    return [begin, finis](char32_t c) { return isrange(c, begin, finis); };
}

/// Is octal digit?
constexpr bool isodigit(char32_t c) noexcept { return isrange(c, U'0', U'7'); }
/// Is binary digit?
constexpr bool isbdigit(char32_t c) noexcept { return isrange(c, U'0', U'1'); }
147// clang-format on
148///@}
149
150/// @name any
151/// Build a predicate that checks whether a code point matches any of the given values.
152///@{
/// Matching against an empty set of candidates: never true.
constexpr bool _any(char32_t) noexcept { return false; }
/// Does @p c equal @p d?
constexpr bool _any(char32_t c, char32_t d) noexcept { return c == d; }
/// Does @p c equal @p d or any of @p args? Each candidate is converted to `char32_t` before comparing.
template<class... T>
constexpr bool _any(char32_t c, char32_t d, T... args) {
    // Fold over the two-argument overload instead of recursing through the pack.
    return c == d || (_any(c, args) || ...);
}
/// Builds a predicate that is true for a `char32_t` equal to any of @p args.
/// With no arguments the predicate is always false.
template<class... T>
constexpr auto any(T... args) {
    return [=](char32_t c) { return _any(c, args...); };
}
162///@}
163
164} // namespace fe::utf8
#define assert_unused(x)
Definition assert.h:35
UTF-8 helpers for decoding byte streams, encoding char32_t values, and running ASCII-style character ...
Definition utf8.h:15
bool isalnum(char32_t c) noexcept
Definition utf8.h:125
bool isdigit(char32_t c) noexcept
Definition utf8.h:129
static constexpr char32_t Invalid
Sentinel returned by decode for malformed UTF-8.
Definition utf8.h:22
bool _any(char32_t c, char32_t d)
Definition utf8.h:153
static constexpr char32_t BOM
Byte Order Mark.
Definition utf8.h:18
char32_t tolower(char32_t c) noexcept
Definition utf8.h:138
bool isascii(char32_t c) noexcept
Definition utf8.h:137
constexpr size_t num_bytes(char8_t c) noexcept
Returns the expected number of bytes for an UTF-8 char sequence by inspecting the first byte.
Definition utf8.h:26
constexpr bool isbdigit(char32_t c) noexcept
Is binary digit?
Definition utf8.h:146
constexpr char32_t first(char32_t c, char32_t num) noexcept
Get relevant bits of first UTF-8 byte c of a multi-byte sequence consisting of num bytes.
Definition utf8.h:38
char32_t decode(std::istream &is)
Decodes the next UTF-8 sequence from is into a single char32_t.
Definition utf8.h:64
bool isxdigit(char32_t c) noexcept
Definition utf8.h:136
bool encode(std::ostream &os, char32_t c32)
Encodes c32 as UTF-8 and writes the resulting bytes to os.
Definition utf8.h:96
constexpr char8_t is_valid234(char8_t c) noexcept
Is the 2nd, 3rd, or 4th byte of an UTF-8 byte sequence valid?
Definition utf8.h:56
bool isprint(char32_t c) noexcept
Definition utf8.h:132
constexpr bool isodigit(char32_t c) noexcept
Is octal digit?
Definition utf8.h:145
constexpr char32_t min_code_point(size_t num) noexcept
Minimum Unicode scalar value representable in an UTF-8 sequence of num bytes.
Definition utf8.h:41
bool isblank(char32_t c) noexcept
Definition utf8.h:127
bool isalpha(char32_t c) noexcept
Definition utf8.h:126
static constexpr size_t Max
Maximal number of char8_ts of an UTF-8 byte sequence.
Definition utf8.h:17
bool isupper(char32_t c) noexcept
Definition utf8.h:135
char32_t toupper(char32_t c) noexcept
Definition utf8.h:139
auto any(T... args)
Definition utf8.h:159
bool iscntrl(char32_t c) noexcept
Definition utf8.h:128
bool ispunct(char32_t c) noexcept
Definition utf8.h:133
static constexpr char32_t EoF
End of stream sentinel returned by decode.
Definition utf8.h:20
constexpr char32_t append(char32_t c, char8_t b) noexcept
Append b to c for converting UTF-8 to UTF-32.
Definition utf8.h:35
constexpr bool isrange(char32_t c, char32_t begin, char32_t finis) noexcept
Is c within [begin, finis]?
Definition utf8.h:142
bool islower(char32_t c) noexcept
Definition utf8.h:131
constexpr bool is_scalar_value(char32_t c) noexcept
Is c a valid Unicode scalar value?
Definition utf8.h:52
bool isspace(char32_t c) noexcept
Definition utf8.h:134
bool isgraph(char32_t c) noexcept
Definition utf8.h:130
static constexpr char32_t Null
U+0000 NULL returned unchanged by decode.
Definition utf8.h:21
friend std::ostream & operator<<(std::ostream &os, Char32 c)
Definition utf8.h:110
char32_t c
Definition utf8.h:116
Char32(char32_t c)
Definition utf8.h:107