FE 0.6.1
A header-only C++ library for writing frontends
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1#pragma once
2
3#include <cctype>
4
5#include <istream>
6#include <ostream>
7
8#include "fe/assert.h"
9
10namespace fe::utf8 {
11
static constexpr size_t Max = 4; ///< An UTF-8 byte sequence consists of at most this many `char8_t`s.
static constexpr char32_t BOM = 0xfeff; ///< [Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8).
static constexpr char32_t EoF = static_cast<char32_t>(std::istream::traits_type::eof()); ///< End of File as reported by `std::istream`.
static constexpr char32_t Null = 0; ///< U+0000 NULL.
static constexpr char32_t Invalid = 0x110000; ///< Marker for an invalid UTF-8 sequence.
17
/// Derives the length of an UTF-8 byte sequence from its first byte @p c.
/// @returns the expected number of bytes (`1` to `4`) or `0` if @p c is not a valid first byte.
inline size_t num_bytes(char8_t c) {
    const unsigned lead = c;
    if ((lead >> 7) == 0b0) return 1;     // 0xxxxxxx
    if ((lead >> 5) == 0b110) return 2;   // 110xxxxx
    if ((lead >> 4) == 0b1110) return 3;  // 1110xxxx
    if ((lead >> 3) == 0b11110) return 4; // 11110xxx
    return 0;
}
27
/// Shifts @p c left by 6 bits and merges in the low 6 bits of @p b.
/// Used to build up an UTF-32 value while converting from UTF-8.
inline char32_t append(char32_t c, char8_t b) {
    const char32_t payload = b & 0b00111111;
    return (c << 6) | payload;
}
30
/// Extracts the payload bits of the first UTF-8 byte @p c of a @em multi-byte sequence that is @p num bytes long.
inline char32_t first(char32_t c, char32_t num) {
    // A 2-byte lead carries 5 payload bits; every additional byte of length costs one more bit.
    const char32_t payload_mask = 0b00011111 >> (num - 2);
    return c & payload_mask;
}
33
/// Smallest Unicode scalar value that requires an UTF-8 sequence of @p num bytes.
/// @returns `0x110000` if @p num is not in `1..4`.
inline char32_t min_code_point(size_t num) {
    if (num == 1) return 0x000000;
    if (num == 2) return 0x000080;
    if (num == 3) return 0x000800;
    if (num == 4) return 0x010000;
    return 0x110000;
}
44
/// Checks whether @p c is a valid Unicode scalar value:
/// at most `0x10ffff` and outside of the range `0xd800..0xdfff`.
inline bool is_scalar_value(char32_t c) {
    if (c > 0x10ffff) return false;
    return c < 0xd800 || c > 0xdfff;
}
47
/// Validates @p c as the 2nd, 3rd, or 4th byte of an UTF-8 byte sequence, i.e. whether it looks like `10xxxxxx`.
/// @returns the low 6 bits of @p c as `char8_t`, or `char8_t(-1)` if @p c is invalid.
inline char8_t is_valid234(char8_t c) {
    const bool is_continuation = (c >> 6) == 0b10;
    if (!is_continuation) return char8_t(-1);
    return c & char8_t(0b00111111);
}
53
54/// Decodes the next sequence of bytes from @p is as UTF-32.
55/// @returns Invalid on error.
56inline char32_t decode(std::istream& is) {
57 char32_t result = is.get();
58 if (result == EoF) return result;
59
60 switch (auto n = utf8::num_bytes(char8_t(result))) {
61 case 0: return Invalid;
62 case 1: return result;
63 default:
64 result = utf8::first(result, n);
65
66 for (size_t i = 1; i != n; ++i)
67 if (auto x = is_valid234(is.get()); x != char8_t(-1))
68 result = utf8::append(result, x);
69 else
70 return Invalid;
71
72 if (result < utf8::min_code_point(n) || !utf8::is_scalar_value(result)) return Invalid;
73 }
74
75 return result;
76}
77
/// "and, or": writes the single byte `(c32 & a) | o` to @p os.
/// With the default arguments this keeps the low 6 bits of @p c32 and sets the `10xxxxxx` marker bits.
/// @note This is an `inline` function rather than a member of an unnamed namespace:
/// an unnamed namespace in a header gives every translation unit its own copy,
/// so the `inline` function utf8::encode would refer to a different entity in each of them.
/// @returns @p os to allow chaining.
inline std::ostream& ao(std::ostream& os, char32_t c32, char32_t a = 0b00111111, char32_t o = 0b10000000) {
    return os << static_cast<char>((c32 & a) | o);
}

/// Encodes the UTF-32 char @p c32 as UTF-8 and writes the sequence of bytes to @p os.
/// Nothing is written if @p c32 cannot be encoded:
/// this is the case for surrogates (`0xd800..0xdfff`) and for values above `0x10ffff`.
/// @returns `false` on error.
inline bool encode(std::ostream& os, char32_t c32) {
    // Surrogates are not Unicode scalar values; encoding them would produce ill-formed UTF-8.
    if (0xd800 <= c32 && c32 <= 0xdfff) return false;

    // For each length: the first byte gets the length marker and the topmost bits,
    // every following byte carries the next lower 6 bits.
    // clang-format off
    if (c32 <= 0x00007f) {             ao(os, c32      , 0b11111111, 0b00000000);                                return true; }
    if (c32 <= 0x0007ff) {          ao(ao(os, c32 >>  6, 0b00011111, 0b11000000), c32);                          return true; }
    if (c32 <= 0x00ffff) {       ao(ao(ao(os, c32 >> 12, 0b00001111, 0b11100000), c32 >> 6), c32);               return true; }
    if (c32 <= 0x10ffff) {    ao(ao(ao(ao(os, c32 >> 18, 0b00000111, 0b11110000), c32 >> 12), c32 >> 6), c32);   return true; }
    // clang-format on
    return false;
}
96/// Wrapper for `char32_t` which has a friend ostream operator.
97struct Char32 {
98 Char32(char32_t c)
99 : c(c) {}
100
101 friend std::ostream& operator<<(std::ostream& os, Char32 c) {
102 auto res = utf8::encode(os, c.c);
103 assert_unused(res);
104 return os;
105 }
106
107 char32_t c;
108};
109
/// @name Wrappers
/// Safe `char32_t`-style wrappers for <[ctype](https://en.cppreference.com/w/cpp/header/cctype)> functions:
/// > Like all other functions from `<cctype>`, the behavior of `std::isalnum` is undefined if the argument's value is
/// neither representable as `unsigned char` nor equal to `EOF`.
///
/// Hence, each predicate yields `false` and each conversion yields its argument unchanged
/// for any value above `0xFF` without ever calling into `<cctype>`.
///@{
// clang-format off
inline bool isalnum (char32_t c) { return c <= 0xFF && std::isalnum (static_cast<int>(c)) != 0; }
inline bool isalpha (char32_t c) { return c <= 0xFF && std::isalpha (static_cast<int>(c)) != 0; }
inline bool isblank (char32_t c) { return c <= 0xFF && std::isblank (static_cast<int>(c)) != 0; }
inline bool iscntrl (char32_t c) { return c <= 0xFF && std::iscntrl (static_cast<int>(c)) != 0; }
inline bool isdigit (char32_t c) { return c <= 0xFF && std::isdigit (static_cast<int>(c)) != 0; }
inline bool isgraph (char32_t c) { return c <= 0xFF && std::isgraph (static_cast<int>(c)) != 0; }
inline bool islower (char32_t c) { return c <= 0xFF && std::islower (static_cast<int>(c)) != 0; }
inline bool isprint (char32_t c) { return c <= 0xFF && std::isprint (static_cast<int>(c)) != 0; }
inline bool ispunct (char32_t c) { return c <= 0xFF && std::ispunct (static_cast<int>(c)) != 0; }
inline bool isspace (char32_t c) { return c <= 0xFF && std::isspace (static_cast<int>(c)) != 0; }
inline bool isupper (char32_t c) { return c <= 0xFF && std::isupper (static_cast<int>(c)) != 0; }
inline bool isxdigit(char32_t c) { return c <= 0xFF && std::isxdigit(static_cast<int>(c)) != 0; }
inline bool isascii (char32_t c) { return !(c > 0x7F); }
inline char32_t tolower(char32_t c) { return c <= 0xFF ? static_cast<char32_t>(std::tolower(static_cast<int>(c))) : c; }
inline char32_t toupper(char32_t c) { return c <= 0xFF ? static_cast<char32_t>(std::toupper(static_cast<int>(c))) : c; }
131
/// Does @p c lie in the closed interval [begin, finis]?
inline bool isrange(char32_t c, char32_t begin, char32_t finis) { return !(c < begin || finis < c); }
/// Yields a predicate that tests its argument against the closed interval [begin, finis].
inline auto isrange(char32_t begin, char32_t finis) { return [begin, finis](char32_t c) { return isrange(c, begin, finis); }; }

inline bool isodigit(char32_t c) { return isrange(c, U'0', U'7'); } ///< Is octal digit?
inline bool isbdigit(char32_t c) { return isrange(c, U'0', U'1'); } ///< Is binary digit?
// clang-format on
///@}
140
/// @name any
/// Does @p c equal any of the remaining arguments?
///@{
inline bool _any(char32_t c, char32_t d) { return c == d; }
/// Compares @p c with @p d first and then, from left to right, with each of @p args until one matches.
template<class... T>
inline bool _any(char32_t c, char32_t d, T... args) {
    return c == d || (_any(c, args) || ...);
}
/// Yields a predicate that tests its argument against all of @p args.
template<class... T>
inline auto any(T... args) {
    return [=](char32_t c) { return _any(c, args...); };
}
///@}
154
155} // namespace fe::utf8
#define assert_unused(x)
Definition assert.h:35
bool isbdigit(char32_t c)
Is binary digit?
Definition utf8.h:137
bool isgraph(char32_t c)
Definition utf8.h:121
char32_t min_code_point(size_t num)
Minimum Unicode scalar value representable in an UTF-8 sequence of num bytes.
Definition utf8.h:35
static constexpr char32_t Invalid
Invalid UTF-8 sequence.
Definition utf8.h:16
bool _any(char32_t c, char32_t d)
Definition utf8.h:144
bool isupper(char32_t c)
Definition utf8.h:126
static constexpr char32_t BOM
Byte Order Mark.
Definition utf8.h:13
bool isblank(char32_t c)
Definition utf8.h:118
bool isodigit(char32_t c)
Is octal digit?
Definition utf8.h:136
char8_t is_valid234(char8_t c)
Is the 2nd, 3rd, or 4th byte of an UTF-8 byte sequence valid?
Definition utf8.h:50
bool isascii(char32_t c)
Definition utf8.h:128
bool iscntrl(char32_t c)
Definition utf8.h:119
bool isrange(char32_t c, char32_t begin, char32_t finis)
Is c within [begin, finis]?
Definition utf8.h:133
char32_t decode(std::istream &is)
Decodes the next sequence of bytes from is as UTF-32.
Definition utf8.h:56
bool encode(std::ostream &os, char32_t c32)
Encodes the UTF-32 char c32 as UTF-8 and writes the sequence of bytes to os.
Definition utf8.h:87
char32_t tolower(char32_t c)
Definition utf8.h:129
size_t num_bytes(char8_t c)
Returns the expected number of bytes for an UTF-8 char sequence by inspecting the first byte.
Definition utf8.h:20
bool isprint(char32_t c)
Definition utf8.h:123
char32_t toupper(char32_t c)
Definition utf8.h:130
bool isdigit(char32_t c)
Definition utf8.h:120
static constexpr size_t Max
Maximal number of char8_ts of an UTF-8 byte sequence.
Definition utf8.h:12
bool isalpha(char32_t c)
Definition utf8.h:117
auto any(T... args)
Definition utf8.h:150
bool islower(char32_t c)
Definition utf8.h:122
bool isalnum(char32_t c)
Definition utf8.h:116
static constexpr char32_t EoF
End of File.
Definition utf8.h:14
bool isspace(char32_t c)
Definition utf8.h:125
bool isxdigit(char32_t c)
Definition utf8.h:127
char32_t first(char32_t c, char32_t num)
Get relevant bits of first UTF-8 byte c of a multi-byte sequence consisting of num bytes.
Definition utf8.h:32
bool ispunct(char32_t c)
Definition utf8.h:124
char32_t append(char32_t c, char8_t b)
Append b to c for converting UTF-8 to UTF-32.
Definition utf8.h:29
static constexpr char32_t Null
U+0000 NULL.
Definition utf8.h:15
bool is_scalar_value(char32_t c)
Is c a valid Unicode scalar value?
Definition utf8.h:46
friend std::ostream & operator<<(std::ostream &os, Char32 c)
Definition utf8.h:101
char32_t c
Definition utf8.h:107
Char32(char32_t c)
Definition utf8.h:98