FE 0.5.0
A header-only C++ library for writing frontends
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1#pragma once
2
3#include <cctype>
4
5#include <istream>
6#include <ostream>
7
8#include "fe/assert.h"
9
10namespace fe::utf8 {
11
// `inline constexpr` (C++17) gives every translation unit the *same* object; the former
// `static constexpr` created one internal-linkage copy per translation unit, which is an ODR
// hazard as soon as an inline function ODR-uses one of these constants.
inline constexpr size_t Max = 4; ///< Maximal number of `char8_t`s of a UTF-8 byte sequence.
inline constexpr char32_t BOM = 0xfeff; ///< [Byte Order Mark](https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8).
inline constexpr char32_t EoF = static_cast<char32_t>(std::istream::traits_type::eof()); ///< End of File.
inline constexpr char32_t Null = 0; ///< The NUL code point (U+0000).
16
/// Returns the expected number of bytes for a UTF-8 char sequence by inspecting the first byte @p c.
/// Returns @c 0 if @p c cannot start a sequence (continuation byte `10xxxxxx` or `11111xxx`).
inline size_t num_bytes(char8_t c) {
    const unsigned lead = c;
    if (lead < 0x80) return 1; // 0xxxxxxx: single byte
    if (lead < 0xC0) return 0; // 10xxxxxx: continuation byte, not a lead byte
    if (lead < 0xE0) return 2; // 110xxxxx
    if (lead < 0xF0) return 3; // 1110xxxx
    if (lead < 0xF8) return 4; // 11110xxx
    return 0;                  // 11111xxx: never valid
}
26
/// Append the low 6 bits of @p b to @p c for converting UTF-8 to UTF-32.
/// @p c is shifted left by 6 bits to make room; the upper bits of @p b are discarded.
inline char32_t append(char32_t c, char32_t b) {
    const char32_t payload = b & 0x3F;
    return (c << 6) | payload;
}
29
/// Get relevant bits of first UTF-8 byte @p c of a @em multi-byte sequence consisting of @p num bytes.
/// The mask keeps 5 bits for @p num == 2, 4 bits for 3 and 3 bits for 4.
inline char32_t first(char32_t c, char32_t num) {
    const char32_t mask = 0b00011111 >> (num - 2);
    return c & mask;
}
32
/// Is the 2nd, 3rd, or 4th byte of a UTF-8 byte sequence valid, i.e. of the form `10xxxxxx`?
/// @returns the extracted 6 payload bits as `char8_t` or `char8_t(-1)` if invalid.
inline char8_t is_valid234(char8_t c) {
    const bool is_continuation = (c >> 6) == 0b10;
    if (!is_continuation) return char8_t(-1);
    return char8_t(c & 0b00111111);
}
38
39/// Decodes the next sequence of bytes from @p is as UTF-32.
40/// @returns Null on error.
41inline char32_t decode(std::istream& is) {
42 char32_t result = is.get();
43 if (result == EoF) return result;
44
45 switch (auto n = utf8::num_bytes(result)) {
46 case 0: return Null;
47 case 1: return result;
48 default:
49 result = utf8::first(result, n);
50
51 for (size_t i = 1; i != n; ++i)
52 if (auto x = is_valid234(is.get()); x != char8_t(-1))
53 result = utf8::append(result, x);
54 else
55 return 0;
56 }
57
58 return result;
59}
60
/// "and, or": writes the single byte `(c32 & a) | o` to @p os and returns @p os for chaining.
/// With the default arguments this emits a continuation byte `10xxxxxx` carrying the low 6 bits of @p c32.
/// Deliberately a plain `inline` function instead of living in an unnamed namespace: the inline
/// function utf8::encode calls it, and referring to a per-translation-unit entity from an inline
/// function defined in a header violates the one-definition rule.
inline std::ostream& ao(std::ostream& os, char32_t c32, char32_t a = 0b00111111, char32_t o = 0b10000000) {
    return os << char((c32 & a) | o);
}

/// Encodes the UTF-32 char @p c32 as UTF-8 and writes the sequence of bytes to @p os.
/// @returns `false` on error, i.e. if @p c32 is a UTF-16 surrogate (U+D800..U+DFFF) or lies beyond U+10FFFF;
/// nothing is written to @p os in this case.
inline bool encode(std::ostream& os, char32_t c32) {
    // Surrogates are no Unicode scalar values; encoding them would produce ill-formed UTF-8.
    if (0xd800 <= c32 && c32 <= 0xdfff) return false;

    // clang-format off
    if (c32 <= 0x00007f) {             ao(os, c32      , 0b11111111, 0b00000000);                                  return true; }
    if (c32 <= 0x0007ff) {          ao(ao(os, c32 >>  6, 0b00011111, 0b11000000),                            c32); return true; }
    if (c32 <= 0x00ffff) {       ao(ao(ao(os, c32 >> 12, 0b00001111, 0b11100000),                c32 >> 6),  c32); return true; }
    if (c32 <= 0x10ffff) {    ao(ao(ao(ao(os, c32 >> 18, 0b00000111, 0b11110000), c32 >> 12),    c32 >> 6),  c32); return true; }
    // clang-format on
    return false;
}
79/// Wrapper for `char32_t` which has a friend ostream operator.
80struct Char32 {
81 Char32(char32_t c)
82 : c(c) {}
83
84 friend std::ostream& operator<<(std::ostream& os, Char32 c) {
85 auto res = utf8::encode(os, c.c);
86 assert_unused(res);
87 return os;
88 }
89
90 char32_t c;
91};
92
93/// @name Wrappers
94///@{
95/// Safe `char32_t`-style wrappers for <[ctype](https://en.cppreference.com/w/cpp/header/cctype)> functions:
96/// > Like all other functions from `<cctype>`, the behavior of `std::isalnum` is undefined if the argument's value is
97/// neither representable as `unsigned char` nor equal to `EOF`.
98// clang-format off
// Each predicate forwards to its <cctype> counterpart only for values in [0, 0xFF]; everything above yields false.
inline bool isalnum (char32_t c) { return c <= 0xFF && std::isalnum (static_cast<int>(c)) != 0; }
inline bool isalpha (char32_t c) { return c <= 0xFF && std::isalpha (static_cast<int>(c)) != 0; }
inline bool isblank (char32_t c) { return c <= 0xFF && std::isblank (static_cast<int>(c)) != 0; }
inline bool iscntrl (char32_t c) { return c <= 0xFF && std::iscntrl (static_cast<int>(c)) != 0; }
inline bool isdigit (char32_t c) { return c <= 0xFF && std::isdigit (static_cast<int>(c)) != 0; }
inline bool isgraph (char32_t c) { return c <= 0xFF && std::isgraph (static_cast<int>(c)) != 0; }
inline bool islower (char32_t c) { return c <= 0xFF && std::islower (static_cast<int>(c)) != 0; }
inline bool isprint (char32_t c) { return c <= 0xFF && std::isprint (static_cast<int>(c)) != 0; }
inline bool ispunct (char32_t c) { return c <= 0xFF && std::ispunct (static_cast<int>(c)) != 0; }
inline bool isspace (char32_t c) { return c <= 0xFF && std::isspace (static_cast<int>(c)) != 0; }
inline bool isupper (char32_t c) { return c <= 0xFF && std::isupper (static_cast<int>(c)) != 0; }
inline bool isxdigit(char32_t c) { return c <= 0xFF && std::isxdigit(static_cast<int>(c)) != 0; }
inline bool isascii (char32_t c) { return c < 0x80; } ///< Is @p c a 7-bit ASCII value?
// Case mapping is delegated to <cctype> only for values in [0, 0xFF]; anything above is returned unchanged.
inline char32_t tolower(char32_t c) { if (c > 0xFF) return c; return static_cast<char32_t>(std::tolower(static_cast<int>(c))); }
inline char32_t toupper(char32_t c) { if (c > 0xFF) return c; return static_cast<char32_t>(std::toupper(static_cast<int>(c))); }
114
/// Is @p c within the closed interval [@p begin, @p finis]?
inline bool isrange(char32_t c, char32_t begin, char32_t finis) { return !(c < begin || finis < c); }
/// Yields a predicate that checks its argument against the closed interval [@p begin, @p finis].
inline auto isrange(char32_t begin, char32_t finis) {
    return [begin, finis](char32_t c) { return isrange(c, begin, finis); };
}

inline bool isodigit(char32_t c) { return isrange(c, U'0', U'7'); } ///< Is octal digit?
inline bool isbdigit(char32_t c) { return isrange(c, U'0', U'1'); } ///< Is binary digit?
121// clang-format on
122///@}
123
124/// @name any
125///@{
/// Is @p c equal to @p d?
inline bool _any(char32_t c, char32_t d) { return c == d; }
/// Is @p c equal to @p d or to any of the remaining @p args? Candidates are tested from left to right.
template<class... T> inline bool _any(char32_t c, char32_t d, T... args) {
    return c == d || ((c == static_cast<char32_t>(args)) || ...);
}
/// Yields a predicate that checks whether its argument equals any of @p args.
template<class... T> inline auto any(T... args) {
    return [args...](char32_t c) { return _any(c, args...); };
}
132///@}
133
134} // namespace fe::utf8
#define assert_unused(x)
Definition assert.h:35
bool isbdigit(char32_t c)
Is binary digit?
Definition utf8.h:120
bool isgraph(char32_t c)
Definition utf8.h:104
bool _any(char32_t c, char32_t d)
Definition utf8.h:127
bool isupper(char32_t c)
Definition utf8.h:109
static constexpr char32_t BOM
Byte Order Mark.
Definition utf8.h:13
bool isblank(char32_t c)
Definition utf8.h:101
bool isodigit(char32_t c)
Is octal digit?
Definition utf8.h:119
char8_t is_valid234(char8_t c)
Is the 2nd, 3rd, or 4th byte of a UTF-8 byte sequence valid?
Definition utf8.h:35
bool isascii(char32_t c)
Definition utf8.h:111
bool iscntrl(char32_t c)
Definition utf8.h:102
bool isrange(char32_t c, char32_t begin, char32_t finis)
Is c within [begin, finis]?
Definition utf8.h:116
char32_t decode(std::istream &is)
Decodes the next sequence of bytes from is as UTF-32.
Definition utf8.h:41
bool encode(std::ostream &os, char32_t c32)
Encodes the UTF-32 char c32 as UTF-8 and writes the sequence of bytes to os.
Definition utf8.h:70
char32_t tolower(char32_t c)
Definition utf8.h:112
size_t num_bytes(char8_t c)
Returns the expected number of bytes for a UTF-8 char sequence by inspecting the first byte.
Definition utf8.h:19
bool isprint(char32_t c)
Definition utf8.h:106
char32_t toupper(char32_t c)
Definition utf8.h:113
bool isdigit(char32_t c)
Definition utf8.h:103
static constexpr size_t Max
Maximal number of char8_ts of a UTF-8 byte sequence.
Definition utf8.h:12
bool isalpha(char32_t c)
Definition utf8.h:100
auto any(T... args)
Definition utf8.h:129
bool islower(char32_t c)
Definition utf8.h:105
bool isalnum(char32_t c)
Definition utf8.h:99
static constexpr char32_t EoF
End of File.
Definition utf8.h:14
bool isspace(char32_t c)
Definition utf8.h:108
bool isxdigit(char32_t c)
Definition utf8.h:110
char32_t append(char32_t c, char32_t b)
Append b to c for converting UTF-8 to UTF-32.
Definition utf8.h:28
char32_t first(char32_t c, char32_t num)
Get relevant bits of first UTF-8 byte c of a multi-byte sequence consisting of num bytes.
Definition utf8.h:31
bool ispunct(char32_t c)
Definition utf8.h:107
static constexpr char32_t Null
Definition utf8.h:15
Wrapper for char32_t which has a friend ostream operator.
Definition utf8.h:80
friend std::ostream & operator<<(std::ostream &os, Char32 c)
Definition utf8.h:84
char32_t c
Definition utf8.h:90
Char32(char32_t c)
Definition utf8.h:81