blob: 9bad81d4d048f84dd85fe07ebc9b50928d5fc507 [file] [log] [blame]
Andrew Geissler595f6302022-01-24 19:11:47 +00001From 51c500269bf53749b107807d84271385fad35628 Mon Sep 17 00:00:00 2001
2From: Marek Polacek <polacek@redhat.com>
3Date: Wed, 6 Oct 2021 14:33:59 -0400
4Subject: [PATCH] libcpp: Implement -Wbidi-chars for CVE-2021-42574 [PR103026]
5
6From a link below:
7"An issue was discovered in the Bidirectional Algorithm in the Unicode
8Specification through 14.0. It permits the visual reordering of
9characters via control sequences, which can be used to craft source code
10that renders different logic than the logical ordering of tokens
11ingested by compilers and interpreters. Adversaries can leverage this to
12encode source code for compilers accepting Unicode such that targeted
13vulnerabilities are introduced invisibly to human reviewers."
14
15More info:
16https://nvd.nist.gov/vuln/detail/CVE-2021-42574
17https://trojansource.codes/
18
19This is not a compiler bug. However, to mitigate the problem, this patch
20implements -Wbidi-chars=[none|unpaired|any] to warn about possibly
21misleading Unicode bidirectional control characters the preprocessor may
22encounter.
23
24The default is =unpaired, which warns about improperly terminated
25bidirectional control characters; e.g., an LRE without its corresponding PDF.
26The level =any warns about any use of bidirectional control characters.
27
28This patch handles both UCNs and UTF-8 characters. UCNs designating
29bidi characters in identifiers are accepted since r204886. Then r217144
30enabled -fextended-identifiers by default. Extended characters in C/C++
31identifiers have been accepted since r275979. However, this patch still
32warns about mixing UTF-8 and UCN bidi characters; there seems to be no
33good reason to allow mixing them.
34
35We warn in different contexts: comments (both C and C++-style), string
36literals, character constants, and identifiers. As expected, UCNs are ignored
37in comments and raw string literals. The bidirectional control characters
38can nest, so this patch handles that as well.
39
40I have neither included nor tested this with Fortran (which also has
41string literals and line comments).
42
43Dave M. posted patches improving diagnostics involving Unicode characters.
44This patch does not make use of that new infrastructure yet.
45
46 PR preprocessor/103026
47
48gcc/c-family/ChangeLog:
49
50 * c.opt (Wbidi-chars, Wbidi-chars=): New option.
51
52gcc/ChangeLog:
53
54 * doc/invoke.texi: Document -Wbidi-chars.
55
56libcpp/ChangeLog:
57
58 * include/cpplib.h (enum cpp_bidirectional_level): New.
59 (struct cpp_options): Add cpp_warn_bidirectional.
60 (enum cpp_warning_reason): Add CPP_W_BIDIRECTIONAL.
61 * internal.h (struct cpp_reader): Add warn_bidi_p member
62 function.
63 * init.c (cpp_create_reader): Set cpp_warn_bidirectional.
64 * lex.c (bidi): New namespace.
65 (get_bidi_utf8): New function.
66 (get_bidi_ucn): Likewise.
67 (maybe_warn_bidi_on_close): Likewise.
68 (maybe_warn_bidi_on_char): Likewise.
69 (_cpp_skip_block_comment): Implement warning about bidirectional
70 control characters.
71 (skip_line_comment): Likewise.
72 (forms_identifier_p): Likewise.
73 (lex_identifier): Likewise.
74 (lex_string): Likewise.
75 (lex_raw_string): Likewise.
76
77gcc/testsuite/ChangeLog:
78
79 * c-c++-common/Wbidi-chars-1.c: New test.
80 * c-c++-common/Wbidi-chars-2.c: New test.
81 * c-c++-common/Wbidi-chars-3.c: New test.
82 * c-c++-common/Wbidi-chars-4.c: New test.
83 * c-c++-common/Wbidi-chars-5.c: New test.
84 * c-c++-common/Wbidi-chars-6.c: New test.
85 * c-c++-common/Wbidi-chars-7.c: New test.
86 * c-c++-common/Wbidi-chars-8.c: New test.
87 * c-c++-common/Wbidi-chars-9.c: New test.
88 * c-c++-common/Wbidi-chars-10.c: New test.
89 * c-c++-common/Wbidi-chars-11.c: New test.
90 * c-c++-common/Wbidi-chars-12.c: New test.
91 * c-c++-common/Wbidi-chars-13.c: New test.
92 * c-c++-common/Wbidi-chars-14.c: New test.
93 * c-c++-common/Wbidi-chars-15.c: New test.
94 * c-c++-common/Wbidi-chars-16.c: New test.
95 * c-c++-common/Wbidi-chars-17.c: New test.
96
97CVE: CVE-2021-42574
98Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=51c500269bf53749b107807d84271385fad35628]
99Signed-off-by: Pgowda <pgowda.cve@gmail.com>
100
101---
102 gcc/c-family/c.opt | 24 ++
103 gcc/doc/invoke.texi | 21 +-
104 gcc/testsuite/c-c++-common/Wbidi-chars-1.c | 12 +
105 gcc/testsuite/c-c++-common/Wbidi-chars-10.c | 27 ++
106 gcc/testsuite/c-c++-common/Wbidi-chars-11.c | 13 +
107 gcc/testsuite/c-c++-common/Wbidi-chars-12.c | 19 +
108 gcc/testsuite/c-c++-common/Wbidi-chars-13.c | 17 +
109 gcc/testsuite/c-c++-common/Wbidi-chars-14.c | 38 ++
110 gcc/testsuite/c-c++-common/Wbidi-chars-15.c | 59 +++
111 gcc/testsuite/c-c++-common/Wbidi-chars-16.c | 26 ++
112 gcc/testsuite/c-c++-common/Wbidi-chars-17.c | 30 ++
113 gcc/testsuite/c-c++-common/Wbidi-chars-2.c | 9 +
114 gcc/testsuite/c-c++-common/Wbidi-chars-3.c | 11 +
115 gcc/testsuite/c-c++-common/Wbidi-chars-4.c | 188 +++++++++
116 gcc/testsuite/c-c++-common/Wbidi-chars-5.c | 188 +++++++++
117 gcc/testsuite/c-c++-common/Wbidi-chars-6.c | 155 ++++++++
118 gcc/testsuite/c-c++-common/Wbidi-chars-7.c | 9 +
119 gcc/testsuite/c-c++-common/Wbidi-chars-8.c | 13 +
120 gcc/testsuite/c-c++-common/Wbidi-chars-9.c | 29 ++
121 libcpp/include/cpplib.h | 18 +-
122 libcpp/init.c | 1 +
123 libcpp/internal.h | 7 +
124 libcpp/lex.c | 408 +++++++++++++++++++-
125 23 files changed, 1315 insertions(+), 7 deletions(-)
126 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-1.c
127 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-10.c
128 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-11.c
129 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-12.c
130 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-13.c
131 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-14.c
132 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-15.c
133 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-16.c
134 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-17.c
135 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-2.c
136 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-3.c
137 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-4.c
138 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-5.c
139 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-6.c
140 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-7.c
141 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-8.c
142 create mode 100644 gcc/testsuite/c-c++-common/Wbidi-chars-9.c
143
144diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
145index 8a4cd634f77..3976fc368db 100644
146--- a/gcc/c-family/c.opt
147+++ b/gcc/c-family/c.opt
148@@ -370,6 +370,30 @@ Wbad-function-cast
149 C ObjC Var(warn_bad_function_cast) Warning
150 Warn about casting functions to incompatible types.
151
152+Wbidi-chars
153+C ObjC C++ ObjC++ Warning Alias(Wbidi-chars=,any,none)
154+;
155+
156+Wbidi-chars=
157+C ObjC C++ ObjC++ RejectNegative Joined Warning CPP(cpp_warn_bidirectional) CppReason(CPP_W_BIDIRECTIONAL) Var(warn_bidirectional) Init(bidirectional_unpaired) Enum(cpp_bidirectional_level)
158+-Wbidi-chars=[none|unpaired|any] Warn about UTF-8 bidirectional control characters.
159+
160+; Required for these enum values.
161+SourceInclude
162+cpplib.h
163+
164+Enum
165+Name(cpp_bidirectional_level) Type(int) UnknownError(argument %qs to %<-Wbidi-chars%> not recognized)
166+
167+EnumValue
168+Enum(cpp_bidirectional_level) String(none) Value(bidirectional_none)
169+
170+EnumValue
171+Enum(cpp_bidirectional_level) String(unpaired) Value(bidirectional_unpaired)
172+
173+EnumValue
174+Enum(cpp_bidirectional_level) String(any) Value(bidirectional_any)
175+
176 Wbool-compare
177 C ObjC C++ ObjC++ Var(warn_bool_compare) Warning LangEnabledBy(C ObjC C++ ObjC++,Wall)
178 Warn about boolean expression compared with an integer value different from true/false.
179diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
180index 6070288856c..a22758d18ee 100644
181--- a/gcc/doc/invoke.texi
182+++ b/gcc/doc/invoke.texi
183@@ -326,7 +326,9 @@ Objective-C and Objective-C++ Dialects}.
184 -Warith-conversion @gol
185 -Warray-bounds -Warray-bounds=@var{n} @gol
186 -Wno-attributes -Wattribute-alias=@var{n} -Wno-attribute-alias @gol
187--Wno-attribute-warning -Wbool-compare -Wbool-operation @gol
188+-Wno-attribute-warning @gol
189+-Wbidi-chars=@r{[}none@r{|}unpaired@r{|}any@r{]} @gol
190+-Wbool-compare -Wbool-operation @gol
191 -Wno-builtin-declaration-mismatch @gol
192 -Wno-builtin-macro-redefined -Wc90-c99-compat -Wc99-c11-compat @gol
193 -Wc11-c2x-compat @gol
194@@ -7559,6 +7561,23 @@ Attributes considered include @code{allo
195 This is the default. You can disable these warnings with either
196 @option{-Wno-attribute-alias} or @option{-Wattribute-alias=0}.
197
198+@item -Wbidi-chars=@r{[}none@r{|}unpaired@r{|}any@r{]}
199+@opindex Wbidi-chars=
200+@opindex Wbidi-chars
201+@opindex Wno-bidi-chars
202+Warn about possibly misleading UTF-8 bidirectional control characters in
203+comments, string literals, character constants, and identifiers. Such
204+characters can change left-to-right writing direction into right-to-left
205+(and vice versa), which can cause confusion between the logical order and
206+visual order. This may be dangerous; for instance, it may seem that a piece
207+of code is not commented out, whereas it in fact is.
208+
209+There are three levels of warning supported by GCC@. The default is
210+@option{-Wbidi-chars=unpaired}, which warns about improperly terminated
211+bidi contexts. @option{-Wbidi-chars=none} turns the warning off.
212+@option{-Wbidi-chars=any} warns about any use of bidirectional control
213+characters.
214+
215 @item -Wbool-compare
216 @opindex Wno-bool-compare
217 @opindex Wbool-compare
218diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-10.c b/gcc/testsuite/c-c++-common/Wbidi-chars-10.c
219new file mode 100644
220index 00000000000..34f5ac19271
221--- /dev/null
222+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-10.c
223@@ -0,0 +1,27 @@
224+/* PR preprocessor/103026 */
225+/* { dg-do compile } */
226+/* { dg-options "-Wbidi-chars=unpaired" } */
227+/* More nesting testing. */
228+
229+/* RLEâ« LRI⦠PDF⬠PDIâ©*/
230+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
231+int LRE_\u202a_PDF_\u202c;
232+int LRE_\u202a_PDF_\u202c_LRE_\u202a_PDF_\u202c;
233+int LRE_\u202a_LRI_\u2066_PDF_\u202c_PDI_\u2069;
234+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
235+int RLE_\u202b_RLI_\u2067_PDF_\u202c_PDI_\u2069;
236+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
237+int RLE_\u202b_RLI_\u2067_PDI_\u2069_PDF_\u202c;
238+int FSI_\u2068_LRO_\u202d_PDI_\u2069_PDF_\u202c;
239+int FSI_\u2068;
240+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
241+int FSI_\u2068_PDI_\u2069;
242+int FSI_\u2068_FSI_\u2068_PDI_\u2069;
243+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
244+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
245+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
246+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
247+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDF_\u202c;
248+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
249+int RLI_\u2067_RLI_\u2067_RLI_\u2067_RLI_\u2067_FSI_\u2068_PDI_\u2069_PDI_\u2069_PDI_\u2069_PDI_\u2069;
250+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
251diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-11.c b/gcc/testsuite/c-c++-common/Wbidi-chars-11.c
252new file mode 100644
253index 00000000000..270ce2368a9
254--- /dev/null
255+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-11.c
256@@ -0,0 +1,13 @@
257+/* PR preprocessor/103026 */
258+/* { dg-do compile } */
259+/* { dg-options "-Wbidi-chars=unpaired" } */
260+/* Test that we warn when mixing UCN and UTF-8. */
261+
262+int LRE_âª_PDF_\u202c;
263+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
264+int LRE_\u202a_PDF_â¬_;
265+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
266+const char *s1 = "LRE_âª_PDF_\u202c";
267+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
268+const char *s2 = "LRE_\u202a_PDF_â¬";
269+/* { dg-warning "mismatch" "" { target *-*-* } .-1 } */
270diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-12.c b/gcc/testsuite/c-c++-common/Wbidi-chars-12.c
271new file mode 100644
272index 00000000000..b07eec1da91
273--- /dev/null
274+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-12.c
275@@ -0,0 +1,19 @@
276+/* PR preprocessor/103026 */
277+/* { dg-do compile { target { c || c++11 } } } */
278+/* { dg-options "-Wbidi-chars=any" } */
279+/* Test raw strings. */
280+
281+const char *s1 = R"(a b c LRE⪠1 2 3 PDF⬠x y z)";
282+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
283+const char *s2 = R"(a b c RLE⫠1 2 3 PDF⬠x y z)";
284+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
285+const char *s3 = R"(a b c LRO⭠1 2 3 PDF⬠x y z)";
286+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
287+const char *s4 = R"(a b c RLO⮠1 2 3 PDF⬠x y z)";
288+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
289+const char *s7 = R"(a b c FSI⨠1 2 3 PDI⩠x y) z";
290+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
291+const char *s8 = R"(a b c PDIâ© x y )z";
292+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
293+const char *s9 = R"(a b c PDF⬠x y z)";
294+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
295diff -uprN '-x*.orig' '-x*.rej' del/gcc-11.2.0/gcc/testsuite/c-c++-common/Wbidi-chars-13.c gcc-11.2.0/gcc/testsuite/c-c++-common/Wbidi-chars-13.c
296--- del/gcc-11.2.0/gcc/testsuite/c-c++-common/Wbidi-chars-13.c 1969-12-31 16:00:00.000000000 -0800
297+++ gcc-11.2.0/gcc/testsuite/c-c++-common/Wbidi-chars-13.c 2021-12-13 23:11:22.328439287 -0800
298@@ -0,0 +1,17 @@
299+/* PR preprocessor/103026 */
300+/* { dg-do compile { target { c || c++11 } } } */
301+/* { dg-options "-Wbidi-chars=unpaired" } */
302+/* Test raw strings. */
303+
304+const char *s1 = R"(a b c LRE⪠1 2 3)";
305+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
306+const char *s2 = R"(a b c RLEâ« 1 2 3)";
307+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
308+const char *s3 = R"(a b c LROâ­ 1 2 3)";
309+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
310+const char *s4 = R"(a b c FSI⨠1 2 3)";
311+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
312+const char *s5 = R"(a b c LRI⦠1 2 3)";
313+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
314+const char *s6 = R"(a b c RLI⧠1 2 3)";
315+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
316diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-14.c b/gcc/testsuite/c-c++-common/Wbidi-chars-14.c
317new file mode 100644
318index 00000000000..ba5f75d9553
319--- /dev/null
320+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-14.c
321@@ -0,0 +1,38 @@
322+/* PR preprocessor/103026 */
323+/* { dg-do compile } */
324+/* { dg-options "-Wbidi-chars=unpaired" } */
325+/* Test PDI handling, which also pops any subsequent LREs, RLEs, LROs,
326+ or RLOs. */
327+
328+/* LRI_â¦_LRI_â¦_RLE_â«_RLE_â«_RLE_â«_PDI_â©*/
329+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
330+// LRI_â¦_RLE_â«_RLE_â«_RLE_â«_PDI_â©
331+// LRI_â¦_RLO_â®_RLE_â«_RLE_â«_PDI_â©
332+// LRI_â¦_RLO_â®_RLE_â«_PDI_â©
333+// FSI_â¨_RLO_â®_PDI_â©
334+// FSI_â¨_FSI_â¨_RLO_â®_PDI_â©
335+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
336+
337+int LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069;
338+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
339+int LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069_PDI_\u2069;
340+int LRI_\u2066_LRI_\u2066_LRI_\u2066_LRE_\u202a_LRE_\u202a_LRE_\u202a_PDI_\u2069_PDI_\u2069;
341+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
342+int PDI_\u2069;
343+int LRI_\u2066_PDI_\u2069;
344+int RLI_\u2067_PDI_\u2069;
345+int LRE_\u202a_LRI_\u2066_PDI_\u2069;
346+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
347+int LRI_\u2066_LRE_\u202a_PDF_\u202c_PDI_\u2069;
348+int LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
349+int RLI_\u2067_LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
350+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
351+int FSI_\u2068_LRI_\u2066_LRE_\u202a_LRE_\u202a_PDF_\u202c_PDI_\u2069;
352+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
353+int RLO_\u202e_PDI_\u2069;
354+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
355+int RLI_\u2067_PDI_\u2069_RLI_\u2067;
356+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
357+int FSI_\u2068_PDF_\u202c_PDI_\u2069;
358+int FSI_\u2068_FSI_\u2068_PDF_\u202c_PDI_\u2069;
359+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
360diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-15.c b/gcc/testsuite/c-c++-common/Wbidi-chars-15.c
361new file mode 100644
362index 00000000000..a0ce8ff5e2c
363--- /dev/null
364+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-15.c
365@@ -0,0 +1,59 @@
366+/* PR preprocessor/103026 */
367+/* { dg-do compile } */
368+/* { dg-options "-Wbidi-chars=unpaired" } */
369+/* Test unpaired bidi control chars in multiline comments. */
370+
371+/*
372+ * LRE⪠end
373+ */
374+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
375+/*
376+ * RLEâ« end
377+ */
378+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
379+/*
380+ * LROâ­ end
381+ */
382+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
383+/*
384+ * RLOâ® end
385+ */
386+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
387+/*
388+ * LRI⦠end
389+ */
390+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
391+/*
392+ * RLI⧠end
393+ */
394+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
395+/*
396+ * FSI⨠end
397+ */
398+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
399+/* LREâª
400+ PDF⬠*/
401+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
402+/* FSIâ¨
403+ PDIâ© */
404+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
405+
406+/* LRE<âª>
407+ *
408+ */
409+/* { dg-warning "unpaired" "" { target *-*-* } .-3 } */
410+
411+/*
412+ * LRE<âª>
413+ */
414+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
415+
416+/*
417+ *
418+ * LRE<âª> */
419+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
420+
421+/* RLI<â§> */ /* PDI<â©> */
422+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
423+/* LRE<âª> */ /* PDF<â¬> */
424+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
425diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-16.c b/gcc/testsuite/c-c++-common/Wbidi-chars-16.c
426new file mode 100644
427index 00000000000..baa0159861c
428--- /dev/null
429+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-16.c
430@@ -0,0 +1,26 @@
431+/* PR preprocessor/103026 */
432+/* { dg-do compile } */
433+/* { dg-options "-Wbidi-chars=any" } */
434+/* Test LTR/RTL chars. */
435+
436+/* LTR<â> */
437+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
438+// LTR<â>
439+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
440+/* RTL<â> */
441+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
442+// RTL<â>
443+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
444+
445+const char *s1 = "LTR<â>";
446+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
447+const char *s2 = "LTR\u200e";
448+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
449+const char *s3 = "LTR\u200E";
450+/* { dg-warning "U\\+200E" "" { target *-*-* } .-1 } */
451+const char *s4 = "RTL<â>";
452+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
453+const char *s5 = "RTL\u200f";
454+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
455+const char *s6 = "RTL\u200F";
456+/* { dg-warning "U\\+200F" "" { target *-*-* } .-1 } */
457diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-17.c b/gcc/testsuite/c-c++-common/Wbidi-chars-17.c
458new file mode 100644
459index 00000000000..07cb4321f96
460--- /dev/null
461+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-17.c
462@@ -0,0 +1,30 @@
463+/* PR preprocessor/103026 */
464+/* { dg-do compile } */
465+/* { dg-options "-Wbidi-chars=unpaired" } */
466+/* Test LTR/RTL chars. */
467+
468+/* LTR<â> */
469+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
470+// LTR<â>
471+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
472+/* RTL<â> */
473+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
474+// RTL<â>
475+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
476+int ltr_\u200e;
477+/* { dg-error "universal character " "" { target *-*-* } .-1 } */
478+int rtl_\u200f;
479+/* { dg-error "universal character " "" { target *-*-* } .-1 } */
480+
481+const char *s1 = "LTR<â>";
482+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
483+const char *s2 = "LTR\u200e";
484+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
485+const char *s3 = "LTR\u200E";
486+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
487+const char *s4 = "RTL<â>";
488+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
489+const char *s5 = "RTL\u200f";
490+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
491+const char *s6 = "RTL\u200F";
492+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
493diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-1.c b/gcc/testsuite/c-c++-common/Wbidi-chars-1.c
494new file mode 100644
495index 00000000000..2340374f276
496--- /dev/null
497+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-1.c
498@@ -0,0 +1,12 @@
499+/* PR preprocessor/103026 */
500+/* { dg-do compile } */
501+
502+int main() {
503+ int isAdmin = 0;
504+ /*â® } â¦if (isAdmin)⩠⦠begin admins only */
505+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
506+ __builtin_printf("You are an admin.\n");
507+ /* end admins only â® { â¦*/
508+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
509+ return 0;
510+}
511diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-2.c b/gcc/testsuite/c-c++-common/Wbidi-chars-2.c
512new file mode 100644
513index 00000000000..2340374f276
514--- /dev/null
515+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-2.c
516@@ -0,0 +1,9 @@
517+/* PR preprocessor/103026 */
518+/* { dg-do compile } */
519+
520+int main() {
521+ /* Say hello; newlineâ§/*/ return 0 ;
522+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
523+ __builtin_printf("Hello world.\n");
524+ return 0;
525+}
526diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-3.c b/gcc/testsuite/c-c++-common/Wbidi-chars-3.c
527new file mode 100644
528index 00000000000..9dc7edb6e64
529--- /dev/null
530+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-3.c
531@@ -0,0 +1,11 @@
532+/* PR preprocessor/103026 */
533+/* { dg-do compile } */
534+
535+int main() {
536+ const char* access_level = "user";
537+ if (__builtin_strcmp(access_level, "userâ® â¦// Check if adminâ© â¦")) {
538+/* { dg-warning "bidirectional" "" { target *-*-* } .-1 } */
539+ __builtin_printf("You are an admin.\n");
540+ }
541+ return 0;
542+}
543diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-4.c b/gcc/testsuite/c-c++-common/Wbidi-chars-4.c
544new file mode 100644
545index 00000000000..639e5c62e88
546--- /dev/null
547+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-4.c
548@@ -0,0 +1,188 @@
549+/* PR preprocessor/103026 */
550+/* { dg-do compile } */
551+/* { dg-options "-Wbidi-chars=any -Wno-multichar -Wno-overflow" } */
552+/* Test all bidi chars in various contexts (identifiers, comments,
553+ string literals, character constants), both UCN and UTF-8. The bidi
554+ chars here are properly terminated, except for the character constants. */
555+
556+/* a b c LRE⪠1 2 3 PDF⬠x y z */
557+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
558+/* a b c RLE⫠1 2 3 PDF⬠x y z */
559+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
560+/* a b c LRO⭠1 2 3 PDF⬠x y z */
561+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
562+/* a b c RLO⮠1 2 3 PDF⬠x y z */
563+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
564+/* a b c LRI⦠1 2 3 PDI⩠x y z */
565+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
566+/* a b c RLI⧠1 2 3 PDI⩠x y */
567+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
568+/* a b c FSI⨠1 2 3 PDI⩠x y z */
569+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
570+
571+/* Same but C++ comments instead. */
572+// a b c LRE⪠1 2 3 PDF⬠x y z
573+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
574+// a b c RLE⫠1 2 3 PDF⬠x y z
575+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
576+// a b c LRO⭠1 2 3 PDF⬠x y z
577+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
578+// a b c RLO⮠1 2 3 PDF⬠x y z
579+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
580+// a b c LRI⦠1 2 3 PDI⩠x y z
581+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
582+// a b c RLI⧠1 2 3 PDI⩠x y
583+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
584+// a b c FSI⨠1 2 3 PDI⩠x y z
585+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
586+
587+/* Here we're closing an unopened context, warn when =any. */
588+/* a b c PDIâ© x y z */
589+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
590+/* a b c PDF⬠x y z */
591+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
592+// a b c PDIâ© x y z
593+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
594+// a b c PDF⬠x y z
595+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
596+
597+/* Multiline comments. */
598+/* a b c PDIâ© x y z
599+ */
600+/* { dg-warning "U\\+2069" "" { target *-*-* } .-2 } */
601+/* a b c PDF⬠x y z
602+ */
603+/* { dg-warning "U\\+202C" "" { target *-*-* } .-2 } */
604+/* first
605+ a b c PDIâ© x y z
606+ */
607+/* { dg-warning "U\\+2069" "" { target *-*-* } .-2 } */
608+/* first
609+ a b c PDF⬠x y z
610+ */
611+/* { dg-warning "U\\+202C" "" { target *-*-* } .-2 } */
612+/* first
613+ a b c PDIâ© x y z */
614+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
615+/* first
616+ a b c PDF⬠x y z */
617+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
618+
619+void
620+g1 ()
621+{
622+ const char *s1 = "a b c LRE⪠1 2 3 PDF⬠x y z";
623+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
624+ const char *s2 = "a b c RLE⫠1 2 3 PDF⬠x y z";
625+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
626+ const char *s3 = "a b c LRO⭠1 2 3 PDF⬠x y z";
627+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
628+ const char *s4 = "a b c RLO⮠1 2 3 PDF⬠x y z";
629+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
630+ const char *s5 = "a b c LRI⦠1 2 3 PDI⩠x y z";
631+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
632+ const char *s6 = "a b c RLI⧠1 2 3 PDI⩠x y z";
633+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
634+ const char *s7 = "a b c FSI⨠1 2 3 PDI⩠x y z";
635+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
636+ const char *s8 = "a b c PDIâ© x y z";
637+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
638+ const char *s9 = "a b c PDF⬠x y z";
639+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
640+
641+ const char *s10 = "a b c LRE\u202a 1 2 3 PDF\u202c x y z";
642+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
643+ const char *s11 = "a b c LRE\u202A 1 2 3 PDF\u202c x y z";
644+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
645+ const char *s12 = "a b c RLE\u202b 1 2 3 PDF\u202c x y z";
646+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
647+ const char *s13 = "a b c RLE\u202B 1 2 3 PDF\u202c x y z";
648+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
649+ const char *s14 = "a b c LRO\u202d 1 2 3 PDF\u202c x y z";
650+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
651+ const char *s15 = "a b c LRO\u202D 1 2 3 PDF\u202c x y z";
652+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
653+ const char *s16 = "a b c RLO\u202e 1 2 3 PDF\u202c x y z";
654+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
655+ const char *s17 = "a b c RLO\u202E 1 2 3 PDF\u202c x y z";
656+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
657+ const char *s18 = "a b c LRI\u2066 1 2 3 PDI\u2069 x y z";
658+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
659+ const char *s19 = "a b c RLI\u2067 1 2 3 PDI\u2069 x y z";
660+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
661+ const char *s20 = "a b c FSI\u2068 1 2 3 PDI\u2069 x y z";
662+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
663+}
664+
665+void
666+g2 ()
667+{
668+ const char c1 = '\u202a';
669+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
670+ const char c2 = '\u202A';
671+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
672+ const char c3 = '\u202b';
673+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
674+ const char c4 = '\u202B';
675+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
676+ const char c5 = '\u202d';
677+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
678+ const char c6 = '\u202D';
679+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
680+ const char c7 = '\u202e';
681+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
682+ const char c8 = '\u202E';
683+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
684+ const char c9 = '\u2066';
685+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
686+ const char c10 = '\u2067';
687+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
688+ const char c11 = '\u2068';
689+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
690+}
691+
692+int aâªbâ¬c;
693+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
694+int aâ«bâ¬c;
695+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
696+int aâ­bâ¬c;
697+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
698+int aâ®bâ¬c;
699+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
700+int aâ¦bâ©c;
701+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
702+int aâ§bâ©c;
703+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
704+int aâ¨bâ©c;
705+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
706+int Aâ¬X;
707+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
708+int A\u202cY;
709+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
710+int A\u202CY2;
711+/* { dg-warning "U\\+202C" "" { target *-*-* } .-1 } */
712+
713+int d\u202ae\u202cf;
714+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
715+int d\u202Ae\u202cf2;
716+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
717+int d\u202be\u202cf;
718+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
719+int d\u202Be\u202cf2;
720+/* { dg-warning "U\\+202B" "" { target *-*-* } .-1 } */
721+int d\u202de\u202cf;
722+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
723+int d\u202De\u202cf2;
724+/* { dg-warning "U\\+202D" "" { target *-*-* } .-1 } */
725+int d\u202ee\u202cf;
726+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
727+int d\u202Ee\u202cf2;
728+/* { dg-warning "U\\+202E" "" { target *-*-* } .-1 } */
729+int d\u2066e\u2069f;
730+/* { dg-warning "U\\+2066" "" { target *-*-* } .-1 } */
731+int d\u2067e\u2069f;
732+/* { dg-warning "U\\+2067" "" { target *-*-* } .-1 } */
733+int d\u2068e\u2069f;
734+/* { dg-warning "U\\+2068" "" { target *-*-* } .-1 } */
735+int X\u2069;
736+/* { dg-warning "U\\+2069" "" { target *-*-* } .-1 } */
737diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-5.c b/gcc/testsuite/c-c++-common/Wbidi-chars-5.c
738new file mode 100644
739index 00000000000..68cb053144b
740--- /dev/null
741+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-5.c
742@@ -0,0 +1,188 @@
743+/* PR preprocessor/103026 */
744+/* { dg-do compile } */
745+/* { dg-options "-Wbidi-chars=unpaired -Wno-multichar -Wno-overflow" } */
746+/* Test all bidi chars in various contexts (identifiers, comments,
747+ string literals, character constants), both UCN and UTF-8. The bidi
748+ chars here are properly terminated, except for the character constants. */
749+
750+/* a b c LRE⪠1 2 3 PDF⬠x y z */
751+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
752+/* a b c RLE⫠1 2 3 PDF⬠x y z */
753+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
754+/* a b c LRO⭠1 2 3 PDF⬠x y z */
755+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
756+/* a b c RLO⮠1 2 3 PDF⬠x y z */
757+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
758+/* a b c LRI⦠1 2 3 PDI⩠x y z */
759+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
760+/* a b c RLI⧠1 2 3 PDI⩠x y */
761+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
762+/* a b c FSI⨠1 2 3 PDI⩠x y z */
763+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
764+
765+/* Same but C++ comments instead. */
766+// a b c LRE⪠1 2 3 PDF⬠x y z
767+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
768+// a b c RLE⫠1 2 3 PDF⬠x y z
769+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
770+// a b c LRO⭠1 2 3 PDF⬠x y z
771+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
772+// a b c RLO⮠1 2 3 PDF⬠x y z
773+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
774+// a b c LRI⦠1 2 3 PDI⩠x y z
775+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
776+// a b c RLI⧠1 2 3 PDI⩠x y
777+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
778+// a b c FSI⨠1 2 3 PDI⩠x y z
779+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
780+
781+/* Here we're closing an unopened context, warn when =any. */
782+/* a b c PDIâ© x y z */
783+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
784+/* a b c PDF⬠x y z */
785+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
786+// a b c PDIâ© x y z
787+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
788+// a b c PDF⬠x y z
789+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
790+
791+/* Multiline comments. */
792+/* a b c PDIâ© x y z
793+ */
794+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
795+/* a b c PDF⬠x y z
796+ */
797+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
798+/* first
799+ a b c PDIâ© x y z
800+ */
801+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
802+/* first
803+ a b c PDF⬠x y z
804+ */
805+/* { dg-bogus "unpaired" "" { target *-*-* } .-2 } */
806+/* first
807+ a b c PDIâ© x y z */
808+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
809+/* first
810+ a b c PDF⬠x y z */
811+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
812+
813+void
814+g1 ()
815+{
816+ const char *s1 = "a b c LRE⪠1 2 3 PDF⬠x y z";
817+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
818+ const char *s2 = "a b c RLE⫠1 2 3 PDF⬠x y z";
819+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
820+ const char *s3 = "a b c LRO⭠1 2 3 PDF⬠x y z";
821+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
822+ const char *s4 = "a b c RLO⮠1 2 3 PDF⬠x y z";
823+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
824+ const char *s5 = "a b c LRI⦠1 2 3 PDI⩠x y z";
825+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
826+ const char *s6 = "a b c RLI⧠1 2 3 PDI⩠x y z";
827+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
828+ const char *s7 = "a b c FSI⨠1 2 3 PDI⩠x y z";
829+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
830+ const char *s8 = "a b c PDIâ© x y z";
831+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
832+ const char *s9 = "a b c PDF⬠x y z";
833+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
834+
835+ const char *s10 = "a b c LRE\u202a 1 2 3 PDF\u202c x y z";
836+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
837+ const char *s11 = "a b c LRE\u202A 1 2 3 PDF\u202c x y z";
838+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
839+ const char *s12 = "a b c RLE\u202b 1 2 3 PDF\u202c x y z";
840+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
841+ const char *s13 = "a b c RLE\u202B 1 2 3 PDF\u202c x y z";
842+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
843+ const char *s14 = "a b c LRO\u202d 1 2 3 PDF\u202c x y z";
844+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
845+ const char *s15 = "a b c LRO\u202D 1 2 3 PDF\u202c x y z";
846+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
847+ const char *s16 = "a b c RLO\u202e 1 2 3 PDF\u202c x y z";
848+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
849+ const char *s17 = "a b c RLO\u202E 1 2 3 PDF\u202c x y z";
850+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
851+ const char *s18 = "a b c LRI\u2066 1 2 3 PDI\u2069 x y z";
852+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
853+ const char *s19 = "a b c RLI\u2067 1 2 3 PDI\u2069 x y z";
854+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
855+ const char *s20 = "a b c FSI\u2068 1 2 3 PDI\u2069 x y z";
856+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
857+}
858+
859+void
860+g2 ()
861+{
862+ const char c1 = '\u202a';
863+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
864+ const char c2 = '\u202A';
865+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
866+ const char c3 = '\u202b';
867+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
868+ const char c4 = '\u202B';
869+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
870+ const char c5 = '\u202d';
871+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
872+ const char c6 = '\u202D';
873+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
874+ const char c7 = '\u202e';
875+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
876+ const char c8 = '\u202E';
877+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
878+ const char c9 = '\u2066';
879+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
880+ const char c10 = '\u2067';
881+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
882+ const char c11 = '\u2068';
883+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
884+}
885+
886+int aâªbâ¬c;
887+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
888+int aâ«bâ¬c;
889+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
890+int aâ­bâ¬c;
891+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
892+int aâ®bâ¬c;
893+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
894+int aâ¦bâ©c;
895+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
896+int aâ§bâ©c;
897+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
898+int aâ¨bâ©c;
899+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
900+int Aâ¬X;
901+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
902+int A\u202cY;
903+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
904+int A\u202CY2;
905+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
906+
907+int d\u202ae\u202cf;
908+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
909+int d\u202Ae\u202cf2;
910+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
911+int d\u202be\u202cf;
912+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
913+int d\u202Be\u202cf2;
914+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
915+int d\u202de\u202cf;
916+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
917+int d\u202De\u202cf2;
918+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
919+int d\u202ee\u202cf;
920+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
921+int d\u202Ee\u202cf2;
922+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
923+int d\u2066e\u2069f;
924+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
925+int d\u2067e\u2069f;
926+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
927+int d\u2068e\u2069f;
928+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
929+int X\u2069;
930+/* { dg-bogus "unpaired" "" { target *-*-* } .-1 } */
931diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-6.c b/gcc/testsuite/c-c++-common/Wbidi-chars-6.c
932new file mode 100644
933index 00000000000..0ce6fff2dee
934--- /dev/null
935+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-6.c
936@@ -0,0 +1,155 @@
937+/* PR preprocessor/103026 */
938+/* { dg-do compile } */
939+/* { dg-options "-Wbidi-chars=unpaired" } */
940+/* Test nesting of bidi chars in various contexts. */
941+
942+/* Terminated by the wrong char: */
943+/* a b c LRE⪠1 2 3 PDI⩠x y z */
944+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
945+/* a b c RLEâ« 1 2 3 PDIâ© x y z*/
946+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
947+/* a b c LROâ­ 1 2 3 PDIâ© x y z */
948+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
949+/* a b c RLOâ® 1 2 3 PDIâ© x y z */
950+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
951+/* a b c LRI⦠1 2 3 PDF⬠x y z */
952+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
953+/* a b c RLI⧠1 2 3 PDF⬠x y z */
954+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
955+/* a b c FSI⨠1 2 3 PDF⬠x y z*/
956+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
957+
958+/* LRE⪠PDF⬠*/
959+/* LRE⪠LRE⪠PDF⬠PDF⬠*/
960+/* PDF⬠LRE⪠PDF⬠*/
961+/* LRE⪠PDF⬠LRE⪠PDF⬠*/
962+/* LRE⪠LRE⪠PDF⬠*/
963+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
964+/* PDF⬠LRE⪠*/
965+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
966+
967+// a b c LRE⪠1 2 3 PDI⩠x y z
968+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
969+// a b c RLEâ« 1 2 3 PDIâ© x y z*/
970+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
971+// a b c LROâ­ 1 2 3 PDIâ© x y z
972+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
973+// a b c RLOâ® 1 2 3 PDIâ© x y z
974+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
975+// a b c LRI⦠1 2 3 PDF⬠x y z
976+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
977+// a b c RLI⧠1 2 3 PDF⬠x y z
978+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
979+// a b c FSI⨠1 2 3 PDF⬠x y z
980+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
981+
982+// LRE⪠PDFâ¬
983+// LRE⪠LRE⪠PDF⬠PDFâ¬
984+// PDF⬠LRE⪠PDFâ¬
985+// LRE⪠PDF⬠LRE⪠PDFâ¬
986+// LRE⪠LRE⪠PDFâ¬
987+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
988+// PDF⬠LREâª
989+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
990+
991+void
992+g1 ()
993+{
994+ const char *s1 = "a b c LRE⪠1 2 3 PDI⩠x y z";
995+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
996+ const char *s2 = "a b c LRE\u202a 1 2 3 PDI\u2069 x y z";
997+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
998+ const char *s3 = "a b c RLEâ« 1 2 3 PDIâ© x y ";
999+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1000+ const char *s4 = "a b c RLE\u202b 1 2 3 PDI\u2069 x y z";
1001+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1002+ const char *s5 = "a b c LROâ­ 1 2 3 PDIâ© x y z";
1003+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1004+ const char *s6 = "a b c LRO\u202d 1 2 3 PDI\u2069 x y z";
1005+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1006+ const char *s7 = "a b c RLOâ® 1 2 3 PDIâ© x y z";
1007+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1008+ const char *s8 = "a b c RLO\u202e 1 2 3 PDI\u2069 x y z";
1009+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1010+ const char *s9 = "a b c LRI⦠1 2 3 PDF⬠x y z";
1011+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1012+ const char *s10 = "a b c LRI\u2066 1 2 3 PDF\u202c x y z";
1013+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1014+ const char *s11 = "a b c RLI⧠1 2 3 PDF⬠x y z\
1015+ ";
1016+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
1017+ const char *s12 = "a b c RLI\u2067 1 2 3 PDF\u202c x y z";
1018+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1019+ const char *s13 = "a b c FSI⨠1 2 3 PDF⬠x y z";
1020+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1021+ const char *s14 = "a b c FSI\u2068 1 2 3 PDF\u202c x y z";
1022+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1023+ const char *s15 = "PDF⬠LREâª";
1024+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1025+ const char *s16 = "PDF\u202c LRE\u202a";
1026+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1027+ const char *s17 = "LRE⪠PDFâ¬";
1028+ const char *s18 = "LRE\u202a PDF\u202c";
1029+ const char *s19 = "LRE⪠LRE⪠PDF⬠PDFâ¬";
1030+ const char *s20 = "LRE\u202a LRE\u202a PDF\u202c PDF\u202c";
1031+ const char *s21 = "PDF⬠LRE⪠PDFâ¬";
1032+ const char *s22 = "PDF\u202c LRE\u202a PDF\u202c";
1033+ const char *s23 = "LRE⪠LRE⪠PDFâ¬";
1034+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1035+ const char *s24 = "LRE\u202a LRE\u202a PDF\u202c";
1036+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1037+ const char *s25 = "PDF⬠LREâª";
1038+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1039+ const char *s26 = "PDF\u202c LRE\u202a";
1040+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1041+ const char *s27 = "PDF⬠LRE\u202a";
1042+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1043+ const char *s28 = "PDF\u202c LREâª";
1044+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1045+}
1046+
1047+int aLREâªbPDIâ©;
1048+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1049+int A\u202aB\u2069C;
1050+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1051+int aRLEâ«bPDIâ©;
1052+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1053+int a\u202bB\u2069c;
1054+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1055+int aLROâ­bPDIâ©;
1056+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1057+int a\u202db\u2069c2;
1058+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1059+int aRLOâ®bPDIâ©;
1060+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1061+int a\u202eb\u2069;
1062+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1063+int aLRIâ¦bPDFâ¬;
1064+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1065+int a\u2066b\u202c;
1066+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1067+int aRLIâ§bPDFâ¬c
1068+;
1069+/* { dg-warning "unpaired" "" { target *-*-* } .-2 } */
1070+int a\u2067b\u202c;
1071+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1072+int aFSIâ¨bPDFâ¬;
1073+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1074+int a\u2068b\u202c;
1075+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1076+int aFSIâ¨bPD\u202C;
1077+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1078+int aFSI\u2068bPDFâ¬_;
1079+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1080+int aLREâªbPDFâ¬b;
1081+int A\u202aB\u202c;
1082+int a_LREâª_LREâª_b_PDFâ¬_PDFâ¬;
1083+int A\u202aA\u202aB\u202cB\u202c;
1084+int aPDFâ¬bLREadPDFâ¬;
1085+int a_\u202C_\u202a_\u202c;
1086+int a_LREâª_b_PDFâ¬_c_LREâª_PDFâ¬;
1087+int a_\u202a_\u202c_\u202a_\u202c_;
1088+int a_LREâª_b_PDFâ¬_c_LREâª;
1089+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1090+int a_\u202a_\u202c_\u202a_;
1091+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1092diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-7.c b/gcc/testsuite/c-c++-common/Wbidi-chars-7.c
1093new file mode 100644
1094index 00000000000..d012d420ec0
1095--- /dev/null
1096+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-7.c
1097@@ -0,0 +1,9 @@
1098+/* PR preprocessor/103026 */
1099+/* { dg-do compile } */
1100+/* { dg-options "-Wbidi-chars=any" } */
1101+/* Test we ignore UCNs in comments. */
1102+
1103+// a b c \u202a 1 2 3
1104+// a b c \u202A 1 2 3
1105+/* a b c \u202a 1 2 3 */
1106+/* a b c \u202A 1 2 3 */
1107diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-8.c b/gcc/testsuite/c-c++-common/Wbidi-chars-8.c
1108new file mode 100644
1109index 00000000000..4f54c5092ec
1110--- /dev/null
1111+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-8.c
1112@@ -0,0 +1,13 @@
1113+/* PR preprocessor/103026 */
1114+/* { dg-do compile } */
1115+/* { dg-options "-Wbidi-chars=any" } */
1116+/* Test \u vs \U. */
1117+
1118+int a_\u202A;
1119+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1120+int a_\u202a_2;
1121+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1122+int a_\U0000202A_3;
1123+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1124+int a_\U0000202a_4;
1125+/* { dg-warning "U\\+202A" "" { target *-*-* } .-1 } */
1126diff --git a/gcc/testsuite/c-c++-common/Wbidi-chars-9.c b/gcc/testsuite/c-c++-common/Wbidi-chars-9.c
1127new file mode 100644
1128index 00000000000..e2af1b1ca97
1129--- /dev/null
1130+++ b/gcc/testsuite/c-c++-common/Wbidi-chars-9.c
1131@@ -0,0 +1,29 @@
1132+/* PR preprocessor/103026 */
1133+/* { dg-do compile } */
1134+/* { dg-options "-Wbidi-chars=unpaired" } */
1135+/* Test that we properly separate bidi contexts (comment/identifier/character
1136+ constant/string literal). */
1137+
1138+/* LRE ->âª<- */ int pdf_\u202c_1;
1139+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1140+/* RLE ->â«<- */ int pdf_\u202c_2;
1141+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1142+/* LRO ->â­<- */ int pdf_\u202c_3;
1143+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1144+/* RLO ->â®<- */ int pdf_\u202c_4;
1145+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1146+/* LRI ->â¦<-*/ int pdi_\u2069_1;
1147+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1148+/* RLI ->â§<- */ int pdi_\u2069_12;
1149+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1150+/* FSI ->â¨<- */ int pdi_\u2069_3;
1151+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1152+
1153+const char *s1 = "LRE\u202a"; /* PDF ->â¬<- */
1154+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1155+/* LRE ->âª<- */ const char *s2 = "PDF\u202c";
1156+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1157+const char *s3 = "LRE\u202a"; int pdf_\u202c_5;
1158+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1159+int lre_\u202a; const char *s4 = "PDF\u202c";
1160+/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */
1161diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
1162index 176f8c5bbce..112b9c24751 100644
1163--- a/libcpp/include/cpplib.h
1164+++ b/libcpp/include/cpplib.h
1165@@ -318,6 +318,17 @@ enum cpp_main_search
1166 CMS_system, /* Search the system INCLUDE path. */
1167 };
1168
1169+/* The possible bidirectional control characters checking levels, from the
1170+ least strict (no checking) to the most strict (any use is detected). */
1171+enum cpp_bidirectional_level {
1172+ /* No checking. */
1173+ bidirectional_none,
1174+ /* Only detect unpaired uses of bidirectional control characters. */
1175+ bidirectional_unpaired,
1176+ /* Detect any use of bidirectional control characters. */
1177+ bidirectional_any
1178+};
1179+
1180 /* This structure is nested inside struct cpp_reader, and
1181 carries all the options visible to the command line. */
1182 struct cpp_options
1183@@ -531,6 +542,10 @@ struct cpp_options
1184 /* True if warn about differences between C++98 and C++11. */
1185 bool cpp_warn_cxx11_compat;
1186
1187+ /* Nonzero if bidirectional control characters checking is on. See enum
1188+ cpp_bidirectional_level. */
1189+ unsigned char cpp_warn_bidirectional;
1190+
1191 /* Dependency generation. */
1192 struct
1193 {
1194@@ -635,7 +650,8 @@ enum cpp_warning_reason {
1195 CPP_W_C90_C99_COMPAT,
1196 CPP_W_C11_C2X_COMPAT,
1197 CPP_W_CXX11_COMPAT,
1198- CPP_W_EXPANSION_TO_DEFINED
1199+ CPP_W_EXPANSION_TO_DEFINED,
1200+ CPP_W_BIDIRECTIONAL
1201 };
1202
1203 /* Callback for header lookup for HEADER, which is the name of a
1204diff --git a/libcpp/init.c b/libcpp/init.c
1205index 5a424e23553..f9a8f5f088f 100644
1206--- a/libcpp/init.c
1207+++ b/libcpp/init.c
1208@@ -219,6 +219,7 @@ cpp_create_reader (enum c_lang lang, cpp
1209 = ENABLE_CANONICAL_SYSTEM_HEADERS;
1210 CPP_OPTION (pfile, ext_numeric_literals) = 1;
1211 CPP_OPTION (pfile, warn_date_time) = 0;
1212+ CPP_OPTION (pfile, cpp_warn_bidirectional) = bidirectional_unpaired;
1213
1214 /* Default CPP arithmetic to something sensible for the host for the
1215 benefit of dumb users like fix-header. */
1216diff --git a/libcpp/internal.h b/libcpp/internal.h
1217index 8577cab6c83..0ce0246c5a2 100644
1218--- a/libcpp/internal.h
1219+++ b/libcpp/internal.h
1220@@ -597,6 +597,13 @@ struct cpp_reader
1221 /* Location identifying the main source file -- intended to be line
1222 zero of said file. */
1223 location_t main_loc;
1224+
1225+ /* Returns true iff bidirectional control character checking is enabled,
1226+ i.e. the cpp_warn_bidirectional option is not bidirectional_none. */
1227+ bool warn_bidi_p () const
1228+ {
1229+ return CPP_OPTION (this, cpp_warn_bidirectional) != bidirectional_none;
1230+ }
1231 };
1232
1233 /* Character classes. Based on the more primitive macros in safe-ctype.h.
1234diff --git a/libcpp/lex.c b/libcpp/lex.c
1235index fa2253d41c3..6a4fbce6030 100644
1236--- a/libcpp/lex.c
1237+++ b/libcpp/lex.c
1238@@ -1164,6 +1164,324 @@ _cpp_process_line_notes (cpp_reader *pfi
1239 }
1240 }
1241
1242+namespace bidi {
1243+ enum class kind {
1244+ NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1245+ };
1246+
1247+ /* All the UTF-8 encodings of bidi characters start with E2. */
1248+ constexpr uchar utf8_start = 0xe2;
1249+
1250+ /* A vector holding currently open bidi contexts. We use a char for
1251+ each context: bit 0 is 1 if the context is closed by PDF, 0 if it is
1252+ closed by PDI. Bit 1 is 1 if this context was opened by a bidi
1253+ character written as a UCN, and 0 when it was written as UTF-8. */
1254+ semi_embedded_vec <unsigned char, 16> vec;
1255+
1256+ /* Close the whole comment/identifier/string literal/character constant
1257+ context by discarding every bidi context that is still open. */
1258+ void on_close ()
1259+ {
1260+ vec.truncate (0);
1261+ }
1262+
1263+ /* Pop the last element in the vector, which must not be empty. */
1264+ void pop ()
1265+ {
1266+ unsigned int len = vec.count ();
1267+ gcc_checking_assert (len > 0);
1268+ vec.truncate (len - 1);
1269+ }
1270+
1271+ /* Return which closing character (PDF or PDI) the Ith element expects. */
1272+ kind ctx_at (unsigned int i)
1273+ {
1274+ return (vec[i] & 1) ? kind::PDF : kind::PDI;
1275+ }
1276+
1277+ /* Return which context is currently opened, or NONE if there is none. */
1278+ kind current_ctx ()
1279+ {
1280+ unsigned int len = vec.count ();
1281+ if (len == 0)
1282+ return kind::NONE;
1283+ return ctx_at (len - 1);
1284+ }
1285+
1286+ /* Return true if the current context comes from a UCN origin, that is,
1287+ the bidi char which started it was written as a UCN (bit 1 is set). */
1288+ bool current_ctx_ucn_p ()
1289+ {
1290+ unsigned int len = vec.count ();
1291+ gcc_checking_assert (len > 0);
1292+ return (vec[len - 1] >> 1) & 1;
1293+ }
1294+
1295+ /* We've read the bidi char K (a UCN iff UCN_P), update the vector. */
1296+ void on_char (kind k, bool ucn_p)
1297+ {
1298+ switch (k)
1299+ {
1300+ case kind::LRE:
1301+ case kind::RLE:
1302+ case kind::LRO:
1303+ case kind::RLO:
1304+ vec.push (ucn_p ? 3u : 1u); /* Bit 0 set: closed by PDF. */
1305+ break;
1306+ case kind::LRI:
1307+ case kind::RLI:
1308+ case kind::FSI:
1309+ vec.push (ucn_p ? 2u : 0u); /* Bit 0 clear: closed by PDI. */
1310+ break;
1311+ /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1312+ whose scope has not yet been terminated. It is ignored unless the
innermost open context is such a scope. */
1313+ case kind::PDF:
1314+ if (current_ctx () == kind::PDF)
1315+ pop ();
1316+ break;
1317+ /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1318+ scope has not yet been terminated, as well as the scopes of
1319+ any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1320+ yet been terminated. */
1321+ case kind::PDI:
1322+ for (int i = vec.count () - 1; i >= 0; --i)
1323+ if (ctx_at (i) == kind::PDI)
1324+ {
1325+ vec.truncate (i);
1326+ break;
1327+ }
1328+ break;
1329+ case kind::LTR:
1330+ case kind::RTL:
1331+ /* These aren't popped by a PDF/PDI. */
1332+ break;
1333+ [[likely]] case kind::NONE:
1334+ break;
1335+ default:
1336+ abort ();
1337+ }
1338+ }
1339+
1340+ /* Return a descriptive string for K. */
1341+ const char *to_str (kind k)
1342+ {
1343+ switch (k)
1344+ {
1345+ case kind::LRE:
1346+ return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1347+ case kind::RLE:
1348+ return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1349+ case kind::LRO:
1350+ return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1351+ case kind::RLO:
1352+ return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1353+ case kind::LRI:
1354+ return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1355+ case kind::RLI:
1356+ return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1357+ case kind::FSI:
1358+ return "U+2068 (FIRST STRONG ISOLATE)";
1359+ case kind::PDF:
1360+ return "U+202C (POP DIRECTIONAL FORMATTING)";
1361+ case kind::PDI:
1362+ return "U+2069 (POP DIRECTIONAL ISOLATE)";
1363+ case kind::LTR:
1364+ return "U+200E (LEFT-TO-RIGHT MARK)";
1365+ case kind::RTL:
1366+ return "U+200F (RIGHT-TO-LEFT MARK)";
1367+ default:
1368+ abort ();
1369+ }
1370+ }
1371+}
1372+
1373+/* Parse a sequence of 3 bytes starting with P and return its bidi kind. */
1374+
1375+static bidi::kind
1376+get_bidi_utf8 (const unsigned char *const p)
1377+{
1378+ gcc_checking_assert (p[0] == bidi::utf8_start);
1379+
1380+ if (p[1] == 0x80) /* E2 80 xx encodes U+2000..U+203F. */
1381+ switch (p[2])
1382+ {
1383+ case 0xaa:
1384+ return bidi::kind::LRE;
1385+ case 0xab:
1386+ return bidi::kind::RLE;
1387+ case 0xac:
1388+ return bidi::kind::PDF;
1389+ case 0xad:
1390+ return bidi::kind::LRO;
1391+ case 0xae:
1392+ return bidi::kind::RLO;
1393+ case 0x8e:
1394+ return bidi::kind::LTR;
1395+ case 0x8f:
1396+ return bidi::kind::RTL;
1397+ default:
1398+ break;
1399+ }
1400+ else if (p[1] == 0x81) /* E2 81 xx encodes U+2040..U+207F. */
1401+ switch (p[2])
1402+ {
1403+ case 0xa6:
1404+ return bidi::kind::LRI;
1405+ case 0xa7:
1406+ return bidi::kind::RLI;
1407+ case 0xa8:
1408+ return bidi::kind::FSI;
1409+ case 0xa9:
1410+ return bidi::kind::PDI;
1411+ default:
1412+ break;
1413+ }
1414+
1415+ return bidi::kind::NONE; /* Some other sequence starting with E2. */
1416+}
1417+
1418+/* Parse a UCN where P points just past \u (or \U iff IS_U) and return its bidi kind. */
1419+
1420+static bidi::kind
1421+get_bidi_ucn (const unsigned char *p, bool is_U)
1422+{
1423+ /* 6.4.3 Universal Character Names
1424+ \u hex-quad
1425+ \U hex-quad hex-quad
1426+ where \unnnn means \U0000nnnn. */
1427+
1428+ if (is_U)
1429+ {
1430+ if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1431+ return bidi::kind::NONE; /* The high hex-quad is not 0000. */
1432+ /* Skip 4B so we can treat \u and \U the same below. */
1433+ p += 4;
1434+ }
1435+
1436+ /* All code points we are looking for are of the form 20xx. */
1437+ if (p[0] != '2' || p[1] != '0')
1438+ return bidi::kind::NONE;
1439+ else if (p[2] == '2') /* U+202x. */
1440+ switch (p[3])
1441+ {
1442+ case 'a':
1443+ case 'A':
1444+ return bidi::kind::LRE;
1445+ case 'b':
1446+ case 'B':
1447+ return bidi::kind::RLE;
1448+ case 'c':
1449+ case 'C':
1450+ return bidi::kind::PDF;
1451+ case 'd':
1452+ case 'D':
1453+ return bidi::kind::LRO;
1454+ case 'e':
1455+ case 'E':
1456+ return bidi::kind::RLO;
1457+ default:
1458+ break;
1459+ }
1460+ else if (p[2] == '6') /* U+206x. */
1461+ switch (p[3])
1462+ {
1463+ case '6':
1464+ return bidi::kind::LRI;
1465+ case '7':
1466+ return bidi::kind::RLI;
1467+ case '8':
1468+ return bidi::kind::FSI;
1469+ case '9':
1470+ return bidi::kind::PDI;
1471+ default:
1472+ break;
1473+ }
1474+ else if (p[2] == '0') /* U+200x. */
1475+ switch (p[3])
1476+ {
1477+ case 'e':
1478+ case 'E':
1479+ return bidi::kind::LTR;
1480+ case 'f':
1481+ case 'F':
1482+ return bidi::kind::RTL;
1483+ default:
1484+ break;
1485+ }
1486+
1487+ return bidi::kind::NONE; /* Not one of the bidi code points. */
1488+}
1489+
1490+/* We're closing a bidi context, that is, we've encountered a newline,
1491+ are closing a C-style comment, or are at the end of a string literal,
1492+ character constant, or identifier. If the level is bidirectional_unpaired,
1493+ warn if some bidi context is still open. P points to the last character
1494+ in this context. All open contexts are discarded in any case. */
1495+
1496+static void
1497+maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1498+{
1499+ if (CPP_OPTION (pfile, cpp_warn_bidirectional) == bidirectional_unpaired
1500+ && bidi::vec.count () > 0)
1501+ {
1502+ const location_t loc
1503+ = linemap_position_for_column (pfile->line_table,
1504+ CPP_BUF_COLUMN (pfile->buffer, p));
1505+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1506+ "unpaired UTF-8 bidirectional control character "
1507+ "detected");
1508+ }
1509+ /* We're done with this context. */
1510+ bidi::on_close ();
1511+}
1512+
1513+/* We're at the beginning or in the middle of an identifier/comment/string
1514+ literal/character constant. Warn if we've encountered a bidi character.
1515+ KIND says which bidi character it was; P points to it in the character
1516+ stream. UCN_P is true iff this bidi character was written as a UCN. */
1517+
1518+static void
1519+maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
1520+ bool ucn_p)
1521+{
1522+ if (__builtin_expect (kind == bidi::kind::NONE, 1))
1523+ return;
1524+
1525+ const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1526+
1527+ if (warn_bidi != bidirectional_none)
1528+ {
1529+ const location_t loc
1530+ = linemap_position_for_column (pfile->line_table,
1531+ CPP_BUF_COLUMN (pfile->buffer, p));
1532+ /* It seems excessive to warn about a PDI/PDF that is closing
1533+ an opened context because we've already warned about the
1534+ opening character. Except, when the level is bidirectional_unpaired,
1535+ warn about a UCN x UTF-8 mismatch. */
1536+ if (kind == bidi::current_ctx ())
1537+ {
1538+ if (warn_bidi == bidirectional_unpaired
1539+ && bidi::current_ctx_ucn_p () != ucn_p)
1540+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1541+ "UTF-8 vs UCN mismatch when closing "
1542+ "a context by \"%s\"", bidi::to_str (kind));
1543+ }
1544+ else if (warn_bidi == bidirectional_any)
1545+ {
1546+ if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1547+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1548+ "\"%s\" is closing an unopened context",
1549+ bidi::to_str (kind));
1550+ else
1551+ cpp_warning_with_line (pfile, CPP_W_BIDIRECTIONAL, loc, 0,
1552+ "found problematic Unicode character \"%s\"",
1553+ bidi::to_str (kind));
1554+ }
1555+ }
1556+ /* Update the stack of open bidi contexts for this character. */
1557+ bidi::on_char (kind, ucn_p);
1558+}
1559+
1560 /* Skip a C-style block comment. We find the end of the comment by
1561 seeing if an asterisk is before every '/' we encounter. Returns
1562 nonzero if comment terminated by EOF, zero otherwise.
1563@@ -1175,6 +1493,7 @@ _cpp_skip_block_comment (cpp_reader *pfi
1564 cpp_buffer *buffer = pfile->buffer;
1565 const uchar *cur = buffer->cur;
1566 uchar c;
1567+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1568
1569 cur++;
1570 if (*cur == '/')
1571@@ -1189,7 +1508,11 @@ _cpp_skip_block_comment (cpp_reader *pfi
1572 if (c == '/')
1573 {
1574 if (cur[-2] == '*')
1575- break;
1576+ {
1577+ if (warn_bidi_p)
1578+ maybe_warn_bidi_on_close (pfile, cur);
1579+ break;
1580+ }
1581
1582 /* Warn about potential nested comments, but not if the '/'
1583 comes immediately before the true comment delimiter.
1584@@ -1208,6 +1531,8 @@ _cpp_skip_block_comment (cpp_reader *pfi
1585 {
1586 unsigned int cols;
1587 buffer->cur = cur - 1;
1588+ if (warn_bidi_p)
1589+ maybe_warn_bidi_on_close (pfile, cur);
1590 _cpp_process_line_notes (pfile, true);
1591 if (buffer->next_line >= buffer->rlimit)
1592 return true;
1593@@ -1218,6 +1543,13 @@ _cpp_skip_block_comment (cpp_reader *pfi
1594
1595 cur = buffer->cur;
1596 }
1597+ /* If this is a beginning of a UTF-8 encoding, it might be
1598+ a bidirectional control character. */
1599+ else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1600+ {
1601+ bidi::kind kind = get_bidi_utf8 (cur - 1);
1602+ maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/false);
1603+ }
1604 }
1605
1606 buffer->cur = cur;
1607@@ -1233,9 +1565,31 @@ skip_line_comment (cpp_reader *pfile)
1608 {
1609 cpp_buffer *buffer = pfile->buffer;
1610 location_t orig_line = pfile->line_table->highest_line;
1611+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1612
1613- while (*buffer->cur != '\n')
1614- buffer->cur++;
1615+ if (!warn_bidi_p)
1616+ while (*buffer->cur != '\n')
1617+ buffer->cur++;
1618+ else
1619+ {
1620+ while (*buffer->cur != '\n'
1621+ && *buffer->cur != bidi::utf8_start)
1622+ buffer->cur++;
1623+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1624+ {
1625+ while (*buffer->cur != '\n')
1626+ {
1627+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1628+ {
1629+ bidi::kind kind = get_bidi_utf8 (buffer->cur);
1630+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1631+ /*ucn_p=*/false);
1632+ }
1633+ buffer->cur++;
1634+ }
1635+ maybe_warn_bidi_on_close (pfile, buffer->cur);
1636+ }
1637+ }
1638
1639 _cpp_process_line_notes (pfile, true);
1640 return orig_line != pfile->line_table->highest_line;
1641@@ -1317,11 +1671,13 @@ static const cppchar_t utf8_signifier =
1642
1643 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1644 an identifier. FIRST is TRUE if this starts an identifier. */
1645+
1646 static bool
1647 forms_identifier_p (cpp_reader *pfile, int first,
1648 struct normalize_state *state)
1649 {
1650 cpp_buffer *buffer = pfile->buffer;
1651+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1652
1653 if (*buffer->cur == '$')
1654 {
1655@@ -1344,6 +1700,13 @@ forms_identifier_p (cpp_reader *pfile, i
1656 cppchar_t s;
1657 if (*buffer->cur >= utf8_signifier)
1658 {
1659+ if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1660+ && warn_bidi_p)
1661+ {
1662+ bidi::kind kind = get_bidi_utf8 (buffer->cur);
1663+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1664+ /*ucn_p=*/false);
1665+ }
1666 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1667 state, &s))
1668 return true;
1669@@ -1352,6 +1715,13 @@ forms_identifier_p (cpp_reader *pfile, i
1670 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1671 {
1672 buffer->cur += 2;
1673+ if (warn_bidi_p)
1674+ {
1675+ bidi::kind kind = get_bidi_ucn (buffer->cur,
1676+ buffer->cur[-1] == 'U');
1677+ maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
1678+ /*ucn_p=*/true);
1679+ }
1680 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1681 state, &s, NULL, NULL))
1682 return true;
1683@@ -1460,6 +1830,7 @@ lex_identifier (cpp_reader *pfile, const
1684 const uchar *cur;
1685 unsigned int len;
1686 unsigned int hash = HT_HASHSTEP (0, *base);
1687+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1688
1689 cur = pfile->buffer->cur;
1690 if (! starts_ucn)
1691@@ -1483,6 +1854,8 @@ lex_identifier (cpp_reader *pfile, const
1692 pfile->buffer->cur++;
1693 }
1694 } while (forms_identifier_p (pfile, false, nst));
1695+ if (warn_bidi_p)
1696+ maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
1697 result = _cpp_interpret_identifier (pfile, base,
1698 pfile->buffer->cur - base);
1699 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1700@@ -1719,6 +2092,7 @@ static void
1701 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1702 {
1703 const uchar *pos = base;
1704+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1705
1706 /* 'tis a pity this information isn't passed down from the lexer's
1707 initial categorization of the token. */
1708@@ -1955,8 +2329,15 @@ lex_raw_string (cpp_reader *pfile, cpp_t
1709 pos = base = pfile->buffer->cur;
1710 note = &pfile->buffer->notes[pfile->buffer->cur_note];
1711 }
1712+ else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
1713+ && warn_bidi_p)
1714+ maybe_warn_bidi_on_char (pfile, pos - 1, get_bidi_utf8 (pos - 1),
1715+ /*ucn_p=*/false);
1716 }
1717
1718+ if (warn_bidi_p)
1719+ maybe_warn_bidi_on_close (pfile, pos);
1720+
1721 if (CPP_OPTION (pfile, user_literals))
1722 {
1723 /* If a string format macro, say from inttypes.h, is placed touching
1724@@ -2051,15 +2432,27 @@ lex_string (cpp_reader *pfile, cpp_token
1725 else
1726 terminator = '>', type = CPP_HEADER_NAME;
1727
1728+ const bool warn_bidi_p = pfile->warn_bidi_p ();
1729 for (;;)
1730 {
1731 cppchar_t c = *cur++;
1732
1733 /* In #include-style directives, terminators are not escapable. */
1734 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1735- cur++;
1736+ {
1737+ if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
1738+ {
1739+ bidi::kind kind = get_bidi_ucn (cur + 1, cur[0] == 'U');
1740+ maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/true);
1741+ }
1742+ cur++;
1743+ }
1744 else if (c == terminator)
1745- break;
1746+ {
1747+ if (warn_bidi_p)
1748+ maybe_warn_bidi_on_close (pfile, cur - 1);
1749+ break;
1750+ }
1751 else if (c == '\n')
1752 {
1753 cur--;
1754@@ -2076,6 +2469,11 @@ lex_string (cpp_reader *pfile, cpp_token
1755 }
1756 else if (c == '\0')
1757 saw_NUL = true;
1758+ else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1759+ {
1760+ bidi::kind kind = get_bidi_utf8 (cur - 1);
1761+ maybe_warn_bidi_on_char (pfile, cur - 1, kind, /*ucn_p=*/false);
1762+ }
1763 }
1764
1765 if (saw_NUL && !pfile->state.skipping)