Brad Bishop | bec4ebc | 2022-08-03 09:55:16 -0400 | [diff] [blame] | 1 | From bd5e882cf6e0def3dd1bc106075d59a303fe0d1e Mon Sep 17 00:00:00 2001 |
| 2 | From: David Malcolm <dmalcolm@redhat.com> |
| 3 | Date: Mon, 18 Oct 2021 18:55:31 -0400 |
| 4 | Subject: [PATCH] diagnostics: escape non-ASCII source bytes for certain |
| 5 | diagnostics |
| 6 | MIME-Version: 1.0 |
| 7 | Content-Type: text/plain; charset=utf8 |
| 8 | Content-Transfer-Encoding: 8bit |
| 9 | |
| 10 | This patch adds support to GCC's diagnostic subsystem for escaping certain |
| 11 | bytes and Unicode characters when quoting source code. |
| 12 | |
| 13 | Specifically, this patch adds a new flag rich_location::m_escape_on_output |
| 14 | which is a hint from a diagnostic that non-ASCII bytes in the pertinent |
| 15 | lines of the user's source code should be escaped when printed. |
| 16 | |
| 17 | The patch sets this for the following diagnostics: |
| 18 | - when complaining about stray bytes in the program (when these |
| 19 | are non-printable) |
| 20 | - when complaining about "null character(s) ignored"; |
| 21 | - for -Wnormalized= (and generate source ranges for such warnings) |
| 22 | |
| 23 | The escaping is controlled by a new option: |
| 24 | -fdiagnostics-escape-format=[unicode|bytes] |
| 25 | |
| 26 | For example, consider a diagnostic involving a source line containing the |
| 27 | string "before" followed by the Unicode character U+03C0 ("GREEK SMALL |
| 28 | LETTER PI", with UTF-8 encoding 0xCF 0x80) followed by the byte 0xBF |
| 29 | (a stray UTF-8 trailing byte), followed by the string "after", where the |
| 30 | diagnostic highlights the U+03C0 character. |
| 31 | |
| 32 | By default, this line will be printed verbatim to the user when |
| 33 | reporting a diagnostic at it, as: |
| 34 | |
| 35 | beforeπXafter |
| 36 | ^ |
| 37 | |
| 38 | (using X for the stray byte to avoid putting invalid UTF-8 in this |
| 39 | commit message) |
| 40 | |
| 41 | If the diagnostic sets the "escape" flag, it will be printed as: |
| 42 | |
| 43 | before<U+03C0><BF>after |
| 44 | ^~~~~~~~ |
| 45 | |
| 46 | with -fdiagnostics-escape-format=unicode (the default), or as: |
| 47 | |
| 48 | before<CF><80><BF>after |
| 49 | ^~~~~~~~ |
| 50 | |
| 51 | if the user supplies -fdiagnostics-escape-format=bytes. |
| 52 | |
| 53 | This only affects how the source is printed; it does not affect |
| 54 | how column numbers are printed (as per -fdiagnostics-column-unit= |
| 55 | and -fdiagnostics-column-origin=). |
| 56 | |
| 57 | gcc/c-family/ChangeLog: |
| 58 | * c-lex.c (c_lex_with_flags): When complaining about non-printable |
| 59 | CPP_OTHER tokens, set the "escape on output" flag. |
| 60 | |
| 61 | gcc/ChangeLog: |
| 62 | * common.opt (fdiagnostics-escape-format=): New. |
| 63 | (diagnostics_escape_format): New enum. |
| 64 | (DIAGNOSTICS_ESCAPE_FORMAT_UNICODE): New enum value. |
| 65 | (DIAGNOSTICS_ESCAPE_FORMAT_BYTES): Likewise. |
| 66 | * diagnostic-format-json.cc (json_end_diagnostic): Add |
| 67 | "escape-source" attribute. |
| 68 | * diagnostic-show-locus.c |
| 69 | (exploc_with_display_col::exploc_with_display_col): Replace |
| 70 | "tabstop" param with a cpp_char_column_policy and add an "aspect" |
| 71 | param. Use these to compute m_display_col accordingly. |
| 72 | (struct char_display_policy): New struct. |
| 73 | (layout::m_policy): New field. |
| 74 | (layout::m_escape_on_output): New field. |
| 75 | (def_policy): New function. |
| 76 | (make_range): Update for changes to exploc_with_display_col ctor. |
| 77 | (default_print_decoded_ch): New. |
| 78 | (width_per_escaped_byte): New. |
| 79 | (escape_as_bytes_width): New. |
| 80 | (escape_as_bytes_print): New. |
| 81 | (escape_as_unicode_width): New. |
| 82 | (escape_as_unicode_print): New. |
| 83 | (make_policy): New. |
| 84 | (layout::layout): Initialize new fields. Update m_exploc ctor |
| 85 | call for above change to ctor. |
| 86 | (layout::maybe_add_location_range): Update for changes to |
| 87 | exploc_with_display_col ctor. |
| 88 | (layout::calculate_x_offset_display): Update for change to |
| 89 | cpp_display_width. |
| 90 | (layout::print_source_line): Pass policy |
| 91 | to cpp_display_width_computation. Capture cpp_decoded_char when |
| 92 | calling process_next_codepoint. Move printing of source code to |
| 93 | m_policy.m_print_cb. |
| 94 | (line_label::line_label): Pass in policy rather than context. |
| 95 | (layout::print_any_labels): Update for change to line_label ctor. |
| 96 | (get_affected_range): Pass in policy rather than context, updating |
| 97 | calls to location_compute_display_column accordingly. |
| 98 | (get_printed_columns): Likewise, also for cpp_display_width. |
| 99 | (correction::correction): Pass in policy rather than tabstop. |
| 100 | (correction::compute_display_cols): Pass m_policy rather than |
| 101 | m_tabstop to cpp_display_width. |
| 102 | (correction::m_tabstop): Replace with... |
| 103 | (correction::m_policy): ...this. |
| 104 | (line_corrections::line_corrections): Pass in policy rather than |
| 105 | context. |
| 106 | (line_corrections::m_context): Replace with... |
| 107 | (line_corrections::m_policy): ...this. |
| 108 | (line_corrections::add_hint): Update to use m_policy rather than |
| 109 | m_context. |
| 110 | (line_corrections::add_hint): Likewise. |
| 111 | (layout::print_trailing_fixits): Likewise. |
| 112 | (selftest::test_display_widths): New. |
| 113 | (selftest::test_layout_x_offset_display_utf8): Update to use |
| 114 | policy rather than tabstop. |
| 115 | (selftest::test_one_liner_labels_utf8): Add test of escaping |
| 116 | source lines. |
| 117 | (selftest::test_diagnostic_show_locus_one_liner_utf8): Update to |
| 118 | use policy rather than tabstop. |
| 119 | (selftest::test_overlapped_fixit_printing): Likewise. |
| 120 | (selftest::test_overlapped_fixit_printing_utf8): Likewise. |
| 121 | (selftest::test_overlapped_fixit_printing_2): Likewise. |
| 122 | (selftest::test_tab_expansion): Likewise. |
| 123 | (selftest::test_escaping_bytes_1): New. |
| 124 | (selftest::test_escaping_bytes_2): New. |
| 125 | (selftest::diagnostic_show_locus_c_tests): Call the new tests. |
| 126 | * diagnostic.c (diagnostic_initialize): Initialize |
| 127 | context->escape_format. |
| 128 | (convert_column_unit): Update to use default character width policy. |
| 129 | (selftest::test_diagnostic_get_location_text): Likewise. |
| 130 | * diagnostic.h (enum diagnostics_escape_format): New enum. |
| 131 | (diagnostic_context::escape_format): New field. |
| 132 | * doc/invoke.texi (-fdiagnostics-escape-format=): New option. |
| 133 | (-fdiagnostics-format=): Add "escape-source" attribute to examples |
| 134 | of JSON output, and document it. |
| 135 | * input.c (location_compute_display_column): Pass in "policy" |
| 136 | rather than "tabstop", passing to |
| 137 | cpp_byte_column_to_display_column. |
| 138 | (selftest::test_cpp_utf8): Update to use cpp_char_column_policy. |
| 139 | * input.h (class cpp_char_column_policy): New forward decl. |
| 140 | (location_compute_display_column): Pass in "policy" rather than |
| 141 | "tabstop". |
| 142 | * opts.c (common_handle_option): Handle |
| 143 | OPT_fdiagnostics_escape_format_. |
| 144 | * selftest.c (temp_source_file::temp_source_file): New ctor |
| 145 | overload taking a size_t. |
| 146 | * selftest.h (temp_source_file::temp_source_file): Likewise. |
| 147 | |
| 148 | gcc/testsuite/ChangeLog: |
| 149 | * c-c++-common/diagnostic-format-json-1.c: Add regexp to consume |
| 150 | "escape-source" attribute. |
| 151 | * c-c++-common/diagnostic-format-json-2.c: Likewise. |
| 152 | * c-c++-common/diagnostic-format-json-3.c: Likewise. |
| 153 | * c-c++-common/diagnostic-format-json-4.c: Likewise, twice. |
| 154 | * c-c++-common/diagnostic-format-json-5.c: Likewise. |
| 155 | * gcc.dg/cpp/warn-normalized-4-bytes.c: New test. |
| 156 | * gcc.dg/cpp/warn-normalized-4-unicode.c: New test. |
| 157 | * gcc.dg/encoding-issues-bytes.c: New test. |
| 158 | * gcc.dg/encoding-issues-unicode.c: New test. |
| 159 | * gfortran.dg/diagnostic-format-json-1.F90: Add regexp to consume |
| 160 | "escape-source" attribute. |
| 161 | * gfortran.dg/diagnostic-format-json-2.F90: Likewise. |
| 162 | * gfortran.dg/diagnostic-format-json-3.F90: Likewise. |
| 163 | |
| 164 | libcpp/ChangeLog: |
| 165 | * charset.c (convert_escape): Use encoding_rich_location when |
| 166 | complaining about nonprintable unknown escape sequences. |
| 167 | (cpp_display_width_computation::cpp_display_width_computation): |
| 168 | Pass in policy rather than tabstop. |
| 169 | (cpp_display_width_computation::process_next_codepoint): Add "out" |
| 170 | param and populate *out if non-NULL. |
| 171 | (cpp_display_width_computation::advance_display_cols): Pass NULL |
| 172 | to process_next_codepoint. |
| 173 | (cpp_byte_column_to_display_column): Pass in policy rather than |
| 174 | tabstop. Pass NULL to process_next_codepoint. |
| 175 | (cpp_display_column_to_byte_column): Pass in policy rather than |
| 176 | tabstop. |
| 177 | * errors.c (cpp_diagnostic_get_current_location): New function, |
| 178 | splitting out the logic from... |
| 179 | (cpp_diagnostic): ...here. |
| 180 | (cpp_warning_at): New function. |
| 181 | (cpp_pedwarning_at): New function. |
| 182 | * include/cpplib.h (cpp_warning_at): New decl for rich_location. |
| 183 | (cpp_pedwarning_at): Likewise. |
| 184 | (struct cpp_decoded_char): New. |
| 185 | (struct cpp_char_column_policy): New. |
| 186 | (cpp_display_width_computation::cpp_display_width_computation): |
| 187 | Replace "tabstop" param with "policy". |
| 188 | (cpp_display_width_computation::process_next_codepoint): Add "out" |
| 189 | param. |
| 190 | (cpp_display_width_computation::m_tabstop): Replace with... |
| 191 | (cpp_display_width_computation::m_policy): ...this. |
| 192 | (cpp_byte_column_to_display_column): Replace "tabstop" param with |
| 193 | "policy". |
| 194 | (cpp_display_width): Likewise. |
| 195 | (cpp_display_column_to_byte_column): Likewise. |
| 196 | * include/line-map.h (rich_location::escape_on_output_p): New. |
| 197 | (rich_location::set_escape_on_output): New. |
| 198 | (rich_location::m_escape_on_output): New. |
| 199 | * internal.h (cpp_diagnostic_get_current_location): New decl. |
| 200 | (class encoding_rich_location): New. |
| 201 | * lex.c (skip_whitespace): Use encoding_rich_location when |
| 202 | complaining about null characters. |
| 203 | (warn_about_normalization): Generate a source range when |
| 204 | complaining about improperly normalized tokens, rather than just a |
| 205 | point, and use encoding_rich_location so that the source code |
| 206 | is escaped on printing. |
| 207 | * line-map.c (rich_location::rich_location): Initialize |
| 208 | m_escape_on_output. |
| 209 | |
| 210 | Signed-off-by: David Malcolm <dmalcolm@redhat.com> |
| 211 | |
| 212 | CVE: CVE-2021-42574 |
| 213 | Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e] |
| 214 | Signed-off-by: Pgowda <pgowda.cve@gmail.com> |
| 215 | |
| 216 | --- |
| 217 | gcc/c-family/c-lex.c | 6 +- |
| 218 | gcc/common.opt | 13 + |
| 219 | gcc/diagnostic-format-json.cc | 3 + |
| 220 | gcc/diagnostic-show-locus.c | 580 +++++++++++++++--- |
| 221 | gcc/diagnostic.c | 10 +- |
| 222 | gcc/diagnostic.h | 18 + |
| 223 | gcc/doc/invoke.texi | 43 +- |
| 224 | gcc/input.c | 62 +- |
| 225 | gcc/input.h | 7 +- |
| 226 | gcc/opts.c | 4 + |
| 227 | gcc/selftest.c | 15 + |
| 228 | gcc/selftest.h | 2 + |
| 229 | .../c-c++-common/diagnostic-format-json-1.c | 1 + |
| 230 | .../c-c++-common/diagnostic-format-json-2.c | 1 + |
| 231 | .../c-c++-common/diagnostic-format-json-3.c | 1 + |
| 232 | .../c-c++-common/diagnostic-format-json-4.c | 2 + |
| 233 | .../c-c++-common/diagnostic-format-json-5.c | 1 + |
| 234 | .../gcc.dg/cpp/warn-normalized-4-bytes.c | 21 + |
| 235 | .../gcc.dg/cpp/warn-normalized-4-unicode.c | 19 + |
| 236 | gcc/testsuite/gcc.dg/encoding-issues-bytes.c | Bin 0 -> 595 bytes |
| 237 | .../gcc.dg/encoding-issues-unicode.c | Bin 0 -> 613 bytes |
| 238 | .../gfortran.dg/diagnostic-format-json-1.F90 | 1 + |
| 239 | .../gfortran.dg/diagnostic-format-json-2.F90 | 1 + |
| 240 | .../gfortran.dg/diagnostic-format-json-3.F90 | 1 + |
| 241 | libcpp/charset.c | 63 +- |
| 242 | libcpp/errors.c | 82 ++- |
| 243 | libcpp/include/cpplib.h | 76 ++- |
| 244 | libcpp/include/line-map.h | 13 + |
| 245 | libcpp/internal.h | 23 + |
| 246 | libcpp/lex.c | 38 +- |
| 247 | libcpp/line-map.c | 3 +- |
| 248 | 31 files changed, 942 insertions(+), 168 deletions(-) |
| 249 | create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c |
| 250 | create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c |
| 251 | create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-bytes.c |
| 252 | create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-unicode.c |
| 253 | |
| 254 | diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c |
| 255 | --- a/gcc/c-family/c-lex.c 2021-07-27 23:55:06.980283060 -0700 |
| 256 | +++ b/gcc/c-family/c-lex.c 2021-12-14 01:16:01.541943272 -0800 |
| 257 | @@ -603,7 +603,11 @@ c_lex_with_flags (tree *value, location_ |
| 258 | else if (ISGRAPH (c)) |
| 259 | error_at (*loc, "stray %qc in program", (int) c); |
| 260 | else |
| 261 | - error_at (*loc, "stray %<\\%o%> in program", (int) c); |
| 262 | + { |
| 263 | + rich_location rich_loc (line_table, *loc); |
| 264 | + rich_loc.set_escape_on_output (true); |
| 265 | + error_at (&rich_loc, "stray %<\\%o%> in program", (int) c); |
| 266 | + } |
| 267 | } |
| 268 | goto retry; |
| 269 | |
| 270 | diff --git a/gcc/common.opt b/gcc/common.opt |
| 271 | --- a/gcc/common.opt 2021-12-13 22:08:44.939137107 -0800 |
| 272 | +++ b/gcc/common.opt 2021-12-14 01:16:01.541943272 -0800 |
| 273 | @@ -1348,6 +1348,10 @@ fdiagnostics-format= |
| 274 | Common Joined RejectNegative Enum(diagnostics_output_format) |
| 275 | -fdiagnostics-format=[text|json] Select output format. |
| 276 | |
| 277 | +fdiagnostics-escape-format= |
| 278 | +Common Joined RejectNegative Enum(diagnostics_escape_format) |
| 279 | +-fdiagnostics-escape-format=[unicode|bytes] Select how to escape non-printable-ASCII bytes in the source for diagnostics that suggest it. |
| 280 | + |
| 281 | ; Required for these enum values. |
| 282 | SourceInclude |
| 283 | diagnostic.h |
| 284 | @@ -1362,6 +1366,15 @@ EnumValue |
| 285 | Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE) |
| 286 | |
| 287 | Enum |
| 288 | +Name(diagnostics_escape_format) Type(int) |
| 289 | + |
| 290 | +EnumValue |
| 291 | +Enum(diagnostics_escape_format) String(unicode) Value(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE) |
| 292 | + |
| 293 | +EnumValue |
| 294 | +Enum(diagnostics_escape_format) String(bytes) Value(DIAGNOSTICS_ESCAPE_FORMAT_BYTES) |
| 295 | + |
| 296 | +Enum |
| 297 | Name(diagnostics_output_format) Type(int) |
| 298 | |
| 299 | EnumValue |
| 300 | diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c |
| 301 | --- a/gcc/diagnostic.c 2021-07-27 23:55:07.232286576 -0700 |
| 302 | +++ b/gcc/diagnostic.c 2021-12-14 01:16:01.545943202 -0800 |
| 303 | @@ -230,6 +230,7 @@ diagnostic_initialize (diagnostic_contex |
| 304 | context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY; |
| 305 | context->column_origin = 1; |
| 306 | context->tabstop = 8; |
| 307 | + context->escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| 308 | context->edit_context_ptr = NULL; |
| 309 | context->diagnostic_group_nesting_depth = 0; |
| 310 | context->diagnostic_group_emission_count = 0; |
| 311 | @@ -382,7 +383,10 @@ convert_column_unit (enum diagnostics_co |
| 312 | gcc_unreachable (); |
| 313 | |
| 314 | case DIAGNOSTICS_COLUMN_UNIT_DISPLAY: |
| 315 | - return location_compute_display_column (s, tabstop); |
| 316 | + { |
| 317 | + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| 318 | + return location_compute_display_column (s, policy); |
| 319 | + } |
| 320 | |
| 321 | case DIAGNOSTICS_COLUMN_UNIT_BYTE: |
| 322 | return s.column; |
| 323 | @@ -2275,8 +2279,8 @@ test_diagnostic_get_location_text () |
| 324 | const char *const content = "smile \xf0\x9f\x98\x82\n"; |
| 325 | const int line_bytes = strlen (content) - 1; |
| 326 | const int def_tabstop = 8; |
| 327 | - const int display_width = cpp_display_width (content, line_bytes, |
| 328 | - def_tabstop); |
| 329 | + const cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); |
| 330 | + const int display_width = cpp_display_width (content, line_bytes, policy); |
| 331 | ASSERT_EQ (line_bytes - 2, display_width); |
| 332 | temp_source_file tmp (SELFTEST_LOCATION, ".c", content); |
| 333 | const char *const fname = tmp.get_filename (); |
| 334 | diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc |
| 335 | --- a/gcc/diagnostic-format-json.cc 2021-07-27 23:55:07.232286576 -0700 |
| 336 | +++ b/gcc/diagnostic-format-json.cc 2021-12-14 01:16:01.541943272 -0800 |
| 337 | @@ -264,6 +264,9 @@ json_end_diagnostic (diagnostic_context |
| 338 | json::value *path_value = context->make_json_for_path (context, path); |
| 339 | diag_obj->set ("path", path_value); |
| 340 | } |
| 341 | + |
| 342 | + diag_obj->set ("escape-source", |
| 343 | + new json::literal (richloc->escape_on_output_p ())); |
| 344 | } |
| 345 | |
| 346 | /* No-op implementation of "begin_group_cb" for JSON output. */ |
| 347 | diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h |
| 348 | --- a/gcc/diagnostic.h 2021-07-27 23:55:07.236286632 -0700 |
| 349 | +++ b/gcc/diagnostic.h 2021-12-14 01:16:01.545943202 -0800 |
| 350 | @@ -38,6 +38,20 @@ enum diagnostics_column_unit |
| 351 | DIAGNOSTICS_COLUMN_UNIT_BYTE |
| 352 | }; |
| 353 | |
| 354 | +/* An enum for controlling how to print non-ASCII characters/bytes when |
| 355 | + a diagnostic suggests escaping the source code on output. */ |
| 356 | + |
| 357 | +enum diagnostics_escape_format |
| 358 | +{ |
| 359 | + /* Escape non-ASCII Unicode characters in the form <U+XXXX> and |
| 360 | + non-UTF-8 bytes in the form <XX>. */ |
| 361 | + DIAGNOSTICS_ESCAPE_FORMAT_UNICODE, |
| 362 | + |
| 363 | + /* Escape non-ASCII bytes in the form <XX> (thus showing the underlying |
| 364 | + encoding of non-ASCII Unicode characters). */ |
| 365 | + DIAGNOSTICS_ESCAPE_FORMAT_BYTES |
| 366 | +}; |
| 367 | + |
| 368 | /* Enum for overriding the standard output format. */ |
| 369 | |
| 370 | enum diagnostics_output_format |
| 371 | @@ -320,6 +334,10 @@ struct diagnostic_context |
| 372 | /* The size of the tabstop for tab expansion. */ |
| 373 | int tabstop; |
| 374 | |
| 375 | + /* How should non-ASCII/non-printable bytes be escaped when |
| 376 | + a diagnostic suggests escaping the source code on output. */ |
| 377 | + enum diagnostics_escape_format escape_format; |
| 378 | + |
| 379 | /* If non-NULL, an edit_context to which fix-it hints should be |
| 380 | applied, for generating patches. */ |
| 381 | edit_context *edit_context_ptr; |
| 382 | diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c |
| 383 | --- a/gcc/diagnostic-show-locus.c 2021-07-27 23:55:07.232286576 -0700 |
| 384 | +++ b/gcc/diagnostic-show-locus.c 2021-12-14 01:16:01.545943202 -0800 |
| 385 | @@ -175,10 +175,26 @@ enum column_unit { |
| 386 | class exploc_with_display_col : public expanded_location |
| 387 | { |
| 388 | public: |
| 389 | - exploc_with_display_col (const expanded_location &exploc, int tabstop) |
| 390 | - : expanded_location (exploc), |
| 391 | - m_display_col (location_compute_display_column (exploc, tabstop)) |
| 392 | - {} |
| 393 | + exploc_with_display_col (const expanded_location &exploc, |
| 394 | + const cpp_char_column_policy &policy, |
| 395 | + enum location_aspect aspect) |
| 396 | + : expanded_location (exploc), |
| 397 | + m_display_col (location_compute_display_column (exploc, policy)) |
| 398 | + { |
| 399 | + if (exploc.column > 0) |
| 400 | + { |
| 401 | + /* m_display_col is now the final column of the byte. |
| 402 | + If escaping has happened, we may want the first column instead. */ |
| 403 | + if (aspect != LOCATION_ASPECT_FINISH) |
| 404 | + { |
| 405 | + expanded_location prev_exploc (exploc); |
| 406 | + prev_exploc.column--; |
| 407 | + int prev_display_col |
| 408 | + = (location_compute_display_column (prev_exploc, policy)); |
| 409 | + m_display_col = prev_display_col + 1; |
| 410 | + } |
| 411 | + } |
| 412 | + } |
| 413 | |
| 414 | int m_display_col; |
| 415 | }; |
| 416 | @@ -313,6 +329,31 @@ test_line_span () |
| 417 | |
| 418 | #endif /* #if CHECKING_P */ |
| 419 | |
| 420 | +/* A bundle of information containing how to print unicode |
| 421 | + characters and bytes when quoting source code. |
| 422 | + |
| 423 | + Provides a unified place to support escaping some subset |
| 424 | + of characters to some format. |
| 425 | + |
| 426 | + Extends char_column_policy; printing is split out to avoid |
| 427 | + libcpp having to know about pretty_printer. */ |
| 428 | + |
| 429 | +struct char_display_policy : public cpp_char_column_policy |
| 430 | +{ |
| 431 | + public: |
| 432 | + char_display_policy (int tabstop, |
| 433 | + int (*width_cb) (cppchar_t c), |
| 434 | + void (*print_cb) (pretty_printer *pp, |
| 435 | + const cpp_decoded_char &cp)) |
| 436 | + : cpp_char_column_policy (tabstop, width_cb), |
| 437 | + m_print_cb (print_cb) |
| 438 | + { |
| 439 | + } |
| 440 | + |
| 441 | + void (*m_print_cb) (pretty_printer *pp, |
| 442 | + const cpp_decoded_char &cp); |
| 443 | +}; |
| 444 | + |
| 445 | /* A class to control the overall layout when printing a diagnostic. |
| 446 | |
| 447 | The layout is determined within the constructor. |
| 448 | @@ -345,6 +386,8 @@ class layout |
| 449 | |
| 450 | void print_line (linenum_type row); |
| 451 | |
| 452 | + void on_bad_codepoint (const char *ptr, cppchar_t ch, size_t ch_sz); |
| 453 | + |
| 454 | private: |
| 455 | bool will_show_line_p (linenum_type row) const; |
| 456 | void print_leading_fixits (linenum_type row); |
| 457 | @@ -386,6 +429,7 @@ class layout |
| 458 | private: |
| 459 | diagnostic_context *m_context; |
| 460 | pretty_printer *m_pp; |
| 461 | + char_display_policy m_policy; |
| 462 | location_t m_primary_loc; |
| 463 | exploc_with_display_col m_exploc; |
| 464 | colorizer m_colorizer; |
| 465 | @@ -398,6 +442,7 @@ class layout |
| 466 | auto_vec <line_span> m_line_spans; |
| 467 | int m_linenum_width; |
| 468 | int m_x_offset_display; |
| 469 | + bool m_escape_on_output; |
| 470 | }; |
| 471 | |
| 472 | /* Implementation of "class colorizer". */ |
| 473 | @@ -646,6 +691,11 @@ layout_range::intersects_line_p (linenum |
| 474 | /* Default for when we don't care what the tab expansion is set to. */ |
| 475 | static const int def_tabstop = 8; |
| 476 | |
| 477 | +static cpp_char_column_policy def_policy () |
| 478 | +{ |
| 479 | + return cpp_char_column_policy (8, cpp_wcwidth); |
| 480 | +} |
| 481 | + |
| 482 | /* Create some expanded locations for testing layout_range. The filename |
| 483 | member of the explocs is set to the empty string. This member will only be |
| 484 | inspected by the calls to location_compute_display_column() made from the |
| 485 | @@ -662,10 +712,13 @@ make_range (int start_line, int start_co |
| 486 | = {"", start_line, start_col, NULL, false}; |
| 487 | const expanded_location finish_exploc |
| 488 | = {"", end_line, end_col, NULL, false}; |
| 489 | - return layout_range (exploc_with_display_col (start_exploc, def_tabstop), |
| 490 | - exploc_with_display_col (finish_exploc, def_tabstop), |
| 491 | + return layout_range (exploc_with_display_col (start_exploc, def_policy (), |
| 492 | + LOCATION_ASPECT_START), |
| 493 | + exploc_with_display_col (finish_exploc, def_policy (), |
| 494 | + LOCATION_ASPECT_FINISH), |
| 495 | SHOW_RANGE_WITHOUT_CARET, |
| 496 | - exploc_with_display_col (start_exploc, def_tabstop), |
| 497 | + exploc_with_display_col (start_exploc, def_policy (), |
| 498 | + LOCATION_ASPECT_CARET), |
| 499 | 0, NULL); |
| 500 | } |
| 501 | |
| 502 | @@ -959,6 +1012,164 @@ fixit_cmp (const void *p_a, const void * |
| 503 | return hint_a->get_start_loc () - hint_b->get_start_loc (); |
| 504 | } |
| 505 | |
| 506 | +/* Callbacks for use when not escaping the source. */ |
| 507 | + |
| 508 | +/* The default callback for char_column_policy::m_width_cb is cpp_wcwidth. */ |
| 509 | + |
| 510 | +/* Callback for char_display_policy::m_print_cb for printing source chars |
| 511 | + when not escaping the source. */ |
| 512 | + |
| 513 | +static void |
| 514 | +default_print_decoded_ch (pretty_printer *pp, |
| 515 | + const cpp_decoded_char &decoded_ch) |
| 516 | +{ |
| 517 | + for (const char *ptr = decoded_ch.m_start_byte; |
| 518 | + ptr != decoded_ch.m_next_byte; ptr++) |
| 519 | + { |
| 520 | + if (*ptr == '\0' || *ptr == '\r') |
| 521 | + { |
| 522 | + pp_space (pp); |
| 523 | + continue; |
| 524 | + } |
| 525 | + |
| 526 | + pp_character (pp, *ptr); |
| 527 | + } |
| 528 | +} |
| 529 | + |
| 530 | +/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| 531 | + |
| 532 | +static const int width_per_escaped_byte = 4; |
| 533 | + |
| 534 | +/* Callback for char_column_policy::m_width_cb for determining the |
| 535 | + display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| 536 | + |
| 537 | +static int |
| 538 | +escape_as_bytes_width (cppchar_t ch) |
| 539 | +{ |
| 540 | + if (ch < 0x80 && ISPRINT (ch)) |
| 541 | + return cpp_wcwidth (ch); |
| 542 | + else |
| 543 | + { |
| 544 | + if (ch <= 0x7F) return 1 * width_per_escaped_byte; |
| 545 | + if (ch <= 0x7FF) return 2 * width_per_escaped_byte; |
| 546 | + if (ch <= 0xFFFF) return 3 * width_per_escaped_byte; |
| 547 | + return 4 * width_per_escaped_byte; |
| 548 | + } |
| 549 | +} |
| 550 | + |
| 551 | +/* Callback for char_display_policy::m_print_cb for printing source chars |
| 552 | + when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| 553 | + |
| 554 | +static void |
| 555 | +escape_as_bytes_print (pretty_printer *pp, |
| 556 | + const cpp_decoded_char &decoded_ch) |
| 557 | +{ |
| 558 | + if (!decoded_ch.m_valid_ch) |
| 559 | + { |
| 560 | + for (const char *iter = decoded_ch.m_start_byte; |
| 561 | + iter != decoded_ch.m_next_byte; ++iter) |
| 562 | + { |
| 563 | + char buf[16]; |
| 564 | + sprintf (buf, "<%02x>", (unsigned char)*iter); |
| 565 | + pp_string (pp, buf); |
| 566 | + } |
| 567 | + return; |
| 568 | + } |
| 569 | + |
| 570 | + cppchar_t ch = decoded_ch.m_ch; |
| 571 | + if (ch < 0x80 && ISPRINT (ch)) |
| 572 | + pp_character (pp, ch); |
| 573 | + else |
| 574 | + { |
| 575 | + for (const char *iter = decoded_ch.m_start_byte; |
| 576 | + iter < decoded_ch.m_next_byte; ++iter) |
| 577 | + { |
| 578 | + char buf[16]; |
| 579 | + sprintf (buf, "<%02x>", (unsigned char)*iter); |
| 580 | + pp_string (pp, buf); |
| 581 | + } |
| 582 | + } |
| 583 | +} |
| 584 | + |
| 585 | +/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| 586 | + |
| 587 | +/* Callback for char_column_policy::m_width_cb for determining the |
| 588 | + display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| 589 | + |
| 590 | +static int |
| 591 | +escape_as_unicode_width (cppchar_t ch) |
| 592 | +{ |
| 593 | + if (ch < 0x80 && ISPRINT (ch)) |
| 594 | + return cpp_wcwidth (ch); |
| 595 | + else |
| 596 | + { |
| 597 | + // Width of "<U+%04x>" |
| 598 | + if (ch > 0xfffff) |
| 599 | + return 10; |
| 600 | + else if (ch > 0xffff) |
| 601 | + return 9; |
| 602 | + else |
| 603 | + return 8; |
| 604 | + } |
| 605 | +} |
| 606 | + |
| 607 | +/* Callback for char_display_policy::m_print_cb for printing source chars |
| 608 | + when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| 609 | + |
| 610 | +static void |
| 611 | +escape_as_unicode_print (pretty_printer *pp, |
| 612 | + const cpp_decoded_char &decoded_ch) |
| 613 | +{ |
| 614 | + if (!decoded_ch.m_valid_ch) |
| 615 | + { |
| 616 | + escape_as_bytes_print (pp, decoded_ch); |
| 617 | + return; |
| 618 | + } |
| 619 | + |
| 620 | + cppchar_t ch = decoded_ch.m_ch; |
| 621 | + if (ch < 0x80 && ISPRINT (ch)) |
| 622 | + pp_character (pp, ch); |
| 623 | + else |
| 624 | + { |
| 625 | + char buf[16]; |
| 626 | + sprintf (buf, "<U+%04X>", ch); |
| 627 | + pp_string (pp, buf); |
| 628 | + } |
| 629 | +} |
| 630 | + |
| 631 | +/* Populate a char_display_policy based on DC and RICHLOC. */ |
| 632 | + |
| 633 | +static char_display_policy |
| 634 | +make_policy (const diagnostic_context &dc, |
| 635 | + const rich_location &richloc) |
| 636 | +{ |
| 637 | + /* The default is to not escape non-ASCII bytes. */ |
| 638 | + char_display_policy result |
| 639 | + (dc.tabstop, cpp_wcwidth, default_print_decoded_ch); |
| 640 | + |
| 641 | + /* If the diagnostic suggests escaping non-ASCII bytes, then |
| 642 | + use policy from user-supplied options. */ |
| 643 | + if (richloc.escape_on_output_p ()) |
| 644 | + { |
| 645 | + result.m_undecoded_byte_width = width_per_escaped_byte; |
| 646 | + switch (dc.escape_format) |
| 647 | + { |
| 648 | + default: |
| 649 | + gcc_unreachable (); |
| 650 | + case DIAGNOSTICS_ESCAPE_FORMAT_UNICODE: |
| 651 | + result.m_width_cb = escape_as_unicode_width; |
| 652 | + result.m_print_cb = escape_as_unicode_print; |
| 653 | + break; |
| 654 | + case DIAGNOSTICS_ESCAPE_FORMAT_BYTES: |
| 655 | + result.m_width_cb = escape_as_bytes_width; |
| 656 | + result.m_print_cb = escape_as_bytes_print; |
| 657 | + break; |
| 658 | + } |
| 659 | + } |
| 660 | + |
| 661 | + return result; |
| 662 | +} |
| 663 | + |
| 664 | /* Implementation of class layout. */ |
| 665 | |
| 666 | /* Constructor for class layout. |
| 667 | @@ -975,8 +1186,10 @@ layout::layout (diagnostic_context * con |
| 668 | diagnostic_t diagnostic_kind) |
| 669 | : m_context (context), |
| 670 | m_pp (context->printer), |
| 671 | + m_policy (make_policy (*context, *richloc)), |
| 672 | m_primary_loc (richloc->get_range (0)->m_loc), |
| 673 | - m_exploc (richloc->get_expanded_location (0), context->tabstop), |
| 674 | + m_exploc (richloc->get_expanded_location (0), m_policy, |
| 675 | + LOCATION_ASPECT_CARET), |
| 676 | m_colorizer (context, diagnostic_kind), |
| 677 | m_colorize_source_p (context->colorize_source_p), |
| 678 | m_show_labels_p (context->show_labels_p), |
| 679 | @@ -986,7 +1199,8 @@ layout::layout (diagnostic_context * con |
| 680 | m_fixit_hints (richloc->get_num_fixit_hints ()), |
| 681 | m_line_spans (1 + richloc->get_num_locations ()), |
| 682 | m_linenum_width (0), |
| 683 | - m_x_offset_display (0) |
| 684 | + m_x_offset_display (0), |
| 685 | + m_escape_on_output (richloc->escape_on_output_p ()) |
| 686 | { |
| 687 | for (unsigned int idx = 0; idx < richloc->get_num_locations (); idx++) |
| 688 | { |
| 689 | @@ -1072,10 +1286,13 @@ layout::maybe_add_location_range (const |
| 690 | |
| 691 | /* Everything is now known to be in the correct source file, |
| 692 | but it may require further sanitization. */ |
| 693 | - layout_range ri (exploc_with_display_col (start, m_context->tabstop), |
| 694 | - exploc_with_display_col (finish, m_context->tabstop), |
| 695 | + layout_range ri (exploc_with_display_col (start, m_policy, |
| 696 | + LOCATION_ASPECT_START), |
| 697 | + exploc_with_display_col (finish, m_policy, |
| 698 | + LOCATION_ASPECT_FINISH), |
| 699 | loc_range->m_range_display_kind, |
| 700 | - exploc_with_display_col (caret, m_context->tabstop), |
| 701 | + exploc_with_display_col (caret, m_policy, |
| 702 | + LOCATION_ASPECT_CARET), |
| 703 | original_idx, loc_range->m_label); |
| 704 | |
| 705 | /* If we have a range that finishes before it starts (perhaps |
| 706 | @@ -1409,7 +1626,7 @@ layout::calculate_x_offset_display () |
| 707 | = get_line_bytes_without_trailing_whitespace (line.get_buffer (), |
| 708 | line.length ()); |
| 709 | int eol_display_column |
| 710 | - = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop); |
| 711 | + = cpp_display_width (line.get_buffer (), line_bytes, m_policy); |
| 712 | if (caret_display_column > eol_display_column |
| 713 | || !caret_display_column) |
| 714 | { |
| 715 | @@ -1488,7 +1705,7 @@ layout::print_source_line (linenum_type |
| 716 | /* This object helps to keep track of which display column we are at, which is |
| 717 | necessary for computing the line bounds in display units, for doing |
| 718 | tab expansion, and for implementing m_x_offset_display. */ |
| 719 | - cpp_display_width_computation dw (line, line_bytes, m_context->tabstop); |
| 720 | + cpp_display_width_computation dw (line, line_bytes, m_policy); |
| 721 | |
| 722 | /* Skip the first m_x_offset_display display columns. In case the leading |
| 723 | portion that will be skipped ends with a character with wcwidth > 1, then |
| 724 | @@ -1536,7 +1753,8 @@ layout::print_source_line (linenum_type |
| 725 | tabs and replacing some control bytes with spaces as necessary. */ |
| 726 | const char *c = dw.next_byte (); |
| 727 | const int start_disp_col = dw.display_cols_processed () + 1; |
| 728 | - const int this_display_width = dw.process_next_codepoint (); |
| 729 | + cpp_decoded_char cp; |
| 730 | + const int this_display_width = dw.process_next_codepoint (&cp); |
| 731 | if (*c == '\t') |
| 732 | { |
| 733 | /* The returned display width is the number of spaces into which the |
| 734 | @@ -1545,15 +1763,6 @@ layout::print_source_line (linenum_type |
| 735 | pp_space (m_pp); |
| 736 | continue; |
| 737 | } |
| 738 | - if (*c == '\0' || *c == '\r') |
| 739 | - { |
| 740 | - /* cpp_wcwidth() promises to return 1 for all control bytes, and we |
| 741 | - want to output these as a single space too, so this case is |
| 742 | - actually the same as the '\t' case. */ |
| 743 | - gcc_assert (this_display_width == 1); |
| 744 | - pp_space (m_pp); |
| 745 | - continue; |
| 746 | - } |
| 747 | |
| 748 | /* We have a (possibly multibyte) character to output; update the line |
| 749 | bounds if it is not whitespace. */ |
| 750 | @@ -1565,7 +1774,8 @@ layout::print_source_line (linenum_type |
| 751 | } |
| 752 | |
| 753 | /* Output the character. */ |
| 754 | - while (c != dw.next_byte ()) pp_character (m_pp, *c++); |
| 755 | + m_policy.m_print_cb (m_pp, cp); |
| 756 | + c = dw.next_byte (); |
| 757 | } |
| 758 | print_newline (); |
| 759 | return lbounds; |
| 760 | @@ -1664,14 +1874,14 @@ layout::print_annotation_line (linenum_t |
| 761 | class line_label |
| 762 | { |
| 763 | public: |
| 764 | - line_label (diagnostic_context *context, int state_idx, int column, |
| 765 | + line_label (const cpp_char_column_policy &policy, |
| 766 | + int state_idx, int column, |
| 767 | label_text text) |
| 768 | : m_state_idx (state_idx), m_column (column), |
| 769 | m_text (text), m_label_line (0), m_has_vbar (true) |
| 770 | { |
| 771 | const int bytes = strlen (text.m_buffer); |
| 772 | - m_display_width |
| 773 | - = cpp_display_width (text.m_buffer, bytes, context->tabstop); |
| 774 | + m_display_width = cpp_display_width (text.m_buffer, bytes, policy); |
| 775 | } |
| 776 | |
| 777 | /* Sorting is primarily by column, then by state index. */ |
| 778 | @@ -1731,7 +1941,7 @@ layout::print_any_labels (linenum_type r |
| 779 | if (text.m_buffer == NULL) |
| 780 | continue; |
| 781 | |
| 782 | - labels.safe_push (line_label (m_context, i, disp_col, text)); |
| 783 | + labels.safe_push (line_label (m_policy, i, disp_col, text)); |
| 784 | } |
| 785 | } |
| 786 | |
| 787 | @@ -2011,7 +2221,7 @@ public: |
| 788 | |
| 789 | /* Get the range of bytes or display columns that HINT would affect. */ |
| 790 | static column_range |
| 791 | -get_affected_range (diagnostic_context *context, |
| 792 | +get_affected_range (const cpp_char_column_policy &policy, |
| 793 | const fixit_hint *hint, enum column_unit col_unit) |
| 794 | { |
| 795 | expanded_location exploc_start = expand_location (hint->get_start_loc ()); |
| 796 | @@ -2022,13 +2232,11 @@ get_affected_range (diagnostic_context * |
| 797 | int finish_column; |
| 798 | if (col_unit == CU_DISPLAY_COLS) |
| 799 | { |
| 800 | - start_column |
| 801 | - = location_compute_display_column (exploc_start, context->tabstop); |
| 802 | + start_column = location_compute_display_column (exploc_start, policy); |
| 803 | if (hint->insertion_p ()) |
| 804 | finish_column = start_column - 1; |
| 805 | else |
| 806 | - finish_column |
| 807 | - = location_compute_display_column (exploc_finish, context->tabstop); |
| 808 | + finish_column = location_compute_display_column (exploc_finish, policy); |
| 809 | } |
| 810 | else |
| 811 | { |
| 812 | @@ -2041,12 +2249,13 @@ get_affected_range (diagnostic_context * |
| 813 | /* Get the range of display columns that would be printed for HINT. */ |
| 814 | |
| 815 | static column_range |
| 816 | -get_printed_columns (diagnostic_context *context, const fixit_hint *hint) |
| 817 | +get_printed_columns (const cpp_char_column_policy &policy, |
| 818 | + const fixit_hint *hint) |
| 819 | { |
| 820 | expanded_location exploc = expand_location (hint->get_start_loc ()); |
| 821 | - int start_column = location_compute_display_column (exploc, context->tabstop); |
| 822 | + int start_column = location_compute_display_column (exploc, policy); |
| 823 | int hint_width = cpp_display_width (hint->get_string (), hint->get_length (), |
| 824 | - context->tabstop); |
| 825 | + policy); |
| 826 | int final_hint_column = start_column + hint_width - 1; |
| 827 | if (hint->insertion_p ()) |
| 828 | { |
| 829 | @@ -2056,8 +2265,7 @@ get_printed_columns (diagnostic_context |
| 830 | { |
| 831 | exploc = expand_location (hint->get_next_loc ()); |
| 832 | --exploc.column; |
| 833 | - int finish_column |
| 834 | - = location_compute_display_column (exploc, context->tabstop); |
| 835 | + int finish_column = location_compute_display_column (exploc, policy); |
| 836 | return column_range (start_column, |
| 837 | MAX (finish_column, final_hint_column)); |
| 838 | } |
| 839 | @@ -2075,13 +2283,13 @@ public: |
| 840 | column_range affected_columns, |
| 841 | column_range printed_columns, |
| 842 | const char *new_text, size_t new_text_len, |
| 843 | - int tabstop) |
| 844 | + const cpp_char_column_policy &policy) |
| 845 | : m_affected_bytes (affected_bytes), |
| 846 | m_affected_columns (affected_columns), |
| 847 | m_printed_columns (printed_columns), |
| 848 | m_text (xstrdup (new_text)), |
| 849 | m_byte_length (new_text_len), |
| 850 | - m_tabstop (tabstop), |
| 851 | + m_policy (policy), |
| 852 | m_alloc_sz (new_text_len + 1) |
| 853 | { |
| 854 | compute_display_cols (); |
| 855 | @@ -2099,7 +2307,7 @@ public: |
| 856 | |
| 857 | void compute_display_cols () |
| 858 | { |
| 859 | - m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop); |
| 860 | + m_display_cols = cpp_display_width (m_text, m_byte_length, m_policy); |
| 861 | } |
| 862 | |
| 863 | void overwrite (int dst_offset, const char_span &src_span) |
| 864 | @@ -2127,7 +2335,7 @@ public: |
| 865 | char *m_text; |
| 866 | size_t m_byte_length; /* Not including null-terminator. */ |
| 867 | int m_display_cols; |
| 868 | - int m_tabstop; |
| 869 | + const cpp_char_column_policy &m_policy; |
| 870 | size_t m_alloc_sz; |
| 871 | }; |
| 872 | |
| 873 | @@ -2163,15 +2371,16 @@ correction::ensure_terminated () |
| 874 | class line_corrections |
| 875 | { |
| 876 | public: |
| 877 | - line_corrections (diagnostic_context *context, const char *filename, |
| 878 | + line_corrections (const char_display_policy &policy, |
| 879 | + const char *filename, |
| 880 | linenum_type row) |
| 881 | - : m_context (context), m_filename (filename), m_row (row) |
| 882 | + : m_policy (policy), m_filename (filename), m_row (row) |
| 883 | {} |
| 884 | ~line_corrections (); |
| 885 | |
| 886 | void add_hint (const fixit_hint *hint); |
| 887 | |
| 888 | - diagnostic_context *m_context; |
| 889 | + const char_display_policy &m_policy; |
| 890 | const char *m_filename; |
| 891 | linenum_type m_row; |
| 892 | auto_vec <correction *> m_corrections; |
| 893 | @@ -2217,10 +2426,10 @@ source_line::source_line (const char *fi |
| 894 | void |
| 895 | line_corrections::add_hint (const fixit_hint *hint) |
| 896 | { |
| 897 | - column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES); |
| 898 | - column_range affected_columns = get_affected_range (m_context, hint, |
| 899 | + column_range affected_bytes = get_affected_range (m_policy, hint, CU_BYTES); |
| 900 | + column_range affected_columns = get_affected_range (m_policy, hint, |
| 901 | CU_DISPLAY_COLS); |
| 902 | - column_range printed_columns = get_printed_columns (m_context, hint); |
| 903 | + column_range printed_columns = get_printed_columns (m_policy, hint); |
| 904 | |
| 905 | /* Potentially consolidate. */ |
| 906 | if (!m_corrections.is_empty ()) |
| 907 | @@ -2289,7 +2498,7 @@ line_corrections::add_hint (const fixit_ |
| 908 | printed_columns, |
| 909 | hint->get_string (), |
| 910 | hint->get_length (), |
| 911 | - m_context->tabstop)); |
| 912 | + m_policy)); |
| 913 | } |
| 914 | |
| 915 | /* If there are any fixit hints on source line ROW, print them. |
| 916 | @@ -2303,7 +2512,7 @@ layout::print_trailing_fixits (linenum_t |
| 917 | { |
| 918 | /* Build a list of correction instances for the line, |
| 919 | potentially consolidating hints (for the sake of readability). */ |
| 920 | - line_corrections corrections (m_context, m_exploc.file, row); |
| 921 | + line_corrections corrections (m_policy, m_exploc.file, row); |
| 922 | for (unsigned int i = 0; i < m_fixit_hints.length (); i++) |
| 923 | { |
| 924 | const fixit_hint *hint = m_fixit_hints[i]; |
| 925 | @@ -2646,6 +2855,59 @@ namespace selftest { |
| 926 | |
| 927 | /* Selftests for diagnostic_show_locus. */ |
| 928 | |
| 929 | +/* Verify that cpp_display_width correctly handles escaping. */ |
| 930 | + |
| 931 | +static void |
| 932 | +test_display_widths () |
| 933 | +{ |
| 934 | + gcc_rich_location richloc (UNKNOWN_LOCATION); |
| 935 | + |
| 936 | + /* U+03C0 "GREEK SMALL LETTER PI". */ |
| 937 | + const char *pi = "\xCF\x80"; |
| 938 | + /* U+1F642 "SLIGHTLY SMILING FACE". */ |
| 939 | + const char *emoji = "\xF0\x9F\x99\x82"; |
| 940 | + /* Stray trailing byte of a UTF-8 character. */ |
| 941 | + const char *stray = "\xBF"; |
| 942 | + /* U+10FFFF. */ |
| 943 | + const char *max_codepoint = "\xF4\x8F\xBF\xBF"; |
| 944 | + |
| 945 | + /* No escaping. */ |
| 946 | + { |
| 947 | + test_diagnostic_context dc; |
| 948 | + char_display_policy policy (make_policy (dc, richloc)); |
| 949 | + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 1); |
| 950 | + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 2); |
| 951 | + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 1); |
| 952 | + /* Don't check width of U+10FFFF; it's in a private use plane. */ |
| 953 | + } |
| 954 | + |
| 955 | + richloc.set_escape_on_output (true); |
| 956 | + |
| 957 | + { |
| 958 | + test_diagnostic_context dc; |
| 959 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| 960 | + char_display_policy policy (make_policy (dc, richloc)); |
| 961 | + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); |
| 962 | + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 9); |
| 963 | + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); |
| 964 | + ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), |
| 965 | + policy), |
| 966 | + strlen ("<U+10FFFF>")); |
| 967 | + } |
| 968 | + |
| 969 | + { |
| 970 | + test_diagnostic_context dc; |
| 971 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| 972 | + char_display_policy policy (make_policy (dc, richloc)); |
| 973 | + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); |
| 974 | + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 16); |
| 975 | + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); |
| 976 | + ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), |
| 977 | + policy), |
| 978 | + 16); |
| 979 | + } |
| 980 | +} |
| 981 | + |
| 982 | /* For precise tests of the layout, make clear where the source line will |
| 983 | start. test_left_margin sets the total byte count from the left side of the |
| 984 | screen to the start of source lines, after the line number and the separator, |
| 985 | @@ -2715,10 +2977,10 @@ test_layout_x_offset_display_utf8 (const |
| 986 | char_span lspan = location_get_source_line (tmp.get_filename (), 1); |
| 987 | ASSERT_EQ (line_display_cols, |
| 988 | cpp_display_width (lspan.get_buffer (), lspan.length (), |
| 989 | - def_tabstop)); |
| 990 | + def_policy ())); |
| 991 | ASSERT_EQ (line_display_cols, |
| 992 | location_compute_display_column (expand_location (line_end), |
| 993 | - def_tabstop)); |
| 994 | + def_policy ())); |
| 995 | ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1), |
| 996 | "\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8)); |
| 997 | |
| 998 | @@ -2866,12 +3128,13 @@ test_layout_x_offset_display_tab (const |
| 999 | ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1))); |
| 1000 | for (int tabstop = 1; tabstop != num_tabstops; ++tabstop) |
| 1001 | { |
| 1002 | + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| 1003 | ASSERT_EQ (line_bytes + extra_width[tabstop], |
| 1004 | cpp_display_width (lspan.get_buffer (), lspan.length (), |
| 1005 | - tabstop)); |
| 1006 | + policy)); |
| 1007 | ASSERT_EQ (line_bytes + extra_width[tabstop], |
| 1008 | location_compute_display_column (expand_location (line_end), |
| 1009 | - tabstop)); |
| 1010 | + policy)); |
| 1011 | } |
| 1012 | |
| 1013 | /* Check that the tab is expanded to the expected number of spaces. */ |
| 1014 | @@ -4003,6 +4266,43 @@ test_one_liner_labels_utf8 () |
| 1015 | " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n", |
| 1016 | pp_formatted_text (dc.printer)); |
| 1017 | } |
| 1018 | + |
| 1019 | + /* Example of escaping the source lines. */ |
| 1020 | + { |
| 1021 | + text_range_label label0 ("label 0\xf0\x9f\x98\x82"); |
| 1022 | + text_range_label label1 ("label 1\xcf\x80"); |
| 1023 | + text_range_label label2 ("label 2\xcf\x80"); |
| 1024 | + gcc_rich_location richloc (foo, &label0); |
| 1025 | + richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); |
| 1026 | + richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); |
| 1027 | + richloc.set_escape_on_output (true); |
| 1028 | + |
| 1029 | + { |
| 1030 | + test_diagnostic_context dc; |
| 1031 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| 1032 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1033 | + ASSERT_STREQ (" <U+1F602>_foo = <U+03C0>_bar.<U+1F602>_field<U+03C0>;\n" |
| 1034 | + " ^~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~\n" |
| 1035 | + " | | |\n" |
| 1036 | + " | | label 2\xcf\x80\n" |
| 1037 | + " | label 1\xcf\x80\n" |
| 1038 | + " label 0\xf0\x9f\x98\x82\n", |
| 1039 | + pp_formatted_text (dc.printer)); |
| 1040 | + } |
| 1041 | + { |
| 1042 | + test_diagnostic_context dc; |
| 1043 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| 1044 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1045 | + ASSERT_STREQ |
| 1046 | + (" <f0><9f><98><82>_foo = <cf><80>_bar.<f0><9f><98><82>_field<cf><80>;\n" |
| 1047 | + " ^~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" |
| 1048 | + " | | |\n" |
| 1049 | + " | | label 2\xcf\x80\n" |
| 1050 | + " | label 1\xcf\x80\n" |
| 1051 | + " label 0\xf0\x9f\x98\x82\n", |
| 1052 | + pp_formatted_text (dc.printer)); |
| 1053 | + } |
| 1054 | + } |
| 1055 | } |
| 1056 | |
| 1057 | /* Make sure that colorization codes don't interrupt a multibyte |
| 1058 | @@ -4057,9 +4357,9 @@ test_diagnostic_show_locus_one_liner_utf |
| 1059 | |
| 1060 | char_span lspan = location_get_source_line (tmp.get_filename (), 1); |
| 1061 | ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (), |
| 1062 | - def_tabstop)); |
| 1063 | + def_policy ())); |
| 1064 | ASSERT_EQ (25, location_compute_display_column (expand_location (line_end), |
| 1065 | - def_tabstop)); |
| 1066 | + def_policy ())); |
| 1067 | |
| 1068 | test_one_liner_simple_caret_utf8 (); |
| 1069 | test_one_liner_caret_and_range_utf8 (); |
| 1070 | @@ -4445,30 +4745,31 @@ test_overlapped_fixit_printing (const li |
| 1071 | pp_formatted_text (dc.printer)); |
| 1072 | |
| 1073 | /* Unit-test the line_corrections machinery. */ |
| 1074 | + char_display_policy policy (make_policy (dc, richloc)); |
| 1075 | ASSERT_EQ (3, richloc.get_num_fixit_hints ()); |
| 1076 | const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| 1077 | ASSERT_EQ (column_range (12, 12), |
| 1078 | - get_affected_range (&dc, hint_0, CU_BYTES)); |
| 1079 | + get_affected_range (policy, hint_0, CU_BYTES)); |
| 1080 | ASSERT_EQ (column_range (12, 12), |
| 1081 | - get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); |
| 1082 | - ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); |
| 1083 | + get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); |
| 1084 | + ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); |
| 1085 | const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| 1086 | ASSERT_EQ (column_range (18, 18), |
| 1087 | - get_affected_range (&dc, hint_1, CU_BYTES)); |
| 1088 | + get_affected_range (policy, hint_1, CU_BYTES)); |
| 1089 | ASSERT_EQ (column_range (18, 18), |
| 1090 | - get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); |
| 1091 | - ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); |
| 1092 | + get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); |
| 1093 | + ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); |
| 1094 | const fixit_hint *hint_2 = richloc.get_fixit_hint (2); |
| 1095 | ASSERT_EQ (column_range (29, 28), |
| 1096 | - get_affected_range (&dc, hint_2, CU_BYTES)); |
| 1097 | + get_affected_range (policy, hint_2, CU_BYTES)); |
| 1098 | ASSERT_EQ (column_range (29, 28), |
| 1099 | - get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); |
| 1100 | - ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2)); |
| 1101 | + get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); |
| 1102 | + ASSERT_EQ (column_range (29, 29), get_printed_columns (policy, hint_2)); |
| 1103 | |
| 1104 | /* Add each hint in turn to a line_corrections instance, |
| 1105 | and verify that they are consolidated into one correction instance |
| 1106 | as expected. */ |
| 1107 | - line_corrections lc (&dc, tmp.get_filename (), 1); |
| 1108 | + line_corrections lc (policy, tmp.get_filename (), 1); |
| 1109 | |
| 1110 | /* The first replace hint by itself. */ |
| 1111 | lc.add_hint (hint_0); |
| 1112 | @@ -4660,30 +4961,31 @@ test_overlapped_fixit_printing_utf8 (con |
| 1113 | pp_formatted_text (dc.printer)); |
| 1114 | |
| 1115 | /* Unit-test the line_corrections machinery. */ |
| 1116 | + char_display_policy policy (make_policy (dc, richloc)); |
| 1117 | ASSERT_EQ (3, richloc.get_num_fixit_hints ()); |
| 1118 | const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| 1119 | ASSERT_EQ (column_range (14, 14), |
| 1120 | - get_affected_range (&dc, hint_0, CU_BYTES)); |
| 1121 | + get_affected_range (policy, hint_0, CU_BYTES)); |
| 1122 | ASSERT_EQ (column_range (12, 12), |
| 1123 | - get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); |
| 1124 | - ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); |
| 1125 | + get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); |
| 1126 | + ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); |
| 1127 | const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| 1128 | ASSERT_EQ (column_range (22, 22), |
| 1129 | - get_affected_range (&dc, hint_1, CU_BYTES)); |
| 1130 | + get_affected_range (policy, hint_1, CU_BYTES)); |
| 1131 | ASSERT_EQ (column_range (18, 18), |
| 1132 | - get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); |
| 1133 | - ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); |
| 1134 | + get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); |
| 1135 | + ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); |
| 1136 | const fixit_hint *hint_2 = richloc.get_fixit_hint (2); |
| 1137 | ASSERT_EQ (column_range (35, 34), |
| 1138 | - get_affected_range (&dc, hint_2, CU_BYTES)); |
| 1139 | + get_affected_range (policy, hint_2, CU_BYTES)); |
| 1140 | ASSERT_EQ (column_range (30, 29), |
| 1141 | - get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); |
| 1142 | - ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2)); |
| 1143 | + get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); |
| 1144 | + ASSERT_EQ (column_range (30, 30), get_printed_columns (policy, hint_2)); |
| 1145 | |
| 1146 | /* Add each hint in turn to a line_corrections instance, |
| 1147 | and verify that they are consolidated into one correction instance |
| 1148 | as expected. */ |
| 1149 | - line_corrections lc (&dc, tmp.get_filename (), 1); |
| 1150 | + line_corrections lc (policy, tmp.get_filename (), 1); |
| 1151 | |
| 1152 | /* The first replace hint by itself. */ |
| 1153 | lc.add_hint (hint_0); |
| 1154 | @@ -4877,15 +5179,16 @@ test_overlapped_fixit_printing_2 (const |
| 1155 | richloc.add_fixit_insert_before (col_21, "}"); |
| 1156 | |
| 1157 | /* These fixits should be accepted; they can't be consolidated. */ |
| 1158 | + char_display_policy policy (make_policy (dc, richloc)); |
| 1159 | ASSERT_EQ (2, richloc.get_num_fixit_hints ()); |
| 1160 | const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| 1161 | ASSERT_EQ (column_range (23, 22), |
| 1162 | - get_affected_range (&dc, hint_0, CU_BYTES)); |
| 1163 | - ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0)); |
| 1164 | + get_affected_range (policy, hint_0, CU_BYTES)); |
| 1165 | + ASSERT_EQ (column_range (23, 23), get_printed_columns (policy, hint_0)); |
| 1166 | const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| 1167 | ASSERT_EQ (column_range (21, 20), |
| 1168 | - get_affected_range (&dc, hint_1, CU_BYTES)); |
| 1169 | - ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1)); |
| 1170 | + get_affected_range (policy, hint_1, CU_BYTES)); |
| 1171 | + ASSERT_EQ (column_range (21, 21), get_printed_columns (policy, hint_1)); |
| 1172 | |
| 1173 | /* Verify that they're printed correctly. */ |
| 1174 | diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1175 | @@ -5152,10 +5455,11 @@ test_tab_expansion (const line_table_cas |
| 1176 | ....................123 45678901234 56789012345 columns */ |
| 1177 | |
| 1178 | const int tabstop = 8; |
| 1179 | + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| 1180 | const int first_non_ws_byte_col = 7; |
| 1181 | const int right_quote_byte_col = 15; |
| 1182 | const int last_byte_col = 25; |
| 1183 | - ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop)); |
| 1184 | + ASSERT_EQ (35, cpp_display_width (content, last_byte_col, policy)); |
| 1185 | |
| 1186 | temp_source_file tmp (SELFTEST_LOCATION, ".c", content); |
| 1187 | line_table_test ltt (case_); |
| 1188 | @@ -5198,6 +5502,114 @@ test_tab_expansion (const line_table_cas |
| 1189 | } |
| 1190 | } |
| 1191 | |
| 1192 | +/* Verify that the escaping machinery can cope with a variety of different |
| 1193 | + invalid bytes. */ |
| 1194 | + |
| 1195 | +static void |
| 1196 | +test_escaping_bytes_1 (const line_table_case &case_) |
| 1197 | +{ |
| 1198 | + const char content[] = "before\0\1\2\3\r\x80\xff""after\n"; |
| 1199 | + const size_t sz = sizeof (content); |
| 1200 | + temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); |
| 1201 | + line_table_test ltt (case_); |
| 1202 | + const line_map_ordinary *ord_map = linemap_check_ordinary |
| 1203 | + (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); |
| 1204 | + linemap_line_start (line_table, 1, 100); |
| 1205 | + |
| 1206 | + location_t finish |
| 1207 | + = linemap_position_for_line_and_column (line_table, ord_map, 1, |
| 1208 | + strlen (content)); |
| 1209 | + |
| 1210 | + if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) |
| 1211 | + return; |
| 1212 | + |
| 1213 | + /* Locations of the NUL and \r bytes. */ |
| 1214 | + location_t nul_loc |
| 1215 | + = linemap_position_for_line_and_column (line_table, ord_map, 1, 7); |
| 1216 | + location_t r_loc |
| 1217 | + = linemap_position_for_line_and_column (line_table, ord_map, 1, 11); |
| 1218 | + gcc_rich_location richloc (nul_loc); |
| 1219 | + richloc.add_range (r_loc); |
| 1220 | + |
| 1221 | + { |
| 1222 | + test_diagnostic_context dc; |
| 1223 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1224 | + ASSERT_STREQ (" before \1\2\3 \x80\xff""after\n" |
| 1225 | + " ^ ~\n", |
| 1226 | + pp_formatted_text (dc.printer)); |
| 1227 | + } |
| 1228 | + richloc.set_escape_on_output (true); |
| 1229 | + { |
| 1230 | + test_diagnostic_context dc; |
| 1231 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| 1232 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1233 | + ASSERT_STREQ |
| 1234 | + (" before<U+0000><U+0001><U+0002><U+0003><U+000D><80><ff>after\n" |
| 1235 | + " ^~~~~~~~ ~~~~~~~~\n", |
| 1236 | + pp_formatted_text (dc.printer)); |
| 1237 | + } |
| 1238 | + { |
| 1239 | + test_diagnostic_context dc; |
| 1240 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| 1241 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1242 | + ASSERT_STREQ (" before<00><01><02><03><0d><80><ff>after\n" |
| 1243 | + " ^~~~ ~~~~\n", |
| 1244 | + pp_formatted_text (dc.printer)); |
| 1245 | + } |
| 1246 | +} |
| 1247 | + |
| 1248 | +/* As above, but verify that we handle the initial byte of a line |
| 1249 | + correctly. */ |
| 1250 | + |
| 1251 | +static void |
| 1252 | +test_escaping_bytes_2 (const line_table_case &case_) |
| 1253 | +{ |
| 1254 | + const char content[] = "\0after\n"; |
| 1255 | + const size_t sz = sizeof (content); |
| 1256 | + temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); |
| 1257 | + line_table_test ltt (case_); |
| 1258 | + const line_map_ordinary *ord_map = linemap_check_ordinary |
| 1259 | + (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); |
| 1260 | + linemap_line_start (line_table, 1, 100); |
| 1261 | + |
| 1262 | + location_t finish |
| 1263 | + = linemap_position_for_line_and_column (line_table, ord_map, 1, |
| 1264 | + strlen (content)); |
| 1265 | + |
| 1266 | + if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) |
| 1267 | + return; |
| 1268 | + |
| 1269 | + /* Location of the NUL byte. */ |
| 1270 | + location_t nul_loc |
| 1271 | + = linemap_position_for_line_and_column (line_table, ord_map, 1, 1); |
| 1272 | + gcc_rich_location richloc (nul_loc); |
| 1273 | + |
| 1274 | + { |
| 1275 | + test_diagnostic_context dc; |
| 1276 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1277 | + ASSERT_STREQ (" after\n" |
| 1278 | + " ^\n", |
| 1279 | + pp_formatted_text (dc.printer)); |
| 1280 | + } |
| 1281 | + richloc.set_escape_on_output (true); |
| 1282 | + { |
| 1283 | + test_diagnostic_context dc; |
| 1284 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| 1285 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1286 | + ASSERT_STREQ (" <U+0000>after\n" |
| 1287 | + " ^~~~~~~~\n", |
| 1288 | + pp_formatted_text (dc.printer)); |
| 1289 | + } |
| 1290 | + { |
| 1291 | + test_diagnostic_context dc; |
| 1292 | + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| 1293 | + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| 1294 | + ASSERT_STREQ (" <00>after\n" |
| 1295 | + " ^~~~\n", |
| 1296 | + pp_formatted_text (dc.printer)); |
| 1297 | + } |
| 1298 | +} |
| 1299 | + |
| 1300 | /* Verify that line numbers are correctly printed for the case of |
| 1301 | a multiline range in which the width of the line numbers changes |
| 1302 | (e.g. from "9" to "10"). */ |
| 1303 | @@ -5254,6 +5666,8 @@ diagnostic_show_locus_c_tests () |
| 1304 | test_layout_range_for_single_line (); |
| 1305 | test_layout_range_for_multiple_lines (); |
| 1306 | |
| 1307 | + test_display_widths (); |
| 1308 | + |
| 1309 | for_each_line_table_case (test_layout_x_offset_display_utf8); |
| 1310 | for_each_line_table_case (test_layout_x_offset_display_tab); |
| 1311 | |
| 1312 | @@ -5274,6 +5688,8 @@ diagnostic_show_locus_c_tests () |
| 1313 | for_each_line_table_case (test_fixit_replace_containing_newline); |
| 1314 | for_each_line_table_case (test_fixit_deletion_affecting_newline); |
| 1315 | for_each_line_table_case (test_tab_expansion); |
| 1316 | + for_each_line_table_case (test_escaping_bytes_1); |
| 1317 | + for_each_line_table_case (test_escaping_bytes_2); |
| 1318 | |
| 1319 | test_line_numbers_multiline_range (); |
| 1320 | } |
| 1321 | diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi |
| 1322 | --- a/gcc/doc/invoke.texi 2021-12-13 23:23:05.764437151 -0800 |
| 1323 | +++ b/gcc/doc/invoke.texi 2021-12-14 01:16:01.553943061 -0800 |
| 1324 | @@ -312,7 +312,8 @@ Objective-C and Objective-C++ Dialects}. |
| 1325 | -fdiagnostics-show-path-depths @gol |
| 1326 | -fno-show-column @gol |
| 1327 | -fdiagnostics-column-unit=@r{[}display@r{|}byte@r{]} @gol |
| 1328 | --fdiagnostics-column-origin=@var{origin}} |
| 1329 | +-fdiagnostics-column-origin=@var{origin} @gol |
| 1330 | +-fdiagnostics-escape-format=@r{[}unicode@r{|}bytes@r{]}} |
| 1331 | |
| 1332 | @item Warning Options |
| 1333 | @xref{Warning Options,,Options to Request or Suppress Warnings}. |
| 1334 | @@ -5083,6 +5084,38 @@ first column. The default value of 1 co |
| 1335 | behavior and to the GNU style guide. Some utilities may perform better with an |
| 1336 | origin of 0; any non-negative value may be specified. |
| 1337 | |
| 1338 | +@item -fdiagnostics-escape-format=@var{FORMAT} |
| 1339 | +@opindex fdiagnostics-escape-format |
| 1340 | +When GCC prints pertinent source lines for a diagnostic, it normally attempts |
| 1341 | +to print the source bytes directly. However, some diagnostics relate to encoding |
| 1342 | +issues in the source file, such as malformed UTF-8, or issues with Unicode |
| 1343 | +normalization. These diagnostics are flagged so that GCC will escape bytes |
| 1344 | +that are not printable ASCII when printing their pertinent source lines. |
| 1345 | + |
| 1346 | +This option controls how such bytes should be escaped. |
| 1347 | + |
| 1348 | +The default @var{FORMAT}, @samp{unicode}, displays Unicode characters that |
| 1349 | +are not printable ASCII in the form @samp{<U+XXXX>}, and displays bytes that |
| 1350 | +do not form part of a validly encoded UTF-8 character as hexadecimal in the |
| 1351 | +form @samp{<XX>}. |
| 1352 | + |
| 1353 | +For example, a source line containing the string @samp{before} followed by the |
| 1354 | +Unicode character U+03C0 (``GREEK SMALL LETTER PI'', with UTF-8 encoding |
| 1355 | +0xCF 0x80) followed by the byte 0xBF (a stray UTF-8 trailing byte), followed by |
| 1356 | +the string @samp{after} will be printed for such a diagnostic as: |
| 1357 | + |
| 1358 | +@smallexample |
| 1359 | + before<U+03C0><bf>after |
| 1360 | +@end smallexample |
| 1361 | + |
| 1362 | +Setting @var{FORMAT} to @samp{bytes} will display all non-printable-ASCII bytes |
| 1363 | +in the form @samp{<XX>}, thus showing the underlying encoding of non-ASCII |
| 1364 | +Unicode characters. For the example above, the following will be printed: |
| 1365 | + |
| 1366 | +@smallexample |
| 1367 | + before<cf><80><bf>after |
| 1368 | +@end smallexample |
| 1369 | + |
| 1370 | @item -fdiagnostics-format=@var{FORMAT} |
| 1371 | @opindex fdiagnostics-format |
| 1372 | Select a different format for printing diagnostics. |
| 1373 | @@ -5150,9 +5183,11 @@ might be printed in JSON form (after for |
| 1374 | @} |
| 1375 | @} |
| 1376 | ], |
| 1377 | + "escape-source": false, |
| 1378 | "message": "...this statement, but the latter is @dots{}" |
| 1379 | @} |
| 1380 | ] |
| 1381 | + "escape-source": false, |
| 1382 | "column-origin": 1, |
| 1383 | @}, |
| 1384 | @dots{} |
| 1385 | @@ -5239,6 +5274,7 @@ of the expression, which have labels. I |
| 1386 | "label": "T @{aka struct t@}" |
| 1387 | @} |
| 1388 | ], |
| 1389 | + "escape-source": false, |
| 1390 | "message": "invalid operands to binary + @dots{}" |
| 1391 | @} |
| 1392 | @end smallexample |
| 1393 | @@ -5292,6 +5328,7 @@ might be printed in JSON form as: |
| 1394 | @} |
| 1395 | @} |
| 1396 | ], |
| 1397 | + "escape-source": false, |
| 1398 | "message": "\u2018struct s\u2019 has no member named @dots{}" |
| 1399 | @} |
| 1400 | @end smallexample |
| 1401 | @@ -5349,6 +5386,10 @@ For example, the intraprocedural example |
| 1402 | ] |
| 1403 | @end smallexample |
| 1404 | |
| 1405 | +Diagnostics have a boolean attribute @code{escape-source}, hinting whether |
| 1406 | +non-ASCII bytes should be escaped when printing the pertinent lines of |
| 1407 | +source code (@code{true} for diagnostics involving source encoding issues). |
| 1408 | + |
| 1409 | @end table |
| 1410 | |
| 1411 | @node Warning Options |
| 1412 | diff --git a/gcc/input.c b/gcc/input.c |
| 1413 | --- a/gcc/input.c 2021-07-27 23:55:07.328287915 -0700 |
| 1414 | +++ b/gcc/input.c 2021-12-14 01:16:01.553943061 -0800 |
| 1415 | @@ -913,7 +913,8 @@ make_location (location_t caret, source_ |
| 1416 | source line in order to calculate the display width. If that cannot be done |
| 1417 | for any reason, then returns the byte column as a fallback. */ |
| 1418 | int |
| 1419 | -location_compute_display_column (expanded_location exploc, int tabstop) |
| 1420 | +location_compute_display_column (expanded_location exploc, |
| 1421 | + const cpp_char_column_policy &policy) |
| 1422 | { |
| 1423 | if (!(exploc.file && *exploc.file && exploc.line && exploc.column)) |
| 1424 | return exploc.column; |
| 1425 | @@ -921,7 +922,7 @@ location_compute_display_column (expande |
| 1426 | /* If line is NULL, this function returns exploc.column which is the |
| 1427 | desired fallback. */ |
| 1428 | return cpp_byte_column_to_display_column (line.get_buffer (), line.length (), |
| 1429 | - exploc.column, tabstop); |
| 1430 | + exploc.column, policy); |
| 1431 | } |
| 1432 | |
| 1433 | /* Dump statistics to stderr about the memory usage of the line_table |
| 1434 | @@ -3611,43 +3612,50 @@ test_line_offset_overflow () |
| 1435 | void test_cpp_utf8 () |
| 1436 | { |
| 1437 | const int def_tabstop = 8; |
| 1438 | + cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); |
| 1439 | + |
| 1440 | /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */ |
| 1441 | { |
| 1442 | - int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop); |
| 1443 | + int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy); |
| 1444 | ASSERT_EQ (8, w_bad); |
| 1445 | - int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop); |
| 1446 | + int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy); |
| 1447 | ASSERT_EQ (5, w_ctrl); |
| 1448 | } |
| 1449 | |
| 1450 | /* Verify that wcwidth of valid UTF-8 is as expected. */ |
| 1451 | { |
| 1452 | - const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop); |
| 1453 | + const int w_pi = cpp_display_width ("\xcf\x80", 2, policy); |
| 1454 | ASSERT_EQ (1, w_pi); |
| 1455 | - const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop); |
| 1456 | + const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy); |
| 1457 | ASSERT_EQ (2, w_emoji); |
| 1458 | const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2, |
| 1459 | - def_tabstop); |
| 1460 | + policy); |
| 1461 | ASSERT_EQ (1, w_umlaut_precomposed); |
| 1462 | const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3, |
| 1463 | - def_tabstop); |
| 1464 | + policy); |
| 1465 | ASSERT_EQ (1, w_umlaut_combining); |
| 1466 | - const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop); |
| 1467 | + const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy); |
| 1468 | ASSERT_EQ (2, w_han); |
| 1469 | - const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop); |
| 1470 | + const int w_ascii = cpp_display_width ("GCC", 3, policy); |
| 1471 | ASSERT_EQ (3, w_ascii); |
| 1472 | const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82" |
| 1473 | "\x9f! \xe4\xb8\xba y\xcc\x88", |
| 1474 | - 24, def_tabstop); |
| 1475 | + 24, policy); |
| 1476 | ASSERT_EQ (18, w_mixed); |
| 1477 | } |
| 1478 | |
| 1479 | /* Verify that display width properly expands tabs. */ |
| 1480 | { |
| 1481 | const char *tstr = "\tabc\td"; |
| 1482 | - ASSERT_EQ (6, cpp_display_width (tstr, 6, 1)); |
| 1483 | - ASSERT_EQ (10, cpp_display_width (tstr, 6, 3)); |
| 1484 | - ASSERT_EQ (17, cpp_display_width (tstr, 6, 8)); |
| 1485 | - ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8)); |
| 1486 | + ASSERT_EQ (6, cpp_display_width (tstr, 6, |
| 1487 | + cpp_char_column_policy (1, cpp_wcwidth))); |
| 1488 | + ASSERT_EQ (10, cpp_display_width (tstr, 6, |
| 1489 | + cpp_char_column_policy (3, cpp_wcwidth))); |
| 1490 | + ASSERT_EQ (17, cpp_display_width (tstr, 6, |
| 1491 | + cpp_char_column_policy (8, cpp_wcwidth))); |
| 1492 | + ASSERT_EQ (1, |
| 1493 | + cpp_display_column_to_byte_column |
| 1494 | + (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth))); |
| 1495 | } |
| 1496 | |
| 1497 | /* Verify that cpp_byte_column_to_display_column can go past the end, |
| 1498 | @@ -3660,13 +3668,13 @@ void test_cpp_utf8 () |
| 1499 | /* 111122223456 |
| 1500 | Byte columns. */ |
| 1501 | |
| 1502 | - ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop)); |
| 1503 | + ASSERT_EQ (5, cpp_display_width (str, 6, policy)); |
| 1504 | ASSERT_EQ (105, |
| 1505 | - cpp_byte_column_to_display_column (str, 6, 106, def_tabstop)); |
| 1506 | + cpp_byte_column_to_display_column (str, 6, 106, policy)); |
| 1507 | ASSERT_EQ (10000, |
| 1508 | - cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop)); |
| 1509 | + cpp_byte_column_to_display_column (NULL, 0, 10000, policy)); |
| 1510 | ASSERT_EQ (0, |
| 1511 | - cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop)); |
| 1512 | + cpp_byte_column_to_display_column (NULL, 10000, 0, policy)); |
| 1513 | } |
| 1514 | |
| 1515 | /* Verify that cpp_display_column_to_byte_column can go past the end, |
| 1516 | @@ -3680,25 +3688,25 @@ void test_cpp_utf8 () |
| 1517 | /* 000000000000000000000000000000000111111 |
| 1518 | 111122223333444456666777788889999012345 |
| 1519 | Byte columns. */ |
| 1520 | - ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop)); |
| 1521 | + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy)); |
| 1522 | ASSERT_EQ (15, |
| 1523 | - cpp_display_column_to_byte_column (str, 15, 11, def_tabstop)); |
| 1524 | + cpp_display_column_to_byte_column (str, 15, 11, policy)); |
| 1525 | ASSERT_EQ (115, |
| 1526 | - cpp_display_column_to_byte_column (str, 15, 111, def_tabstop)); |
| 1527 | + cpp_display_column_to_byte_column (str, 15, 111, policy)); |
| 1528 | ASSERT_EQ (10000, |
| 1529 | - cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop)); |
| 1530 | + cpp_display_column_to_byte_column (NULL, 0, 10000, policy)); |
| 1531 | ASSERT_EQ (0, |
| 1532 | - cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop)); |
| 1533 | + cpp_display_column_to_byte_column (NULL, 10000, 0, policy)); |
| 1534 | |
| 1535 | /* Verify that we do not interrupt a UTF-8 sequence. */ |
| 1536 | - ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop)); |
| 1537 | + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy)); |
| 1538 | |
| 1539 | for (int byte_col = 1; byte_col <= 15; ++byte_col) |
| 1540 | { |
| 1541 | const int disp_col |
| 1542 | - = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop); |
| 1543 | + = cpp_byte_column_to_display_column (str, 15, byte_col, policy); |
| 1544 | const int byte_col2 |
| 1545 | - = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop); |
| 1546 | + = cpp_display_column_to_byte_column (str, 15, disp_col, policy); |
| 1547 | |
| 1548 | /* If we ask for the display column in the middle of a UTF-8 |
| 1549 | sequence, it will return the length of the partial sequence, |
| 1550 | diff --git a/gcc/input.h b/gcc/input.h |
| 1551 | --- a/gcc/input.h 2021-07-27 23:55:07.328287915 -0700 |
| 1552 | +++ b/gcc/input.h 2021-12-14 01:16:01.553943061 -0800 |
| 1553 | @@ -39,8 +39,11 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER |
| 1554 | extern bool is_location_from_builtin_token (location_t); |
| 1555 | extern expanded_location expand_location (location_t); |
| 1556 | |
| 1557 | -extern int location_compute_display_column (expanded_location exploc, |
| 1558 | - int tabstop); |
| 1559 | +class cpp_char_column_policy; |
| 1560 | + |
| 1561 | +extern int |
| 1562 | +location_compute_display_column (expanded_location exploc, |
| 1563 | + const cpp_char_column_policy &policy); |
| 1564 | |
| 1565 | /* A class capturing the bounds of a buffer, to allow for run-time |
| 1566 | bounds-checking in a checked build. */ |
| 1567 | diff --git a/gcc/opts.c b/gcc/opts.c |
| 1568 | --- a/gcc/opts.c 2021-07-27 23:55:07.364288417 -0700 |
| 1569 | +++ b/gcc/opts.c 2021-12-14 01:16:01.553943061 -0800 |
| 1570 | @@ -2573,6 +2573,10 @@ common_handle_option (struct gcc_options |
| 1571 | dc->column_origin = value; |
| 1572 | break; |
| 1573 | |
| 1574 | + case OPT_fdiagnostics_escape_format_: |
| 1575 | + dc->escape_format = (enum diagnostics_escape_format)value; |
| 1576 | + break; |
| 1577 | + |
| 1578 | case OPT_fdiagnostics_show_cwe: |
| 1579 | dc->show_cwe = value; |
| 1580 | break; |
| 1581 | diff --git a/gcc/selftest.c b/gcc/selftest.c |
| 1582 | --- a/gcc/selftest.c 2021-07-27 23:55:07.500290315 -0700 |
| 1583 | +++ b/gcc/selftest.c 2021-12-14 01:16:01.557942991 -0800 |
| 1584 | @@ -193,6 +193,21 @@ temp_source_file::temp_source_file (cons |
| 1585 | fclose (out); |
| 1586 | } |
| 1587 | |
| 1588 | +/* As above, but with a size, to allow for NUL bytes in CONTENT. */ |
| 1589 | + |
| 1590 | +temp_source_file::temp_source_file (const location &loc, |
| 1591 | + const char *suffix, |
| 1592 | + const char *content, |
| 1593 | + size_t sz) |
| 1594 | +: named_temp_file (suffix) |
| 1595 | +{ |
| 1596 | + FILE *out = fopen (get_filename (), "w"); |
| 1597 | + if (!out) |
| 1598 | + fail_formatted (loc, "unable to open tempfile: %s", get_filename ()); |
| 1599 | + fwrite (content, sz, 1, out); |
| 1600 | + fclose (out); |
| 1601 | +} |
| 1602 | + |
| 1603 | /* Avoid introducing locale-specific differences in the results |
| 1604 | by hardcoding open_quote and close_quote. */ |
| 1605 | |
| 1606 | diff --git a/gcc/selftest.h b/gcc/selftest.h |
| 1607 | --- a/gcc/selftest.h 2021-07-27 23:55:07.500290315 -0700 |
| 1608 | +++ b/gcc/selftest.h 2021-12-14 01:16:01.557942991 -0800 |
| 1609 | @@ -112,6 +112,8 @@ class temp_source_file : public named_te |
| 1610 | public: |
| 1611 | temp_source_file (const location &loc, const char *suffix, |
| 1612 | const char *content); |
| 1613 | + temp_source_file (const location &loc, const char *suffix, |
| 1614 | + const char *content, size_t sz); |
| 1615 | }; |
| 1616 | |
| 1617 | /* RAII-style class for avoiding introducing locale-specific differences |
| 1618 | diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c |
| 1619 | --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-07-27 23:55:07.596291654 -0700 |
| 1620 | +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-14 01:16:01.557942991 -0800 |
| 1621 | @@ -9,6 +9,7 @@ |
| 1622 | |
| 1623 | /* { dg-regexp "\"kind\": \"error\"" } */ |
| 1624 | /* { dg-regexp "\"column-origin\": 1" } */ |
| 1625 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1626 | /* { dg-regexp "\"message\": \"#error message\"" } */ |
| 1627 | |
| 1628 | /* { dg-regexp "\"caret\": \{" } */ |
| 1629 | diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c |
| 1630 | --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-07-27 23:55:07.596291654 -0700 |
| 1631 | +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-14 01:16:01.557942991 -0800 |
| 1632 | @@ -9,6 +9,7 @@ |
| 1633 | |
| 1634 | /* { dg-regexp "\"kind\": \"warning\"" } */ |
| 1635 | /* { dg-regexp "\"column-origin\": 1" } */ |
| 1636 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1637 | /* { dg-regexp "\"message\": \"#warning message\"" } */ |
| 1638 | /* { dg-regexp "\"option\": \"-Wcpp\"" } */ |
| 1639 | /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ |
| 1640 | diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c |
| 1641 | --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-07-27 23:55:07.596291654 -0700 |
| 1642 | +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-14 01:16:01.557942991 -0800 |
| 1643 | @@ -9,6 +9,7 @@ |
| 1644 | |
| 1645 | /* { dg-regexp "\"kind\": \"error\"" } */ |
| 1646 | /* { dg-regexp "\"column-origin\": 1" } */ |
| 1647 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1648 | /* { dg-regexp "\"message\": \"#warning message\"" } */ |
| 1649 | /* { dg-regexp "\"option\": \"-Werror=cpp\"" } */ |
| 1650 | /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ |
| 1651 | diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c |
| 1652 | --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-07-27 23:55:07.596291654 -0700 |
| 1653 | +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-14 01:16:01.557942991 -0800 |
| 1654 | @@ -19,6 +19,7 @@ int test (void) |
| 1655 | |
| 1656 | /* { dg-regexp "\"kind\": \"note\"" } */ |
| 1657 | /* { dg-regexp "\"message\": \"...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'\"" } */ |
| 1658 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1659 | |
| 1660 | /* { dg-regexp "\"caret\": \{" } */ |
| 1661 | /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */ |
| 1662 | @@ -39,6 +40,7 @@ int test (void) |
| 1663 | /* { dg-regexp "\"kind\": \"warning\"" } */ |
| 1664 | /* { dg-regexp "\"column-origin\": 1" } */ |
| 1665 | /* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */ |
| 1666 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1667 | /* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */ |
| 1668 | /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */ |
| 1669 | |
| 1670 | diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c |
| 1671 | --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-07-27 23:55:07.596291654 -0700 |
| 1672 | +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-14 01:16:01.557942991 -0800 |
| 1673 | @@ -14,6 +14,7 @@ int test (struct s *ptr) |
| 1674 | |
| 1675 | /* { dg-regexp "\"kind\": \"error\"" } */ |
| 1676 | /* { dg-regexp "\"column-origin\": 1" } */ |
| 1677 | +/* { dg-regexp "\"escape-source\": false" } */ |
| 1678 | /* { dg-regexp "\"message\": \".*\"" } */ |
| 1679 | |
| 1680 | /* Verify fix-it hints. */ |
| 1681 | diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c |
| 1682 | --- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1969-12-31 16:00:00.000000000 -0800 |
| 1683 | +++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 2021-12-14 01:16:01.557942991 -0800 |
| 1684 | @@ -0,0 +1,21 @@ |
| 1685 | +// { dg-do preprocess } |
| 1686 | +// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=bytes" } |
| 1687 | +/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ |
| 1688 | + |
| 1689 | +/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. |
| 1690 | + U+0F42 TIBETAN LETTER GA: ག |
| 1691 | + U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ |
| 1692 | + |
| 1693 | + The UTF-8 encoding of U+0F43 TIBETAN LETTER GHA is: E0 BD 83. */ |
| 1694 | + |
| 1695 | +foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| 1696 | +/* { dg-begin-multiline-output "" } |
| 1697 | + foo before_\u0F43_after bar |
| 1698 | + ^~~~~~~~~~~~~~~~~~~ |
| 1699 | + { dg-end-multiline-output "" } */ |
| 1700 | + |
| 1701 | +foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| 1702 | +/* { dg-begin-multiline-output "" } |
| 1703 | + foo before_<e0><bd><83>_after bar |
| 1704 | + ^~~~~~~~~~~~~~~~~~~~~~~~~ |
| 1705 | + { dg-end-multiline-output "" } */ |
| 1706 | diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c |
| 1707 | --- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1969-12-31 16:00:00.000000000 -0800 |
| 1708 | +++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 2021-12-14 01:16:01.557942991 -0800 |
| 1709 | @@ -0,0 +1,19 @@ |
| 1710 | +// { dg-do preprocess } |
| 1711 | +// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=unicode" } |
| 1712 | +/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ |
| 1713 | + |
| 1714 | +/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. |
| 1715 | + U+0F42 TIBETAN LETTER GA: ག |
| 1716 | + U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ */ |
| 1717 | + |
| 1718 | +foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| 1719 | +/* { dg-begin-multiline-output "" } |
| 1720 | + foo before_\u0F43_after bar |
| 1721 | + ^~~~~~~~~~~~~~~~~~~ |
| 1722 | + { dg-end-multiline-output "" } */ |
| 1723 | + |
| 1724 | +foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| 1725 | +/* { dg-begin-multiline-output "" } |
| 1726 | + foo before_<U+0F43>_after bar |
| 1727 | + ^~~~~~~~~~~~~~~~~~~~~ |
| 1728 | + { dg-end-multiline-output "" } */ |
| 1729 | diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 |
| 1730 | --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-07-27 23:55:08.472303878 -0700 |
| 1731 | +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-14 01:16:01.557942991 -0800 |
| 1732 | @@ -9,6 +9,7 @@ |
| 1733 | |
| 1734 | ! { dg-regexp "\"kind\": \"error\"" } |
| 1735 | ! { dg-regexp "\"column-origin\": 1" } |
| 1736 | +! { dg-regexp "\"escape-source\": false" } |
| 1737 | ! { dg-regexp "\"message\": \"#error message\"" } |
| 1738 | |
| 1739 | ! { dg-regexp "\"caret\": \{" } |
| 1740 | diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 |
| 1741 | --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-07-27 23:55:08.472303878 -0700 |
| 1742 | +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-14 01:16:01.557942991 -0800 |
| 1743 | @@ -9,6 +9,7 @@ |
| 1744 | |
| 1745 | ! { dg-regexp "\"kind\": \"warning\"" } |
| 1746 | ! { dg-regexp "\"column-origin\": 1" } |
| 1747 | +! { dg-regexp "\"escape-source\": false" } |
| 1748 | ! { dg-regexp "\"message\": \"#warning message\"" } |
| 1749 | ! { dg-regexp "\"option\": \"-Wcpp\"" } |
| 1750 | ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } |
| 1751 | diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 |
| 1752 | --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-07-27 23:55:08.472303878 -0700 |
| 1753 | +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-14 01:16:01.557942991 -0800 |
| 1754 | @@ -9,6 +9,7 @@ |
| 1755 | |
| 1756 | ! { dg-regexp "\"kind\": \"error\"" } |
| 1757 | ! { dg-regexp "\"column-origin\": 1" } |
| 1758 | +! { dg-regexp "\"escape-source\": false" } |
| 1759 | ! { dg-regexp "\"message\": \"#warning message\"" } |
| 1760 | ! { dg-regexp "\"option\": \"-Werror=cpp\"" } |
| 1761 | ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } |
| 1762 | diff --git a/libcpp/charset.c b/libcpp/charset.c |
| 1763 | --- a/libcpp/charset.c 2021-07-27 23:55:08.712307227 -0700 |
| 1764 | +++ b/libcpp/charset.c 2021-12-14 01:16:01.557942991 -0800 |
| 1765 | @@ -1552,12 +1552,14 @@ convert_escape (cpp_reader *pfile, const |
| 1766 | "unknown escape sequence: '\\%c'", (int) c); |
| 1767 | else |
| 1768 | { |
| 1769 | + encoding_rich_location rich_loc (pfile); |
| 1770 | + |
| 1771 | /* diagnostic.c does not support "%03o". When it does, this |
| 1772 | code can use %03o directly in the diagnostic again. */ |
| 1773 | char buf[32]; |
| 1774 | sprintf(buf, "%03o", (int) c); |
| 1775 | - cpp_error (pfile, CPP_DL_PEDWARN, |
| 1776 | - "unknown escape sequence: '\\%s'", buf); |
| 1777 | + cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc, |
| 1778 | + "unknown escape sequence: '\\%s'", buf); |
| 1779 | } |
| 1780 | } |
| 1781 | |
| 1782 | @@ -2280,14 +2282,16 @@ cpp_string_location_reader::get_next () |
| 1783 | } |
| 1784 | |
| 1785 | cpp_display_width_computation:: |
| 1786 | -cpp_display_width_computation (const char *data, int data_length, int tabstop) : |
| 1787 | +cpp_display_width_computation (const char *data, int data_length, |
| 1788 | + const cpp_char_column_policy &policy) : |
| 1789 | m_begin (data), |
| 1790 | m_next (m_begin), |
| 1791 | m_bytes_left (data_length), |
| 1792 | - m_tabstop (tabstop), |
| 1793 | + m_policy (policy), |
| 1794 | m_display_cols (0) |
| 1795 | { |
| 1796 | - gcc_assert (m_tabstop > 0); |
| 1797 | + gcc_assert (policy.m_tabstop > 0); |
| 1798 | + gcc_assert (policy.m_width_cb); |
| 1799 | } |
| 1800 | |
| 1801 | |
| 1802 | @@ -2299,19 +2303,28 @@ cpp_display_width_computation (const cha |
| 1803 | point to a valid UTF-8-encoded sequence, then it will be treated as a single |
| 1804 | byte with display width 1. m_cur_display_col is the current display column, |
| 1805 | relative to which tab stops should be expanded. Returns the display width of |
| 1806 | - the codepoint just processed. */ |
| 1807 | + the codepoint just processed. |
| 1808 | + If OUT is non-NULL, it is populated. */ |
| 1809 | |
| 1810 | int |
| 1811 | -cpp_display_width_computation::process_next_codepoint () |
| 1812 | +cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out) |
| 1813 | { |
| 1814 | cppchar_t c; |
| 1815 | int next_width; |
| 1816 | |
| 1817 | + if (out) |
| 1818 | + out->m_start_byte = m_next; |
| 1819 | + |
| 1820 | if (*m_next == '\t') |
| 1821 | { |
| 1822 | ++m_next; |
| 1823 | --m_bytes_left; |
| 1824 | - next_width = m_tabstop - (m_display_cols % m_tabstop); |
| 1825 | + next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop); |
| 1826 | + if (out) |
| 1827 | + { |
| 1828 | + out->m_ch = '\t'; |
| 1829 | + out->m_valid_ch = true; |
| 1830 | + } |
| 1831 | } |
| 1832 | else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c) |
| 1833 | != 0) |
| 1834 | @@ -2321,14 +2334,24 @@ cpp_display_width_computation::process_n |
| 1835 | of one. */ |
| 1836 | ++m_next; |
| 1837 | --m_bytes_left; |
| 1838 | - next_width = 1; |
| 1839 | + next_width = m_policy.m_undecoded_byte_width; |
| 1840 | + if (out) |
| 1841 | + out->m_valid_ch = false; |
| 1842 | } |
| 1843 | else |
| 1844 | { |
| 1845 | /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */ |
| 1846 | - next_width = cpp_wcwidth (c); |
| 1847 | + next_width = m_policy.m_width_cb (c); |
| 1848 | + if (out) |
| 1849 | + { |
| 1850 | + out->m_ch = c; |
| 1851 | + out->m_valid_ch = true; |
| 1852 | + } |
| 1853 | } |
| 1854 | |
| 1855 | + if (out) |
| 1856 | + out->m_next_byte = m_next; |
| 1857 | + |
| 1858 | m_display_cols += next_width; |
| 1859 | return next_width; |
| 1860 | } |
| 1861 | @@ -2344,7 +2367,7 @@ cpp_display_width_computation::advance_d |
| 1862 | const int start = m_display_cols; |
| 1863 | const int target = start + n; |
| 1864 | while (m_display_cols < target && !done ()) |
| 1865 | - process_next_codepoint (); |
| 1866 | + process_next_codepoint (NULL); |
| 1867 | return m_display_cols - start; |
| 1868 | } |
| 1869 | |
| 1870 | @@ -2352,29 +2375,33 @@ cpp_display_width_computation::advance_d |
| 1871 | how many display columns are occupied by the first COLUMN bytes. COLUMN |
| 1872 | may exceed DATA_LENGTH, in which case the phantom bytes at the end are |
| 1873 | treated as if they have display width 1. Tabs are expanded to the next tab |
| 1874 | - stop, relative to the start of DATA. */ |
| 1875 | + stop, relative to the start of DATA, and non-printable-ASCII characters |
| 1876 | + will be escaped as per POLICY. */ |
| 1877 | |
| 1878 | int |
| 1879 | cpp_byte_column_to_display_column (const char *data, int data_length, |
| 1880 | - int column, int tabstop) |
| 1881 | + int column, |
| 1882 | + const cpp_char_column_policy &policy) |
| 1883 | { |
| 1884 | const int offset = MAX (0, column - data_length); |
| 1885 | - cpp_display_width_computation dw (data, column - offset, tabstop); |
| 1886 | + cpp_display_width_computation dw (data, column - offset, policy); |
| 1887 | while (!dw.done ()) |
| 1888 | - dw.process_next_codepoint (); |
| 1889 | + dw.process_next_codepoint (NULL); |
| 1890 | return dw.display_cols_processed () + offset; |
| 1891 | } |
| 1892 | |
| 1893 | /* For the string of length DATA_LENGTH bytes that begins at DATA, compute |
| 1894 | the least number of bytes that will result in at least DISPLAY_COL display |
| 1895 | columns. The return value may exceed DATA_LENGTH if the entire string does |
| 1896 | - not occupy enough display columns. */ |
| 1897 | + not occupy enough display columns. Non-printable-ASCII characters |
| 1898 | + will be escaped as per POLICY. */ |
| 1899 | |
| 1900 | int |
| 1901 | cpp_display_column_to_byte_column (const char *data, int data_length, |
| 1902 | - int display_col, int tabstop) |
| 1903 | + int display_col, |
| 1904 | + const cpp_char_column_policy &policy) |
| 1905 | { |
| 1906 | - cpp_display_width_computation dw (data, data_length, tabstop); |
| 1907 | + cpp_display_width_computation dw (data, data_length, policy); |
| 1908 | const int avail_display = dw.advance_display_cols (display_col); |
| 1909 | return dw.bytes_processed () + MAX (0, display_col - avail_display); |
| 1910 | } |
| 1911 | diff --git a/libcpp/errors.c b/libcpp/errors.c |
| 1912 | --- a/libcpp/errors.c 2021-07-27 23:55:08.712307227 -0700 |
| 1913 | +++ b/libcpp/errors.c 2021-12-14 01:16:01.557942991 -0800 |
| 1914 | @@ -27,6 +27,31 @@ along with this program; see the file CO |
| 1915 | #include "cpplib.h" |
| 1916 | #include "internal.h" |
| 1917 | |
| 1918 | +/* Get a location_t for the current location in PFILE, |
| 1919 | + generally that of the previously lexed token. */ |
| 1920 | + |
| 1921 | +location_t |
| 1922 | +cpp_diagnostic_get_current_location (cpp_reader *pfile) |
| 1923 | +{ |
| 1924 | + if (CPP_OPTION (pfile, traditional)) |
| 1925 | + { |
| 1926 | + if (pfile->state.in_directive) |
| 1927 | + return pfile->directive_line; |
| 1928 | + else |
| 1929 | + return pfile->line_table->highest_line; |
| 1930 | + } |
| 1931 | + /* We don't want to refer to a token before the beginning of the |
| 1932 | + current run -- that is invalid. */ |
| 1933 | + else if (pfile->cur_token == pfile->cur_run->base) |
| 1934 | + { |
| 1935 | + return 0; |
| 1936 | + } |
| 1937 | + else |
| 1938 | + { |
| 1939 | + return pfile->cur_token[-1].src_loc; |
| 1940 | + } |
| 1941 | +} |
| 1942 | + |
| 1943 | /* Print a diagnostic at the given location. */ |
| 1944 | |
| 1945 | ATTRIBUTE_FPTR_PRINTF(5,0) |
| 1946 | @@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum |
| 1947 | enum cpp_warning_reason reason, |
| 1948 | const char *msgid, va_list *ap) |
| 1949 | { |
| 1950 | - location_t src_loc; |
| 1951 | - |
| 1952 | - if (CPP_OPTION (pfile, traditional)) |
| 1953 | - { |
| 1954 | - if (pfile->state.in_directive) |
| 1955 | - src_loc = pfile->directive_line; |
| 1956 | - else |
| 1957 | - src_loc = pfile->line_table->highest_line; |
| 1958 | - } |
| 1959 | - /* We don't want to refer to a token before the beginning of the |
| 1960 | - current run -- that is invalid. */ |
| 1961 | - else if (pfile->cur_token == pfile->cur_run->base) |
| 1962 | - { |
| 1963 | - src_loc = 0; |
| 1964 | - } |
| 1965 | - else |
| 1966 | - { |
| 1967 | - src_loc = pfile->cur_token[-1].src_loc; |
| 1968 | - } |
| 1969 | + location_t src_loc = cpp_diagnostic_get_current_location (pfile); |
| 1970 | rich_location richloc (pfile->line_table, src_loc); |
| 1971 | return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap); |
| 1972 | } |
| 1973 | @@ -142,6 +149,43 @@ cpp_warning_syshdr (cpp_reader * pfile, |
| 1974 | |
| 1975 | va_end (ap); |
| 1976 | return ret; |
| 1977 | +} |
| 1978 | + |
| 1979 | +/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */ |
| 1980 | + |
| 1981 | +bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason, |
| 1982 | + rich_location *richloc, const char *msgid, ...) |
| 1983 | +{ |
| 1984 | + va_list ap; |
| 1985 | + bool ret; |
| 1986 | + |
| 1987 | + va_start (ap, msgid); |
| 1988 | + |
| 1989 | + ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc, |
| 1990 | + msgid, &ap); |
| 1991 | + |
| 1992 | + va_end (ap); |
| 1993 | + return ret; |
| 1994 | + |
| 1995 | +} |
| 1996 | + |
| 1997 | +/* As cpp_pedwarning above, but use RICHLOC as the location of the |
| 1998 | + diagnostic. */ |
| 1999 | + |
| 2000 | +bool |
| 2001 | +cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason, |
| 2002 | + rich_location *richloc, const char *msgid, ...) |
| 2003 | +{ |
| 2004 | + va_list ap; |
| 2005 | + bool ret; |
| 2006 | + |
| 2007 | + va_start (ap, msgid); |
| 2008 | + |
| 2009 | + ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc, |
| 2010 | + msgid, &ap); |
| 2011 | + |
| 2012 | + va_end (ap); |
| 2013 | + return ret; |
| 2014 | } |
| 2015 | |
| 2016 | /* Print a diagnostic at a specific location. */ |
| 2017 | diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h |
| 2018 | --- a/libcpp/include/cpplib.h 2021-12-13 23:23:05.768437079 -0800 |
| 2019 | +++ b/libcpp/include/cpplib.h 2021-12-14 01:20:16.189507386 -0800 |
| 2020 | @@ -1275,6 +1275,14 @@ extern bool cpp_warning_syshdr (cpp_read |
| 2021 | const char *msgid, ...) |
| 2022 | ATTRIBUTE_PRINTF_3; |
| 2023 | |
| 2024 | +/* As their counterparts above, but use RICHLOC. */ |
| 2025 | +extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason, |
| 2026 | + rich_location *richloc, const char *msgid, ...) |
| 2027 | + ATTRIBUTE_PRINTF_4; |
| 2028 | +extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason, |
| 2029 | + rich_location *richloc, const char *msgid, ...) |
| 2030 | + ATTRIBUTE_PRINTF_4; |
| 2031 | + |
| 2032 | /* Output a diagnostic with "MSGID: " preceding the |
| 2033 | error string of errno. No location is printed. */ |
| 2034 | extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level, |
| 2035 | @@ -1435,42 +1443,95 @@ extern const char * cpp_get_userdef_suff |
| 2036 | |
| 2037 | /* In charset.c */ |
| 2038 | |
| 2039 | +/* The result of attempting to decode a run of UTF-8 bytes. */ |
| 2040 | + |
| 2041 | +struct cpp_decoded_char |
| 2042 | +{ |
| 2043 | + const char *m_start_byte; |
| 2044 | + const char *m_next_byte; |
| 2045 | + |
| 2046 | + bool m_valid_ch; |
| 2047 | + cppchar_t m_ch; |
| 2048 | +}; |
| 2049 | + |
| 2050 | +/* Information for mapping between code points and display columns. |
| 2051 | + |
| 2052 | + This is a tabstop value, along with a callback for getting the |
| 2053 | + widths of characters. Normally this callback is cpp_wcwidth, but we |
| 2054 | + support other schemes for escaping non-ASCII Unicode as a series of |
| 2055 | + ASCII chars when printing the user's source code in diagnostic-show-locus.c. |
| 2056 | + |
| 2057 | + For example, consider: |
| 2058 | + - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80) |
| 2059 | + - the Unicode character U+1F642 "SLIGHTLY SMILING FACE" |
| 2060 | + (UTF-8: 0xF0 0x9F 0x99 0x82) |
| 2061 | + - the byte 0xBF (a stray trailing byte of a UTF-8 character) |
| 2062 | + Normally U+03C0 would occupy one display column, U+1F642 |
| 2063 | + would occupy two display columns, and the stray byte would be |
| 2064 | + printed verbatim as one display column. |
| 2065 | + |
| 2066 | + However, when escaping them as Unicode code points as "<U+03C0>" |
| 2067 | + and "<U+1F642>" they occupy 8 and 9 display columns respectively, |
| 2068 | + and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>" |
| 2069 | + they occupy 8 and 16 display columns respectively. In both cases |
| 2070 | + the stray byte is escaped to <BF> as 4 display columns. */ |
| 2071 | + |
| 2072 | +struct cpp_char_column_policy |
| 2073 | +{ |
| 2074 | + cpp_char_column_policy (int tabstop, |
| 2075 | + int (*width_cb) (cppchar_t c)) |
| 2076 | + : m_tabstop (tabstop), |
| 2077 | + m_undecoded_byte_width (1), |
| 2078 | + m_width_cb (width_cb) |
| 2079 | + {} |
| 2080 | + |
| 2081 | + int m_tabstop; |
| 2082 | + /* Width in display columns of a stray byte that isn't decodable |
| 2083 | + as UTF-8. */ |
| 2084 | + int m_undecoded_byte_width; |
| 2085 | + int (*m_width_cb) (cppchar_t c); |
| 2086 | +}; |
| 2087 | + |
| 2088 | /* A class to manage the state while converting a UTF-8 sequence to cppchar_t |
| 2089 | and computing the display width one character at a time. */ |
| 2090 | class cpp_display_width_computation { |
| 2091 | public: |
| 2092 | cpp_display_width_computation (const char *data, int data_length, |
| 2093 | - int tabstop); |
| 2094 | + const cpp_char_column_policy &policy); |
| 2095 | const char *next_byte () const { return m_next; } |
| 2096 | int bytes_processed () const { return m_next - m_begin; } |
| 2097 | int bytes_left () const { return m_bytes_left; } |
| 2098 | bool done () const { return !bytes_left (); } |
| 2099 | int display_cols_processed () const { return m_display_cols; } |
| 2100 | |
| 2101 | - int process_next_codepoint (); |
| 2102 | + int process_next_codepoint (cpp_decoded_char *out); |
| 2103 | int advance_display_cols (int n); |
| 2104 | |
| 2105 | private: |
| 2106 | const char *const m_begin; |
| 2107 | const char *m_next; |
| 2108 | size_t m_bytes_left; |
| 2109 | - const int m_tabstop; |
| 2110 | + const cpp_char_column_policy &m_policy; |
| 2111 | int m_display_cols; |
| 2112 | }; |
| 2113 | |
| 2114 | /* Convenience functions that are simple use cases for class |
| 2115 | cpp_display_width_computation. Tab characters will be expanded to spaces |
| 2116 | - as determined by TABSTOP. */ |
| 2117 | + as determined by POLICY.m_tabstop, and non-printable-ASCII characters |
| 2118 | + will be escaped as per POLICY. */ |
| 2119 | + |
| 2120 | int cpp_byte_column_to_display_column (const char *data, int data_length, |
| 2121 | - int column, int tabstop); |
| 2122 | + int column, |
| 2123 | + const cpp_char_column_policy &policy); |
| 2124 | inline int cpp_display_width (const char *data, int data_length, |
| 2125 | - int tabstop) |
| 2126 | + const cpp_char_column_policy &policy) |
| 2127 | { |
| 2128 | return cpp_byte_column_to_display_column (data, data_length, data_length, |
| 2129 | - tabstop); |
| 2130 | + policy); |
| 2131 | } |
| 2132 | int cpp_display_column_to_byte_column (const char *data, int data_length, |
| 2133 | - int display_col, int tabstop); |
| 2134 | + int display_col, |
| 2135 | + const cpp_char_column_policy &policy); |
| 2136 | int cpp_wcwidth (cppchar_t c); |
| 2137 | |
| 2138 | #endif /* ! LIBCPP_CPPLIB_H */ |
| 2139 | diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h |
| 2140 | --- a/libcpp/include/line-map.h 2021-07-27 23:55:08.716307283 -0700 |
| 2141 | +++ b/libcpp/include/line-map.h 2021-12-14 01:16:01.557942991 -0800 |
| 2142 | @@ -1781,6 +1781,18 @@ class rich_location |
| 2143 | const diagnostic_path *get_path () const { return m_path; } |
| 2144 | void set_path (const diagnostic_path *path) { m_path = path; } |
| 2145 | |
| 2146 | + /* A flag for hinting that the diagnostic involves character encoding |
| 2147 | + issues, and thus that it will be helpful to the user if we show some |
| 2148 | + representation of how the characters in the pertinent source lines |
| 2149 | + are encoded. |
| 2150 | + The default is false (i.e. do not escape). |
| 2151 | + When set to true, non-ASCII bytes in the pertinent source lines will |
| 2152 | + be escaped in a manner controlled by the user-supplied option |
| 2153 | + -fdiagnostics-escape-format=, so that the user can better understand |
| 2154 | + what's going on with the encoding in their source file. */ |
| 2155 | + bool escape_on_output_p () const { return m_escape_on_output; } |
| 2156 | + void set_escape_on_output (bool flag) { m_escape_on_output = flag; } |
| 2157 | + |
| 2158 | private: |
| 2159 | bool reject_impossible_fixit (location_t where); |
| 2160 | void stop_supporting_fixits (); |
| 2161 | @@ -1807,6 +1819,7 @@ protected: |
| 2162 | bool m_fixits_cannot_be_auto_applied; |
| 2163 | |
| 2164 | const diagnostic_path *m_path; |
| 2165 | + bool m_escape_on_output; |
| 2166 | }; |
| 2167 | |
| 2168 | /* A struct for the result of range_label::get_text: a NUL-terminated buffer |
| 2169 | diff --git a/libcpp/internal.h b/libcpp/internal.h |
| 2170 | --- a/libcpp/internal.h 2021-12-13 23:23:05.768437079 -0800 |
| 2171 | +++ b/libcpp/internal.h 2021-12-14 01:16:01.557942991 -0800 |
| 2172 | @@ -776,6 +776,9 @@ extern void _cpp_do_file_change (cpp_rea |
| 2173 | extern void _cpp_pop_buffer (cpp_reader *); |
| 2174 | extern char *_cpp_bracket_include (cpp_reader *); |
| 2175 | |
| 2176 | +/* In errors.c */ |
| 2177 | +extern location_t cpp_diagnostic_get_current_location (cpp_reader *); |
| 2178 | + |
| 2179 | /* In traditional.c. */ |
| 2180 | extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool); |
| 2181 | extern bool _cpp_read_logical_line_trad (cpp_reader *); |
| 2182 | @@ -942,6 +945,26 @@ int linemap_get_expansion_line (class li |
| 2183 | const char* linemap_get_expansion_filename (class line_maps *, |
| 2184 | location_t); |
| 2185 | |
| 2186 | +/* A subclass of rich_location for emitting a diagnostic |
| 2187 | + at the current location of the reader, but flagging |
| 2188 | + it with set_escape_on_output (true). */ |
| 2189 | +class encoding_rich_location : public rich_location |
| 2190 | +{ |
| 2191 | + public: |
| 2192 | + encoding_rich_location (cpp_reader *pfile) |
| 2193 | + : rich_location (pfile->line_table, |
| 2194 | + cpp_diagnostic_get_current_location (pfile)) |
| 2195 | + { |
| 2196 | + set_escape_on_output (true); |
| 2197 | + } |
| 2198 | + |
| 2199 | + encoding_rich_location (cpp_reader *pfile, location_t loc) |
| 2200 | + : rich_location (pfile->line_table, loc) |
| 2201 | + { |
| 2202 | + set_escape_on_output (true); |
| 2203 | + } |
| 2204 | +}; |
| 2205 | + |
| 2206 | #ifdef __cplusplus |
| 2207 | } |
| 2208 | #endif |
| 2209 | diff --git a/libcpp/lex.c b/libcpp/lex.c |
| 2210 | --- a/libcpp/lex.c 2021-12-14 01:14:48.435225968 -0800 |
| 2211 | +++ b/libcpp/lex.c 2021-12-14 01:24:37.220995816 -0800 |
| 2212 | @@ -1774,7 +1774,11 @@ skip_whitespace (cpp_reader *pfile, cppc |
| 2213 | while (is_nvspace (c)); |
| 2214 | |
| 2215 | if (saw_NUL) |
| 2216 | - cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); |
| 2217 | + { |
| 2218 | + encoding_rich_location rich_loc (pfile); |
| 2219 | + cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc, |
| 2220 | + "null character(s) ignored"); |
| 2221 | + } |
| 2222 | |
| 2223 | buffer->cur--; |
| 2224 | } |
| 2225 | @@ -1803,6 +1807,28 @@ warn_about_normalization (cpp_reader *pf |
| 2226 | if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) |
| 2227 | && !pfile->state.skipping) |
| 2228 | { |
| 2229 | + location_t loc = token->src_loc; |
| 2230 | + |
| 2231 | + /* If possible, create a location range for the token. */ |
| 2232 | + if (loc >= RESERVED_LOCATION_COUNT |
| 2233 | + && token->type != CPP_EOF |
| 2234 | + /* There must be no line notes to process. */ |
| 2235 | + && (!(pfile->buffer->cur |
| 2236 | + >= pfile->buffer->notes[pfile->buffer->cur_note].pos |
| 2237 | + && !pfile->overlaid_buffer))) |
| 2238 | + { |
| 2239 | + source_range tok_range; |
| 2240 | + tok_range.m_start = loc; |
| 2241 | + tok_range.m_finish |
| 2242 | + = linemap_position_for_column (pfile->line_table, |
| 2243 | + CPP_BUF_COLUMN (pfile->buffer, |
| 2244 | + pfile->buffer->cur)); |
| 2245 | + loc = COMBINE_LOCATION_DATA (pfile->line_table, |
| 2246 | + loc, tok_range, NULL); |
| 2247 | + } |
| 2248 | + |
| 2249 | + encoding_rich_location rich_loc (pfile, loc); |
| 2250 | + |
| 2251 | /* Make sure that the token is printed using UCNs, even |
| 2252 | if we'd otherwise happily print UTF-8. */ |
| 2253 | unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); |
| 2254 | @@ -1810,11 +1836,11 @@ warn_about_normalization (cpp_reader *pf |
| 2255 | |
| 2256 | sz = cpp_spell_token (pfile, token, buf, false) - buf; |
| 2257 | if (NORMALIZE_STATE_RESULT (s) == normalized_C) |
| 2258 | - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, |
| 2259 | - "`%.*s' is not in NFKC", (int) sz, buf); |
| 2260 | + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, |
| 2261 | + "`%.*s' is not in NFKC", (int) sz, buf); |
| 2262 | else |
| 2263 | - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, |
| 2264 | - "`%.*s' is not in NFC", (int) sz, buf); |
| 2265 | + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, |
| 2266 | + "`%.*s' is not in NFC", (int) sz, buf); |
| 2267 | free (buf); |
| 2268 | } |
| 2269 | } |
| 2270 | diff --git a/libcpp/line-map.c b/libcpp/line-map.c |
| 2271 | --- a/libcpp/line-map.c 2021-07-27 23:55:08.716307283 -0700 |
| 2272 | +++ b/libcpp/line-map.c 2021-12-14 01:16:01.561942921 -0800 |
| 2273 | @@ -2086,7 +2086,8 @@ rich_location::rich_location (line_maps |
| 2274 | m_fixit_hints (), |
| 2275 | m_seen_impossible_fixit (false), |
| 2276 | m_fixits_cannot_be_auto_applied (false), |
| 2277 | - m_path (NULL) |
| 2278 | + m_path (NULL), |
| 2279 | + m_escape_on_output (false) |
| 2280 | { |
| 2281 | add_range (loc, SHOW_RANGE_WITH_CARET, label); |
| 2282 | } |