| From bd5e882cf6e0def3dd1bc106075d59a303fe0d1e Mon Sep 17 00:00:00 2001 |
| From: David Malcolm <dmalcolm@redhat.com> |
| Date: Mon, 18 Oct 2021 18:55:31 -0400 |
| Subject: [PATCH] diagnostics: escape non-ASCII source bytes for certain |
| diagnostics |
| MIME-Version: 1.0 |
| Content-Type: text/plain; charset=utf8 |
| Content-Transfer-Encoding: 8bit |
| |
| This patch adds support to GCC's diagnostic subsystem for escaping certain |
| bytes and Unicode characters when quoting source code. |
| |
| Specifically, this patch adds a new flag rich_location::m_escape_on_output |
| which is a hint from a diagnostic that non-ASCII bytes in the pertinent |
| lines of the user's source code should be escaped when printed. |
| |
| The patch sets this for the following diagnostics: |
| - when complaining about stray bytes in the program (when these |
| are non-printable) |
| - when complaining about "null character(s) ignored" |
| - for -Wnormalized= (and generate source ranges for such warnings) |
| |
| The escaping is controlled by a new option: |
| -fdiagnostics-escape-format=[unicode|bytes] |
| |
| For example, consider a diagnostic involving a source line containing the |
| string "before" followed by the Unicode character U+03C0 ("GREEK SMALL |
| LETTER PI", with UTF-8 encoding 0xCF 0x80) followed by the byte 0xBF |
| (a stray UTF-8 trailing byte), followed by the string "after", where the |
| diagnostic highlights the U+03C0 character. |
| |
| By default, this line will be printed verbatim to the user when |
| reporting a diagnostic at it, as: |
| |
| beforeπXafter |
| ^ |
| |
| (using X for the stray byte to avoid putting invalid UTF-8 in this |
| commit message) |
| |
| If the diagnostic sets the "escape" flag, it will be printed as: |
| |
| before<U+03C0><BF>after |
| ^~~~~~~~ |
| |
| with -fdiagnostics-escape-format=unicode (the default), or as: |
| |
| before<CF><80><BF>after |
| ^~~~~~~~ |
| |
| if the user supplies -fdiagnostics-escape-format=bytes. |
| |
| This only affects how the source is printed; it does not affect |
| how column numbers are printed (as per -fdiagnostics-column-unit= |
| and -fdiagnostics-column-origin=). |
| |
| gcc/c-family/ChangeLog: |
| * c-lex.c (c_lex_with_flags): When complaining about non-printable |
| CPP_OTHER tokens, set the "escape on output" flag. |
| |
| gcc/ChangeLog: |
| * common.opt (fdiagnostics-escape-format=): New. |
| (diagnostics_escape_format): New enum. |
| (DIAGNOSTICS_ESCAPE_FORMAT_UNICODE): New enum value. |
| (DIAGNOSTICS_ESCAPE_FORMAT_BYTES): Likewise. |
| * diagnostic-format-json.cc (json_end_diagnostic): Add |
| "escape-source" attribute. |
| * diagnostic-show-locus.c |
| (exploc_with_display_col::exploc_with_display_col): Replace |
| "tabstop" param with a cpp_char_column_policy and add an "aspect" |
| param. Use these to compute m_display_col accordingly. |
| (struct char_display_policy): New struct. |
| (layout::m_policy): New field. |
| (layout::m_escape_on_output): New field. |
| (def_policy): New function. |
| (make_range): Update for changes to exploc_with_display_col ctor. |
| (default_print_decoded_ch): New. |
| (width_per_escaped_byte): New. |
| (escape_as_bytes_width): New. |
| (escape_as_bytes_print): New. |
| (escape_as_unicode_width): New. |
| (escape_as_unicode_print): New. |
| (make_policy): New. |
| (layout::layout): Initialize new fields. Update m_exploc ctor |
| call for above change to ctor. |
| (layout::maybe_add_location_range): Update for changes to |
| exploc_with_display_col ctor. |
| (layout::calculate_x_offset_display): Update for change to |
| cpp_display_width. |
| (layout::print_source_line): Pass policy |
| to cpp_display_width_computation. Capture cpp_decoded_char when |
| calling process_next_codepoint. Move printing of source code to |
| m_policy.m_print_cb. |
| (line_label::line_label): Pass in policy rather than context. |
| (layout::print_any_labels): Update for change to line_label ctor. |
| (get_affected_range): Pass in policy rather than context, updating |
| calls to location_compute_display_column accordingly. |
| (get_printed_columns): Likewise, also for cpp_display_width. |
| (correction::correction): Pass in policy rather than tabstop. |
| (correction::compute_display_cols): Pass m_policy rather than |
| m_tabstop to cpp_display_width. |
| (correction::m_tabstop): Replace with... |
| (correction::m_policy): ...this. |
| (line_corrections::line_corrections): Pass in policy rather than |
| context. |
| (line_corrections::m_context): Replace with... |
| (line_corrections::m_policy): ...this. |
| (line_corrections::add_hint): Update to use m_policy rather than |
| m_context. |
| (line_corrections::add_hint): Likewise. |
| (layout::print_trailing_fixits): Likewise. |
| (selftest::test_display_widths): New. |
| (selftest::test_layout_x_offset_display_utf8): Update to use |
| policy rather than tabstop. |
| (selftest::test_one_liner_labels_utf8): Add test of escaping |
| source lines. |
| (selftest::test_diagnostic_show_locus_one_liner_utf8): Update to |
| use policy rather than tabstop. |
| (selftest::test_overlapped_fixit_printing): Likewise. |
| (selftest::test_overlapped_fixit_printing_utf8): Likewise. |
| (selftest::test_overlapped_fixit_printing_2): Likewise. |
| (selftest::test_tab_expansion): Likewise. |
| (selftest::test_escaping_bytes_1): New. |
| (selftest::test_escaping_bytes_2): New. |
| (selftest::diagnostic_show_locus_c_tests): Call the new tests. |
| * diagnostic.c (diagnostic_initialize): Initialize |
| context->escape_format. |
| (convert_column_unit): Update to use default character width policy. |
| (selftest::test_diagnostic_get_location_text): Likewise. |
| * diagnostic.h (enum diagnostics_escape_format): New enum. |
| (diagnostic_context::escape_format): New field. |
| * doc/invoke.texi (-fdiagnostics-escape-format=): New option. |
| (-fdiagnostics-format=): Add "escape-source" attribute to examples |
| of JSON output, and document it. |
| * input.c (location_compute_display_column): Pass in "policy" |
| rather than "tabstop", passing to |
| cpp_byte_column_to_display_column. |
| (selftest::test_cpp_utf8): Update to use cpp_char_column_policy. |
| * input.h (class cpp_char_column_policy): New forward decl. |
| (location_compute_display_column): Pass in "policy" rather than |
| "tabstop". |
| * opts.c (common_handle_option): Handle |
| OPT_fdiagnostics_escape_format_. |
| * selftest.c (temp_source_file::temp_source_file): New ctor |
| overload taking a size_t. |
| * selftest.h (temp_source_file::temp_source_file): Likewise. |
| |
| gcc/testsuite/ChangeLog: |
| * c-c++-common/diagnostic-format-json-1.c: Add regexp to consume |
| "escape-source" attribute. |
| * c-c++-common/diagnostic-format-json-2.c: Likewise. |
| * c-c++-common/diagnostic-format-json-3.c: Likewise. |
| * c-c++-common/diagnostic-format-json-4.c: Likewise, twice. |
| * c-c++-common/diagnostic-format-json-5.c: Likewise. |
| * gcc.dg/cpp/warn-normalized-4-bytes.c: New test. |
| * gcc.dg/cpp/warn-normalized-4-unicode.c: New test. |
| * gcc.dg/encoding-issues-bytes.c: New test. |
| * gcc.dg/encoding-issues-unicode.c: New test. |
| * gfortran.dg/diagnostic-format-json-1.F90: Add regexp to consume |
| "escape-source" attribute. |
| * gfortran.dg/diagnostic-format-json-2.F90: Likewise. |
| * gfortran.dg/diagnostic-format-json-3.F90: Likewise. |
| |
| libcpp/ChangeLog: |
| * charset.c (convert_escape): Use encoding_rich_location when |
| complaining about nonprintable unknown escape sequences. |
| (cpp_display_width_computation::cpp_display_width_computation): |
| Pass in policy rather than tabstop. |
| (cpp_display_width_computation::process_next_codepoint): Add "out" |
| param and populate *out if non-NULL. |
| (cpp_display_width_computation::advance_display_cols): Pass NULL |
| to process_next_codepoint. |
| (cpp_byte_column_to_display_column): Pass in policy rather than |
| tabstop. Pass NULL to process_next_codepoint. |
| (cpp_display_column_to_byte_column): Pass in policy rather than |
| tabstop. |
| * errors.c (cpp_diagnostic_get_current_location): New function, |
| splitting out the logic from... |
| (cpp_diagnostic): ...here. |
| (cpp_warning_at): New function. |
| (cpp_pedwarning_at): New function. |
| * include/cpplib.h (cpp_warning_at): New decl for rich_location. |
| (cpp_pedwarning_at): Likewise. |
| (struct cpp_decoded_char): New. |
| (struct cpp_char_column_policy): New. |
| (cpp_display_width_computation::cpp_display_width_computation): |
| Replace "tabstop" param with "policy". |
| (cpp_display_width_computation::process_next_codepoint): Add "out" |
| param. |
| (cpp_display_width_computation::m_tabstop): Replace with... |
| (cpp_display_width_computation::m_policy): ...this. |
| (cpp_byte_column_to_display_column): Replace "tabstop" param with |
| "policy". |
| (cpp_display_width): Likewise. |
| (cpp_display_column_to_byte_column): Likewise. |
| * include/line-map.h (rich_location::escape_on_output_p): New. |
| (rich_location::set_escape_on_output): New. |
| (rich_location::m_escape_on_output): New. |
| * internal.h (cpp_diagnostic_get_current_location): New decl. |
| (class encoding_rich_location): New. |
| * lex.c (skip_whitespace): Use encoding_rich_location when |
| complaining about null characters. |
| (warn_about_normalization): Generate a source range when |
| complaining about improperly normalized tokens, rather than just a |
| point, and use encoding_rich_location so that the source code |
| is escaped on printing. |
| * line-map.c (rich_location::rich_location): Initialize |
| m_escape_on_output. |
| |
| Signed-off-by: David Malcolm <dmalcolm@redhat.com> |
| |
| CVE: CVE-2021-42574 |
| Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e] |
| Signed-off-by: Pgowda <pgowda.cve@gmail.com> |
| |
| --- |
| gcc/c-family/c-lex.c | 6 +- |
| gcc/common.opt | 13 + |
| gcc/diagnostic-format-json.cc | 3 + |
| gcc/diagnostic-show-locus.c | 580 +++++++++++++++--- |
| gcc/diagnostic.c | 10 +- |
| gcc/diagnostic.h | 18 + |
| gcc/doc/invoke.texi | 43 +- |
| gcc/input.c | 62 +- |
| gcc/input.h | 7 +- |
| gcc/opts.c | 4 + |
| gcc/selftest.c | 15 + |
| gcc/selftest.h | 2 + |
| .../c-c++-common/diagnostic-format-json-1.c | 1 + |
| .../c-c++-common/diagnostic-format-json-2.c | 1 + |
| .../c-c++-common/diagnostic-format-json-3.c | 1 + |
| .../c-c++-common/diagnostic-format-json-4.c | 2 + |
| .../c-c++-common/diagnostic-format-json-5.c | 1 + |
| .../gcc.dg/cpp/warn-normalized-4-bytes.c | 21 + |
| .../gcc.dg/cpp/warn-normalized-4-unicode.c | 19 + |
| gcc/testsuite/gcc.dg/encoding-issues-bytes.c | Bin 0 -> 595 bytes |
| .../gcc.dg/encoding-issues-unicode.c | Bin 0 -> 613 bytes |
| .../gfortran.dg/diagnostic-format-json-1.F90 | 1 + |
| .../gfortran.dg/diagnostic-format-json-2.F90 | 1 + |
| .../gfortran.dg/diagnostic-format-json-3.F90 | 1 + |
| libcpp/charset.c | 63 +- |
| libcpp/errors.c | 82 ++- |
| libcpp/include/cpplib.h | 76 ++- |
| libcpp/include/line-map.h | 13 + |
| libcpp/internal.h | 23 + |
| libcpp/lex.c | 38 +- |
| libcpp/line-map.c | 3 +- |
| 31 files changed, 942 insertions(+), 168 deletions(-) |
| create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c |
| create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c |
| create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-bytes.c |
| create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-unicode.c |
| |
| diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c |
| --- a/gcc/c-family/c-lex.c 2021-07-27 23:55:06.980283060 -0700 |
| +++ b/gcc/c-family/c-lex.c 2021-12-14 01:16:01.541943272 -0800 |
| @@ -603,7 +603,11 @@ c_lex_with_flags (tree *value, location_ |
| else if (ISGRAPH (c)) |
| error_at (*loc, "stray %qc in program", (int) c); |
| else |
| - error_at (*loc, "stray %<\\%o%> in program", (int) c); |
| + { |
| + rich_location rich_loc (line_table, *loc); |
| + rich_loc.set_escape_on_output (true); |
| + error_at (&rich_loc, "stray %<\\%o%> in program", (int) c); |
| + } |
| } |
| goto retry; |
| |
| diff --git a/gcc/common.opt b/gcc/common.opt |
| --- a/gcc/common.opt 2021-12-13 22:08:44.939137107 -0800 |
| +++ b/gcc/common.opt 2021-12-14 01:16:01.541943272 -0800 |
| @@ -1348,6 +1348,10 @@ fdiagnostics-format= |
| Common Joined RejectNegative Enum(diagnostics_output_format) |
| -fdiagnostics-format=[text|json] Select output format. |
| |
| +fdiagnostics-escape-format= |
| +Common Joined RejectNegative Enum(diagnostics_escape_format) |
| +-fdiagnostics-escape-format=[unicode|bytes] Select how to escape non-printable-ASCII bytes in the source for diagnostics that suggest it. |
| + |
| ; Required for these enum values. |
| SourceInclude |
| diagnostic.h |
| @@ -1362,6 +1366,15 @@ EnumValue |
| Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE) |
| |
| Enum |
| +Name(diagnostics_escape_format) Type(int) |
| + |
| +EnumValue |
| +Enum(diagnostics_escape_format) String(unicode) Value(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE) |
| + |
| +EnumValue |
| +Enum(diagnostics_escape_format) String(bytes) Value(DIAGNOSTICS_ESCAPE_FORMAT_BYTES) |
| + |
| +Enum |
| Name(diagnostics_output_format) Type(int) |
| |
| EnumValue |
| diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c |
| --- a/gcc/diagnostic.c 2021-07-27 23:55:07.232286576 -0700 |
| +++ b/gcc/diagnostic.c 2021-12-14 01:16:01.545943202 -0800 |
| @@ -230,6 +230,7 @@ diagnostic_initialize (diagnostic_contex |
| context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY; |
| context->column_origin = 1; |
| context->tabstop = 8; |
| + context->escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| context->edit_context_ptr = NULL; |
| context->diagnostic_group_nesting_depth = 0; |
| context->diagnostic_group_emission_count = 0; |
| @@ -382,7 +383,10 @@ convert_column_unit (enum diagnostics_co |
| gcc_unreachable (); |
| |
| case DIAGNOSTICS_COLUMN_UNIT_DISPLAY: |
| - return location_compute_display_column (s, tabstop); |
| + { |
| + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| + return location_compute_display_column (s, policy); |
| + } |
| |
| case DIAGNOSTICS_COLUMN_UNIT_BYTE: |
| return s.column; |
| @@ -2275,8 +2279,8 @@ test_diagnostic_get_location_text () |
| const char *const content = "smile \xf0\x9f\x98\x82\n"; |
| const int line_bytes = strlen (content) - 1; |
| const int def_tabstop = 8; |
| - const int display_width = cpp_display_width (content, line_bytes, |
| - def_tabstop); |
| + const cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); |
| + const int display_width = cpp_display_width (content, line_bytes, policy); |
| ASSERT_EQ (line_bytes - 2, display_width); |
| temp_source_file tmp (SELFTEST_LOCATION, ".c", content); |
| const char *const fname = tmp.get_filename (); |
| diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc |
| --- a/gcc/diagnostic-format-json.cc 2021-07-27 23:55:07.232286576 -0700 |
| +++ b/gcc/diagnostic-format-json.cc 2021-12-14 01:16:01.541943272 -0800 |
| @@ -264,6 +264,9 @@ json_end_diagnostic (diagnostic_context |
| json::value *path_value = context->make_json_for_path (context, path); |
| diag_obj->set ("path", path_value); |
| } |
| + |
| + diag_obj->set ("escape-source", |
| + new json::literal (richloc->escape_on_output_p ())); |
| } |
| |
| /* No-op implementation of "begin_group_cb" for JSON output. */ |
| diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h |
| --- a/gcc/diagnostic.h 2021-07-27 23:55:07.236286632 -0700 |
| +++ b/gcc/diagnostic.h 2021-12-14 01:16:01.545943202 -0800 |
| @@ -38,6 +38,20 @@ enum diagnostics_column_unit |
| DIAGNOSTICS_COLUMN_UNIT_BYTE |
| }; |
| |
| +/* An enum for controlling how to print non-ASCII characters/bytes when |
| + a diagnostic suggests escaping the source code on output. */ |
| + |
| +enum diagnostics_escape_format |
| +{ |
| + /* Escape non-ASCII Unicode characters in the form <U+XXXX> and |
| + non-UTF-8 bytes in the form <XX>. */ |
| + DIAGNOSTICS_ESCAPE_FORMAT_UNICODE, |
| + |
| + /* Escape non-ASCII bytes in the form <XX> (thus showing the underlying |
| + encoding of non-ASCII Unicode characters). */ |
| + DIAGNOSTICS_ESCAPE_FORMAT_BYTES |
| +}; |
| + |
| /* Enum for overriding the standard output format. */ |
| |
| enum diagnostics_output_format |
| @@ -320,6 +334,10 @@ struct diagnostic_context |
| /* The size of the tabstop for tab expansion. */ |
| int tabstop; |
| |
| + /* How should non-ASCII/non-printable bytes be escaped when |
| + a diagnostic suggests escaping the source code on output. */ |
| + enum diagnostics_escape_format escape_format; |
| + |
| /* If non-NULL, an edit_context to which fix-it hints should be |
| applied, for generating patches. */ |
| edit_context *edit_context_ptr; |
| diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c |
| --- a/gcc/diagnostic-show-locus.c 2021-07-27 23:55:07.232286576 -0700 |
| +++ b/gcc/diagnostic-show-locus.c 2021-12-14 01:16:01.545943202 -0800 |
| @@ -175,10 +175,26 @@ enum column_unit { |
| class exploc_with_display_col : public expanded_location |
| { |
| public: |
| - exploc_with_display_col (const expanded_location &exploc, int tabstop) |
| - : expanded_location (exploc), |
| - m_display_col (location_compute_display_column (exploc, tabstop)) |
| - {} |
| + exploc_with_display_col (const expanded_location &exploc, |
| + const cpp_char_column_policy &policy, |
| + enum location_aspect aspect) |
| + : expanded_location (exploc), |
| + m_display_col (location_compute_display_column (exploc, policy)) |
| + { |
| + if (exploc.column > 0) |
| + { |
| + /* m_display_col is now the final column of the byte. |
| + If escaping has happened, we may want the first column instead. */ |
| + if (aspect != LOCATION_ASPECT_FINISH) |
| + { |
| + expanded_location prev_exploc (exploc); |
| + prev_exploc.column--; |
| + int prev_display_col |
| + = (location_compute_display_column (prev_exploc, policy)); |
| + m_display_col = prev_display_col + 1; |
| + } |
| + } |
| + } |
| |
| int m_display_col; |
| }; |
| @@ -313,6 +329,31 @@ test_line_span () |
| |
| #endif /* #if CHECKING_P */ |
| |
| +/* A bundle of information containing how to print unicode |
| + characters and bytes when quoting source code. |
| + |
| + Provides a unified place to support escaping some subset |
| + of characters to some format. |
| + |
| + Extends char_column_policy; printing is split out to avoid |
| + libcpp having to know about pretty_printer. */ |
| + |
| +struct char_display_policy : public cpp_char_column_policy |
| +{ |
| + public: |
| + char_display_policy (int tabstop, |
| + int (*width_cb) (cppchar_t c), |
| + void (*print_cb) (pretty_printer *pp, |
| + const cpp_decoded_char &cp)) |
| + : cpp_char_column_policy (tabstop, width_cb), |
| + m_print_cb (print_cb) |
| + { |
| + } |
| + |
| + void (*m_print_cb) (pretty_printer *pp, |
| + const cpp_decoded_char &cp); |
| +}; |
| + |
| /* A class to control the overall layout when printing a diagnostic. |
| |
| The layout is determined within the constructor. |
| @@ -345,6 +386,8 @@ class layout |
| |
| void print_line (linenum_type row); |
| |
| + void on_bad_codepoint (const char *ptr, cppchar_t ch, size_t ch_sz); |
| + |
| private: |
| bool will_show_line_p (linenum_type row) const; |
| void print_leading_fixits (linenum_type row); |
| @@ -386,6 +429,7 @@ class layout |
| private: |
| diagnostic_context *m_context; |
| pretty_printer *m_pp; |
| + char_display_policy m_policy; |
| location_t m_primary_loc; |
| exploc_with_display_col m_exploc; |
| colorizer m_colorizer; |
| @@ -398,6 +442,7 @@ class layout |
| auto_vec <line_span> m_line_spans; |
| int m_linenum_width; |
| int m_x_offset_display; |
| + bool m_escape_on_output; |
| }; |
| |
| /* Implementation of "class colorizer". */ |
| @@ -646,6 +691,11 @@ layout_range::intersects_line_p (linenum |
| /* Default for when we don't care what the tab expansion is set to. */ |
| static const int def_tabstop = 8; |
| |
| +static cpp_char_column_policy def_policy () |
| +{ |
| + return cpp_char_column_policy (8, cpp_wcwidth); |
| +} |
| + |
| /* Create some expanded locations for testing layout_range. The filename |
| member of the explocs is set to the empty string. This member will only be |
| inspected by the calls to location_compute_display_column() made from the |
| @@ -662,10 +712,13 @@ make_range (int start_line, int start_co |
| = {"", start_line, start_col, NULL, false}; |
| const expanded_location finish_exploc |
| = {"", end_line, end_col, NULL, false}; |
| - return layout_range (exploc_with_display_col (start_exploc, def_tabstop), |
| - exploc_with_display_col (finish_exploc, def_tabstop), |
| + return layout_range (exploc_with_display_col (start_exploc, def_policy (), |
| + LOCATION_ASPECT_START), |
| + exploc_with_display_col (finish_exploc, def_policy (), |
| + LOCATION_ASPECT_FINISH), |
| SHOW_RANGE_WITHOUT_CARET, |
| - exploc_with_display_col (start_exploc, def_tabstop), |
| + exploc_with_display_col (start_exploc, def_policy (), |
| + LOCATION_ASPECT_CARET), |
| 0, NULL); |
| } |
| |
| @@ -959,6 +1012,164 @@ fixit_cmp (const void *p_a, const void * |
| return hint_a->get_start_loc () - hint_b->get_start_loc (); |
| } |
| |
| +/* Callbacks for use when not escaping the source. */ |
| + |
| +/* The default callback for char_column_policy::m_width_cb is cpp_wcwidth. */ |
| + |
| +/* Callback for char_display_policy::m_print_cb for printing source chars |
| + when not escaping the source. */ |
| + |
| +static void |
| +default_print_decoded_ch (pretty_printer *pp, |
| + const cpp_decoded_char &decoded_ch) |
| +{ |
| + for (const char *ptr = decoded_ch.m_start_byte; |
| + ptr != decoded_ch.m_next_byte; ptr++) |
| + { |
| + if (*ptr == '\0' || *ptr == '\r') |
| + { |
| + pp_space (pp); |
| + continue; |
| + } |
| + |
| + pp_character (pp, *ptr); |
| + } |
| +} |
| + |
| +/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| + |
| +static const int width_per_escaped_byte = 4; |
| + |
| +/* Callback for char_column_policy::m_width_cb for determining the |
| + display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| + |
| +static int |
| +escape_as_bytes_width (cppchar_t ch) |
| +{ |
| + if (ch < 0x80 && ISPRINT (ch)) |
| + return cpp_wcwidth (ch); |
| + else |
| + { |
| + if (ch <= 0x7F) return 1 * width_per_escaped_byte; |
| + if (ch <= 0x7FF) return 2 * width_per_escaped_byte; |
| + if (ch <= 0xFFFF) return 3 * width_per_escaped_byte; |
| + return 4 * width_per_escaped_byte; |
| + } |
| +} |
| + |
| +/* Callback for char_display_policy::m_print_cb for printing source chars |
| + when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */ |
| + |
| +static void |
| +escape_as_bytes_print (pretty_printer *pp, |
| + const cpp_decoded_char &decoded_ch) |
| +{ |
| + if (!decoded_ch.m_valid_ch) |
| + { |
| + for (const char *iter = decoded_ch.m_start_byte; |
| + iter != decoded_ch.m_next_byte; ++iter) |
| + { |
| + char buf[16]; |
| + sprintf (buf, "<%02x>", (unsigned char)*iter); |
| + pp_string (pp, buf); |
| + } |
| + return; |
| + } |
| + |
| + cppchar_t ch = decoded_ch.m_ch; |
| + if (ch < 0x80 && ISPRINT (ch)) |
| + pp_character (pp, ch); |
| + else |
| + { |
| + for (const char *iter = decoded_ch.m_start_byte; |
| + iter < decoded_ch.m_next_byte; ++iter) |
| + { |
| + char buf[16]; |
| + sprintf (buf, "<%02x>", (unsigned char)*iter); |
| + pp_string (pp, buf); |
| + } |
| + } |
| +} |
| + |
| +/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| + |
| +/* Callback for char_column_policy::m_width_cb for determining the |
| + display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| + |
| +static int |
| +escape_as_unicode_width (cppchar_t ch) |
| +{ |
| + if (ch < 0x80 && ISPRINT (ch)) |
| + return cpp_wcwidth (ch); |
| + else |
| + { |
| + // Width of "<U+%04x>" |
| + if (ch > 0xfffff) |
| + return 10; |
| + else if (ch > 0xffff) |
| + return 9; |
| + else |
| + return 8; |
| + } |
| +} |
| + |
| +/* Callback for char_display_policy::m_print_cb for printing source chars |
| + when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */ |
| + |
| +static void |
| +escape_as_unicode_print (pretty_printer *pp, |
| + const cpp_decoded_char &decoded_ch) |
| +{ |
| + if (!decoded_ch.m_valid_ch) |
| + { |
| + escape_as_bytes_print (pp, decoded_ch); |
| + return; |
| + } |
| + |
| + cppchar_t ch = decoded_ch.m_ch; |
| + if (ch < 0x80 && ISPRINT (ch)) |
| + pp_character (pp, ch); |
| + else |
| + { |
| + char buf[16]; |
| + sprintf (buf, "<U+%04X>", ch); |
| + pp_string (pp, buf); |
| + } |
| +} |
| + |
| +/* Populate a char_display_policy based on DC and RICHLOC. */ |
| + |
| +static char_display_policy |
| +make_policy (const diagnostic_context &dc, |
| + const rich_location &richloc) |
| +{ |
| + /* The default is to not escape non-ASCII bytes. */ |
| + char_display_policy result |
| + (dc.tabstop, cpp_wcwidth, default_print_decoded_ch); |
| + |
| + /* If the diagnostic suggests escaping non-ASCII bytes, then |
| + use policy from user-supplied options. */ |
| + if (richloc.escape_on_output_p ()) |
| + { |
| + result.m_undecoded_byte_width = width_per_escaped_byte; |
| + switch (dc.escape_format) |
| + { |
| + default: |
| + gcc_unreachable (); |
| + case DIAGNOSTICS_ESCAPE_FORMAT_UNICODE: |
| + result.m_width_cb = escape_as_unicode_width; |
| + result.m_print_cb = escape_as_unicode_print; |
| + break; |
| + case DIAGNOSTICS_ESCAPE_FORMAT_BYTES: |
| + result.m_width_cb = escape_as_bytes_width; |
| + result.m_print_cb = escape_as_bytes_print; |
| + break; |
| + } |
| + } |
| + |
| + return result; |
| +} |
| + |
| /* Implementation of class layout. */ |
| |
| /* Constructor for class layout. |
| @@ -975,8 +1186,10 @@ layout::layout (diagnostic_context * con |
| diagnostic_t diagnostic_kind) |
| : m_context (context), |
| m_pp (context->printer), |
| + m_policy (make_policy (*context, *richloc)), |
| m_primary_loc (richloc->get_range (0)->m_loc), |
| - m_exploc (richloc->get_expanded_location (0), context->tabstop), |
| + m_exploc (richloc->get_expanded_location (0), m_policy, |
| + LOCATION_ASPECT_CARET), |
| m_colorizer (context, diagnostic_kind), |
| m_colorize_source_p (context->colorize_source_p), |
| m_show_labels_p (context->show_labels_p), |
| @@ -986,7 +1199,8 @@ layout::layout (diagnostic_context * con |
| m_fixit_hints (richloc->get_num_fixit_hints ()), |
| m_line_spans (1 + richloc->get_num_locations ()), |
| m_linenum_width (0), |
| - m_x_offset_display (0) |
| + m_x_offset_display (0), |
| + m_escape_on_output (richloc->escape_on_output_p ()) |
| { |
| for (unsigned int idx = 0; idx < richloc->get_num_locations (); idx++) |
| { |
| @@ -1072,10 +1286,13 @@ layout::maybe_add_location_range (const |
| |
| /* Everything is now known to be in the correct source file, |
| but it may require further sanitization. */ |
| - layout_range ri (exploc_with_display_col (start, m_context->tabstop), |
| - exploc_with_display_col (finish, m_context->tabstop), |
| + layout_range ri (exploc_with_display_col (start, m_policy, |
| + LOCATION_ASPECT_START), |
| + exploc_with_display_col (finish, m_policy, |
| + LOCATION_ASPECT_FINISH), |
| loc_range->m_range_display_kind, |
| - exploc_with_display_col (caret, m_context->tabstop), |
| + exploc_with_display_col (caret, m_policy, |
| + LOCATION_ASPECT_CARET), |
| original_idx, loc_range->m_label); |
| |
| /* If we have a range that finishes before it starts (perhaps |
| @@ -1409,7 +1626,7 @@ layout::calculate_x_offset_display () |
| = get_line_bytes_without_trailing_whitespace (line.get_buffer (), |
| line.length ()); |
| int eol_display_column |
| - = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop); |
| + = cpp_display_width (line.get_buffer (), line_bytes, m_policy); |
| if (caret_display_column > eol_display_column |
| || !caret_display_column) |
| { |
| @@ -1488,7 +1705,7 @@ layout::print_source_line (linenum_type |
| /* This object helps to keep track of which display column we are at, which is |
| necessary for computing the line bounds in display units, for doing |
| tab expansion, and for implementing m_x_offset_display. */ |
| - cpp_display_width_computation dw (line, line_bytes, m_context->tabstop); |
| + cpp_display_width_computation dw (line, line_bytes, m_policy); |
| |
| /* Skip the first m_x_offset_display display columns. In case the leading |
| portion that will be skipped ends with a character with wcwidth > 1, then |
| @@ -1536,7 +1753,8 @@ layout::print_source_line (linenum_type |
| tabs and replacing some control bytes with spaces as necessary. */ |
| const char *c = dw.next_byte (); |
| const int start_disp_col = dw.display_cols_processed () + 1; |
| - const int this_display_width = dw.process_next_codepoint (); |
| + cpp_decoded_char cp; |
| + const int this_display_width = dw.process_next_codepoint (&cp); |
| if (*c == '\t') |
| { |
| /* The returned display width is the number of spaces into which the |
| @@ -1545,15 +1763,6 @@ layout::print_source_line (linenum_type |
| pp_space (m_pp); |
| continue; |
| } |
| - if (*c == '\0' || *c == '\r') |
| - { |
| - /* cpp_wcwidth() promises to return 1 for all control bytes, and we |
| - want to output these as a single space too, so this case is |
| - actually the same as the '\t' case. */ |
| - gcc_assert (this_display_width == 1); |
| - pp_space (m_pp); |
| - continue; |
| - } |
| |
| /* We have a (possibly multibyte) character to output; update the line |
| bounds if it is not whitespace. */ |
| @@ -1565,7 +1774,8 @@ layout::print_source_line (linenum_type |
| } |
| |
| /* Output the character. */ |
| - while (c != dw.next_byte ()) pp_character (m_pp, *c++); |
| + m_policy.m_print_cb (m_pp, cp); |
| + c = dw.next_byte (); |
| } |
| print_newline (); |
| return lbounds; |
| @@ -1664,14 +1874,14 @@ layout::print_annotation_line (linenum_t |
| class line_label |
| { |
| public: |
| - line_label (diagnostic_context *context, int state_idx, int column, |
| + line_label (const cpp_char_column_policy &policy, |
| + int state_idx, int column, |
| label_text text) |
| : m_state_idx (state_idx), m_column (column), |
| m_text (text), m_label_line (0), m_has_vbar (true) |
| { |
| const int bytes = strlen (text.m_buffer); |
| - m_display_width |
| - = cpp_display_width (text.m_buffer, bytes, context->tabstop); |
| + m_display_width = cpp_display_width (text.m_buffer, bytes, policy); |
| } |
| |
| /* Sorting is primarily by column, then by state index. */ |
| @@ -1731,7 +1941,7 @@ layout::print_any_labels (linenum_type r |
| if (text.m_buffer == NULL) |
| continue; |
| |
| - labels.safe_push (line_label (m_context, i, disp_col, text)); |
| + labels.safe_push (line_label (m_policy, i, disp_col, text)); |
| } |
| } |
| |
| @@ -2011,7 +2221,7 @@ public: |
| |
| /* Get the range of bytes or display columns that HINT would affect. */ |
| static column_range |
| -get_affected_range (diagnostic_context *context, |
| +get_affected_range (const cpp_char_column_policy &policy, |
| const fixit_hint *hint, enum column_unit col_unit) |
| { |
| expanded_location exploc_start = expand_location (hint->get_start_loc ()); |
| @@ -2022,13 +2232,11 @@ get_affected_range (diagnostic_context * |
| int finish_column; |
| if (col_unit == CU_DISPLAY_COLS) |
| { |
| - start_column |
| - = location_compute_display_column (exploc_start, context->tabstop); |
| + start_column = location_compute_display_column (exploc_start, policy); |
| if (hint->insertion_p ()) |
| finish_column = start_column - 1; |
| else |
| - finish_column |
| - = location_compute_display_column (exploc_finish, context->tabstop); |
| + finish_column = location_compute_display_column (exploc_finish, policy); |
| } |
| else |
| { |
| @@ -2041,12 +2249,13 @@ get_affected_range (diagnostic_context * |
| /* Get the range of display columns that would be printed for HINT. */ |
| |
| static column_range |
| -get_printed_columns (diagnostic_context *context, const fixit_hint *hint) |
| +get_printed_columns (const cpp_char_column_policy &policy, |
| + const fixit_hint *hint) |
| { |
| expanded_location exploc = expand_location (hint->get_start_loc ()); |
| - int start_column = location_compute_display_column (exploc, context->tabstop); |
| + int start_column = location_compute_display_column (exploc, policy); |
| int hint_width = cpp_display_width (hint->get_string (), hint->get_length (), |
| - context->tabstop); |
| + policy); |
| int final_hint_column = start_column + hint_width - 1; |
| if (hint->insertion_p ()) |
| { |
| @@ -2056,8 +2265,7 @@ get_printed_columns (diagnostic_context |
| { |
| exploc = expand_location (hint->get_next_loc ()); |
| --exploc.column; |
| - int finish_column |
| - = location_compute_display_column (exploc, context->tabstop); |
| + int finish_column = location_compute_display_column (exploc, policy); |
| return column_range (start_column, |
| MAX (finish_column, final_hint_column)); |
| } |
| @@ -2075,13 +2283,13 @@ public: |
| column_range affected_columns, |
| column_range printed_columns, |
| const char *new_text, size_t new_text_len, |
| - int tabstop) |
| + const cpp_char_column_policy &policy) |
| : m_affected_bytes (affected_bytes), |
| m_affected_columns (affected_columns), |
| m_printed_columns (printed_columns), |
| m_text (xstrdup (new_text)), |
| m_byte_length (new_text_len), |
| - m_tabstop (tabstop), |
| + m_policy (policy), |
| m_alloc_sz (new_text_len + 1) |
| { |
| compute_display_cols (); |
| @@ -2099,7 +2307,7 @@ public: |
| |
| void compute_display_cols () |
| { |
| - m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop); |
| + m_display_cols = cpp_display_width (m_text, m_byte_length, m_policy); |
| } |
| |
| void overwrite (int dst_offset, const char_span &src_span) |
| @@ -2127,7 +2335,7 @@ public: |
| char *m_text; |
| size_t m_byte_length; /* Not including null-terminator. */ |
| int m_display_cols; |
| - int m_tabstop; |
| + const cpp_char_column_policy &m_policy; |
| size_t m_alloc_sz; |
| }; |
| |
| @@ -2163,15 +2371,16 @@ correction::ensure_terminated () |
| class line_corrections |
| { |
| public: |
| - line_corrections (diagnostic_context *context, const char *filename, |
| + line_corrections (const char_display_policy &policy, |
| + const char *filename, |
| linenum_type row) |
| - : m_context (context), m_filename (filename), m_row (row) |
| + : m_policy (policy), m_filename (filename), m_row (row) |
| {} |
| ~line_corrections (); |
| |
| void add_hint (const fixit_hint *hint); |
| |
| - diagnostic_context *m_context; |
| + const char_display_policy &m_policy; |
| const char *m_filename; |
| linenum_type m_row; |
| auto_vec <correction *> m_corrections; |
| @@ -2217,10 +2426,10 @@ source_line::source_line (const char *fi |
| void |
| line_corrections::add_hint (const fixit_hint *hint) |
| { |
| - column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES); |
| - column_range affected_columns = get_affected_range (m_context, hint, |
| + column_range affected_bytes = get_affected_range (m_policy, hint, CU_BYTES); |
| + column_range affected_columns = get_affected_range (m_policy, hint, |
| CU_DISPLAY_COLS); |
| - column_range printed_columns = get_printed_columns (m_context, hint); |
| + column_range printed_columns = get_printed_columns (m_policy, hint); |
| |
| /* Potentially consolidate. */ |
| if (!m_corrections.is_empty ()) |
| @@ -2289,7 +2498,7 @@ line_corrections::add_hint (const fixit_ |
| printed_columns, |
| hint->get_string (), |
| hint->get_length (), |
| - m_context->tabstop)); |
| + m_policy)); |
| } |
| |
| /* If there are any fixit hints on source line ROW, print them. |
| @@ -2303,7 +2512,7 @@ layout::print_trailing_fixits (linenum_t |
| { |
| /* Build a list of correction instances for the line, |
| potentially consolidating hints (for the sake of readability). */ |
| - line_corrections corrections (m_context, m_exploc.file, row); |
| + line_corrections corrections (m_policy, m_exploc.file, row); |
| for (unsigned int i = 0; i < m_fixit_hints.length (); i++) |
| { |
| const fixit_hint *hint = m_fixit_hints[i]; |
| @@ -2646,6 +2855,59 @@ namespace selftest { |
| |
| /* Selftests for diagnostic_show_locus. */ |
| |
| +/* Verify that cpp_display_width correctly handles escaping. */ |
| + |
| +static void |
| +test_display_widths () |
| +{ |
| + gcc_rich_location richloc (UNKNOWN_LOCATION); |
| + |
| + /* U+03C0 "GREEK SMALL LETTER PI". */ |
| + const char *pi = "\xCF\x80"; |
| + /* U+1F642 "SLIGHTLY SMILING FACE". */ |
| + const char *emoji = "\xF0\x9F\x99\x82"; |
| + /* Stray trailing byte of a UTF-8 character. */ |
| + const char *stray = "\xBF"; |
| + /* U+10FFFF. */ |
| + const char *max_codepoint = "\xF4\x8F\xBF\xBF"; |
| + |
| + /* No escaping. */ |
| + { |
| + test_diagnostic_context dc; |
| + char_display_policy policy (make_policy (dc, richloc)); |
| + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 1); |
| + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 2); |
| + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 1); |
| + /* Don't check width of U+10FFFF; it's in a private use plane. */ |
| + } |
| + |
| + richloc.set_escape_on_output (true); |
| + |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| + char_display_policy policy (make_policy (dc, richloc)); |
| + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); |
| + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 9); |
| + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); |
| + ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), |
| + policy), |
| + strlen ("<U+10FFFF>")); |
| + } |
| + |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| + char_display_policy policy (make_policy (dc, richloc)); |
| + ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8); |
| + ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 16); |
| + ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4); |
| + ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint), |
| + policy), |
| + 16); |
| + } |
| +} |
| + |
| /* For precise tests of the layout, make clear where the source line will |
| start. test_left_margin sets the total byte count from the left side of the |
| screen to the start of source lines, after the line number and the separator, |
| @@ -2715,10 +2977,10 @@ test_layout_x_offset_display_utf8 (const |
| char_span lspan = location_get_source_line (tmp.get_filename (), 1); |
| ASSERT_EQ (line_display_cols, |
| cpp_display_width (lspan.get_buffer (), lspan.length (), |
| - def_tabstop)); |
| + def_policy ())); |
| ASSERT_EQ (line_display_cols, |
| location_compute_display_column (expand_location (line_end), |
| - def_tabstop)); |
| + def_policy ())); |
| ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1), |
| "\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8)); |
| |
| @@ -2866,12 +3128,13 @@ test_layout_x_offset_display_tab (const |
| ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1))); |
| for (int tabstop = 1; tabstop != num_tabstops; ++tabstop) |
| { |
| + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| ASSERT_EQ (line_bytes + extra_width[tabstop], |
| cpp_display_width (lspan.get_buffer (), lspan.length (), |
| - tabstop)); |
| + policy)); |
| ASSERT_EQ (line_bytes + extra_width[tabstop], |
| location_compute_display_column (expand_location (line_end), |
| - tabstop)); |
| + policy)); |
| } |
| |
| /* Check that the tab is expanded to the expected number of spaces. */ |
| @@ -4003,6 +4266,43 @@ test_one_liner_labels_utf8 () |
| " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n", |
| pp_formatted_text (dc.printer)); |
| } |
| + |
| + /* Example of escaping the source lines. */ |
| + { |
| + text_range_label label0 ("label 0\xf0\x9f\x98\x82"); |
| + text_range_label label1 ("label 1\xcf\x80"); |
| + text_range_label label2 ("label 2\xcf\x80"); |
| + gcc_rich_location richloc (foo, &label0); |
| + richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); |
| + richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); |
| + richloc.set_escape_on_output (true); |
| + |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" <U+1F602>_foo = <U+03C0>_bar.<U+1F602>_field<U+03C0>;\n" |
| + " ^~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~\n" |
| + " | | |\n" |
| + " | | label 2\xcf\x80\n" |
| + " | label 1\xcf\x80\n" |
| + " label 0\xf0\x9f\x98\x82\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ |
| + (" <f0><9f><98><82>_foo = <cf><80>_bar.<f0><9f><98><82>_field<cf><80>;\n" |
| + " ^~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" |
| + " | | |\n" |
| + " | | label 2\xcf\x80\n" |
| + " | label 1\xcf\x80\n" |
| + " label 0\xf0\x9f\x98\x82\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + } |
| } |
| |
| /* Make sure that colorization codes don't interrupt a multibyte |
| @@ -4057,9 +4357,9 @@ test_diagnostic_show_locus_one_liner_utf |
| |
| char_span lspan = location_get_source_line (tmp.get_filename (), 1); |
| ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (), |
| - def_tabstop)); |
| + def_policy ())); |
| ASSERT_EQ (25, location_compute_display_column (expand_location (line_end), |
| - def_tabstop)); |
| + def_policy ())); |
| |
| test_one_liner_simple_caret_utf8 (); |
| test_one_liner_caret_and_range_utf8 (); |
| @@ -4445,30 +4745,31 @@ test_overlapped_fixit_printing (const li |
| pp_formatted_text (dc.printer)); |
| |
| /* Unit-test the line_corrections machinery. */ |
| + char_display_policy policy (make_policy (dc, richloc)); |
| ASSERT_EQ (3, richloc.get_num_fixit_hints ()); |
| const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| ASSERT_EQ (column_range (12, 12), |
| - get_affected_range (&dc, hint_0, CU_BYTES)); |
| + get_affected_range (policy, hint_0, CU_BYTES)); |
| ASSERT_EQ (column_range (12, 12), |
| - get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); |
| + get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); |
| const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| ASSERT_EQ (column_range (18, 18), |
| - get_affected_range (&dc, hint_1, CU_BYTES)); |
| + get_affected_range (policy, hint_1, CU_BYTES)); |
| ASSERT_EQ (column_range (18, 18), |
| - get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); |
| + get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); |
| const fixit_hint *hint_2 = richloc.get_fixit_hint (2); |
| ASSERT_EQ (column_range (29, 28), |
| - get_affected_range (&dc, hint_2, CU_BYTES)); |
| + get_affected_range (policy, hint_2, CU_BYTES)); |
| ASSERT_EQ (column_range (29, 28), |
| - get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2)); |
| + get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (29, 29), get_printed_columns (policy, hint_2)); |
| |
| /* Add each hint in turn to a line_corrections instance, |
| and verify that they are consolidated into one correction instance |
| as expected. */ |
| - line_corrections lc (&dc, tmp.get_filename (), 1); |
| + line_corrections lc (policy, tmp.get_filename (), 1); |
| |
| /* The first replace hint by itself. */ |
| lc.add_hint (hint_0); |
| @@ -4660,30 +4961,31 @@ test_overlapped_fixit_printing_utf8 (con |
| pp_formatted_text (dc.printer)); |
| |
| /* Unit-test the line_corrections machinery. */ |
| + char_display_policy policy (make_policy (dc, richloc)); |
| ASSERT_EQ (3, richloc.get_num_fixit_hints ()); |
| const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| ASSERT_EQ (column_range (14, 14), |
| - get_affected_range (&dc, hint_0, CU_BYTES)); |
| + get_affected_range (policy, hint_0, CU_BYTES)); |
| ASSERT_EQ (column_range (12, 12), |
| - get_affected_range (&dc, hint_0, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0)); |
| + get_affected_range (policy, hint_0, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0)); |
| const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| ASSERT_EQ (column_range (22, 22), |
| - get_affected_range (&dc, hint_1, CU_BYTES)); |
| + get_affected_range (policy, hint_1, CU_BYTES)); |
| ASSERT_EQ (column_range (18, 18), |
| - get_affected_range (&dc, hint_1, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1)); |
| + get_affected_range (policy, hint_1, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1)); |
| const fixit_hint *hint_2 = richloc.get_fixit_hint (2); |
| ASSERT_EQ (column_range (35, 34), |
| - get_affected_range (&dc, hint_2, CU_BYTES)); |
| + get_affected_range (policy, hint_2, CU_BYTES)); |
| ASSERT_EQ (column_range (30, 29), |
| - get_affected_range (&dc, hint_2, CU_DISPLAY_COLS)); |
| - ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2)); |
| + get_affected_range (policy, hint_2, CU_DISPLAY_COLS)); |
| + ASSERT_EQ (column_range (30, 30), get_printed_columns (policy, hint_2)); |
| |
| /* Add each hint in turn to a line_corrections instance, |
| and verify that they are consolidated into one correction instance |
| as expected. */ |
| - line_corrections lc (&dc, tmp.get_filename (), 1); |
| + line_corrections lc (policy, tmp.get_filename (), 1); |
| |
| /* The first replace hint by itself. */ |
| lc.add_hint (hint_0); |
| @@ -4877,15 +5179,16 @@ test_overlapped_fixit_printing_2 (const |
| richloc.add_fixit_insert_before (col_21, "}"); |
| |
| /* These fixits should be accepted; they can't be consolidated. */ |
| + char_display_policy policy (make_policy (dc, richloc)); |
| ASSERT_EQ (2, richloc.get_num_fixit_hints ()); |
| const fixit_hint *hint_0 = richloc.get_fixit_hint (0); |
| ASSERT_EQ (column_range (23, 22), |
| - get_affected_range (&dc, hint_0, CU_BYTES)); |
| - ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0)); |
| + get_affected_range (policy, hint_0, CU_BYTES)); |
| + ASSERT_EQ (column_range (23, 23), get_printed_columns (policy, hint_0)); |
| const fixit_hint *hint_1 = richloc.get_fixit_hint (1); |
| ASSERT_EQ (column_range (21, 20), |
| - get_affected_range (&dc, hint_1, CU_BYTES)); |
| - ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1)); |
| + get_affected_range (policy, hint_1, CU_BYTES)); |
| + ASSERT_EQ (column_range (21, 21), get_printed_columns (policy, hint_1)); |
| |
| /* Verify that they're printed correctly. */ |
| diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| @@ -5152,10 +5455,11 @@ test_tab_expansion (const line_table_cas |
| ....................123 45678901234 56789012345 columns */ |
| |
| const int tabstop = 8; |
| + cpp_char_column_policy policy (tabstop, cpp_wcwidth); |
| const int first_non_ws_byte_col = 7; |
| const int right_quote_byte_col = 15; |
| const int last_byte_col = 25; |
| - ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop)); |
| + ASSERT_EQ (35, cpp_display_width (content, last_byte_col, policy)); |
| |
| temp_source_file tmp (SELFTEST_LOCATION, ".c", content); |
| line_table_test ltt (case_); |
| @@ -5198,6 +5502,114 @@ test_tab_expansion (const line_table_cas |
| } |
| } |
| |
| +/* Verify that the escaping machinery can cope with a variety of different |
| + invalid bytes. */ |
| + |
| +static void |
| +test_escaping_bytes_1 (const line_table_case &case_) |
| +{ |
| + const char content[] = "before\0\1\2\3\r\x80\xff""after\n"; |
| + const size_t sz = sizeof (content); |
| + temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); |
| + line_table_test ltt (case_); |
| + const line_map_ordinary *ord_map = linemap_check_ordinary |
| + (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); |
| + linemap_line_start (line_table, 1, 100); |
| + |
| + location_t finish |
| + = linemap_position_for_line_and_column (line_table, ord_map, 1, |
| + strlen (content)); |
| + |
| + if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) |
| + return; |
| + |
| + /* Locations of the NUL and \r bytes. */ |
| + location_t nul_loc |
| + = linemap_position_for_line_and_column (line_table, ord_map, 1, 7); |
| + location_t r_loc |
| + = linemap_position_for_line_and_column (line_table, ord_map, 1, 11); |
| + gcc_rich_location richloc (nul_loc); |
| + richloc.add_range (r_loc); |
| + |
| + { |
| + test_diagnostic_context dc; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" before \1\2\3 \x80\xff""after\n" |
| + " ^ ~\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + richloc.set_escape_on_output (true); |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ |
| + (" before<U+0000><U+0001><U+0002><U+0003><U+000D><80><ff>after\n" |
| + " ^~~~~~~~ ~~~~~~~~\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" before<00><01><02><03><0d><80><ff>after\n" |
| + " ^~~~ ~~~~\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| +} |
| + |
| +/* As above, but verify that we handle the initial byte of a line |
| + correctly. */ |
| + |
| +static void |
| +test_escaping_bytes_2 (const line_table_case &case_) |
| +{ |
| + const char content[] = "\0after\n"; |
| + const size_t sz = sizeof (content); |
| + temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz); |
| + line_table_test ltt (case_); |
| + const line_map_ordinary *ord_map = linemap_check_ordinary |
| + (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0)); |
| + linemap_line_start (line_table, 1, 100); |
| + |
| + location_t finish |
| + = linemap_position_for_line_and_column (line_table, ord_map, 1, |
| + strlen (content)); |
| + |
| + if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS) |
| + return; |
| + |
| + /* Location of the NUL byte. */ |
| + location_t nul_loc |
| + = linemap_position_for_line_and_column (line_table, ord_map, 1, 1); |
| + gcc_rich_location richloc (nul_loc); |
| + |
| + { |
| + test_diagnostic_context dc; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" after\n" |
| + " ^\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + richloc.set_escape_on_output (true); |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" <U+0000>after\n" |
| + " ^~~~~~~~\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| + { |
| + test_diagnostic_context dc; |
| + dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES; |
| + diagnostic_show_locus (&dc, &richloc, DK_ERROR); |
| + ASSERT_STREQ (" <00>after\n" |
| + " ^~~~\n", |
| + pp_formatted_text (dc.printer)); |
| + } |
| +} |
| + |
| /* Verify that line numbers are correctly printed for the case of |
| a multiline range in which the width of the line numbers changes |
| (e.g. from "9" to "10"). */ |
| @@ -5254,6 +5666,8 @@ diagnostic_show_locus_c_tests () |
| test_layout_range_for_single_line (); |
| test_layout_range_for_multiple_lines (); |
| |
| + test_display_widths (); |
| + |
| for_each_line_table_case (test_layout_x_offset_display_utf8); |
| for_each_line_table_case (test_layout_x_offset_display_tab); |
| |
| @@ -5274,6 +5688,8 @@ diagnostic_show_locus_c_tests () |
| for_each_line_table_case (test_fixit_replace_containing_newline); |
| for_each_line_table_case (test_fixit_deletion_affecting_newline); |
| for_each_line_table_case (test_tab_expansion); |
| + for_each_line_table_case (test_escaping_bytes_1); |
| + for_each_line_table_case (test_escaping_bytes_2); |
| |
| test_line_numbers_multiline_range (); |
| } |
| diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi |
| --- a/gcc/doc/invoke.texi 2021-12-13 23:23:05.764437151 -0800 |
| +++ b/gcc/doc/invoke.texi 2021-12-14 01:16:01.553943061 -0800 |
| @@ -312,7 +312,8 @@ Objective-C and Objective-C++ Dialects}. |
| -fdiagnostics-show-path-depths @gol |
| -fno-show-column @gol |
| -fdiagnostics-column-unit=@r{[}display@r{|}byte@r{]} @gol |
| --fdiagnostics-column-origin=@var{origin}} |
| +-fdiagnostics-column-origin=@var{origin} @gol |
| +-fdiagnostics-escape-format=@r{[}unicode@r{|}bytes@r{]}} |
| |
| @item Warning Options |
| @xref{Warning Options,,Options to Request or Suppress Warnings}. |
| @@ -5083,6 +5084,38 @@ first column. The default value of 1 co |
| behavior and to the GNU style guide. Some utilities may perform better with an |
| origin of 0; any non-negative value may be specified. |
| |
| +@item -fdiagnostics-escape-format=@var{FORMAT} |
| +@opindex fdiagnostics-escape-format |
| +When GCC prints pertinent source lines for a diagnostic it normally attempts |
| +to print the source bytes directly. However, some diagnostics relate to encoding |
| +issues in the source file, such as malformed UTF-8, or issues with Unicode |
| +normalization. These diagnostics are flagged so that GCC will escape bytes |
| +that are not printable ASCII when printing their pertinent source lines. |
| + |
| +This option controls how such bytes should be escaped. |
| + |
| +The default @var{FORMAT}, @samp{unicode}, displays Unicode characters that |
| +are not printable ASCII in the form @samp{<U+XXXX>}, and bytes that do not |
| +correspond to a Unicode character validly encoded in UTF-8 will be |
| +displayed as hexadecimal in the form @samp{<XX>}. |
| + |
| +For example, a source line containing the string @samp{before} followed by the |
| +Unicode character U+03C0 (``GREEK SMALL LETTER PI'', with UTF-8 encoding |
| +0xCF 0x80) followed by the byte 0xBF (a stray UTF-8 trailing byte), followed by |
| +the string @samp{after} will be printed for such a diagnostic as: |
| + |
| +@smallexample |
| + before<U+03C0><bf>after |
| +@end smallexample |
| + |
| +Setting @var{FORMAT} to @samp{bytes} will display all non-printable-ASCII bytes |
| +in the form @samp{<XX>}, thus showing the underlying encoding of non-ASCII |
| +Unicode characters. For the example above, the following will be printed: |
| + |
| +@smallexample |
| + before<cf><80><bf>after |
| +@end smallexample |
| + |
| @item -fdiagnostics-format=@var{FORMAT} |
| @opindex fdiagnostics-format |
| Select a different format for printing diagnostics. |
| @@ -5150,9 +5183,11 @@ might be printed in JSON form (after for |
| @} |
| @} |
| ], |
| + "escape-source": false, |
| "message": "...this statement, but the latter is @dots{}" |
| @} |
| ] |
| + "escape-source": false, |
| "column-origin": 1, |
| @}, |
| @dots{} |
| @@ -5239,6 +5274,7 @@ of the expression, which have labels. I |
| "label": "T @{aka struct t@}" |
| @} |
| ], |
| + "escape-source": false, |
| "message": "invalid operands to binary + @dots{}" |
| @} |
| @end smallexample |
| @@ -5292,6 +5328,7 @@ might be printed in JSON form as: |
| @} |
| @} |
| ], |
| + "escape-source": false, |
| "message": "\u2018struct s\u2019 has no member named @dots{}" |
| @} |
| @end smallexample |
| @@ -5349,6 +5386,10 @@ For example, the intraprocedural example |
| ] |
| @end smallexample |
| |
| +Diagnostics have a boolean attribute @code{escape-source}, hinting whether |
| +non-ASCII bytes should be escaped when printing the pertinent lines of |
| +source code (@code{true} for diagnostics involving source encoding issues). |
| + |
| @end table |
| |
| @node Warning Options |
| diff --git a/gcc/input.c b/gcc/input.c |
| --- a/gcc/input.c 2021-07-27 23:55:07.328287915 -0700 |
| +++ b/gcc/input.c 2021-12-14 01:16:01.553943061 -0800 |
| @@ -913,7 +913,8 @@ make_location (location_t caret, source_ |
| source line in order to calculate the display width. If that cannot be done |
| for any reason, then returns the byte column as a fallback. */ |
| int |
| -location_compute_display_column (expanded_location exploc, int tabstop) |
| +location_compute_display_column (expanded_location exploc, |
| + const cpp_char_column_policy &policy) |
| { |
| if (!(exploc.file && *exploc.file && exploc.line && exploc.column)) |
| return exploc.column; |
| @@ -921,7 +922,7 @@ location_compute_display_column (expande |
| /* If line is NULL, this function returns exploc.column which is the |
| desired fallback. */ |
| return cpp_byte_column_to_display_column (line.get_buffer (), line.length (), |
| - exploc.column, tabstop); |
| + exploc.column, policy); |
| } |
| |
| /* Dump statistics to stderr about the memory usage of the line_table |
| @@ -3611,43 +3612,50 @@ test_line_offset_overflow () |
| void test_cpp_utf8 () |
| { |
| const int def_tabstop = 8; |
| + cpp_char_column_policy policy (def_tabstop, cpp_wcwidth); |
| + |
| /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */ |
| { |
| - int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop); |
| + int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy); |
| ASSERT_EQ (8, w_bad); |
| - int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop); |
| + int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy); |
| ASSERT_EQ (5, w_ctrl); |
| } |
| |
| /* Verify that wcwidth of valid UTF-8 is as expected. */ |
| { |
| - const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop); |
| + const int w_pi = cpp_display_width ("\xcf\x80", 2, policy); |
| ASSERT_EQ (1, w_pi); |
| - const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop); |
| + const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy); |
| ASSERT_EQ (2, w_emoji); |
| const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2, |
| - def_tabstop); |
| + policy); |
| ASSERT_EQ (1, w_umlaut_precomposed); |
| const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3, |
| - def_tabstop); |
| + policy); |
| ASSERT_EQ (1, w_umlaut_combining); |
| - const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop); |
| + const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy); |
| ASSERT_EQ (2, w_han); |
| - const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop); |
| + const int w_ascii = cpp_display_width ("GCC", 3, policy); |
| ASSERT_EQ (3, w_ascii); |
| const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82" |
| "\x9f! \xe4\xb8\xba y\xcc\x88", |
| - 24, def_tabstop); |
| + 24, policy); |
| ASSERT_EQ (18, w_mixed); |
| } |
| |
| /* Verify that display width properly expands tabs. */ |
| { |
| const char *tstr = "\tabc\td"; |
| - ASSERT_EQ (6, cpp_display_width (tstr, 6, 1)); |
| - ASSERT_EQ (10, cpp_display_width (tstr, 6, 3)); |
| - ASSERT_EQ (17, cpp_display_width (tstr, 6, 8)); |
| - ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8)); |
| + ASSERT_EQ (6, cpp_display_width (tstr, 6, |
| + cpp_char_column_policy (1, cpp_wcwidth))); |
| + ASSERT_EQ (10, cpp_display_width (tstr, 6, |
| + cpp_char_column_policy (3, cpp_wcwidth))); |
| + ASSERT_EQ (17, cpp_display_width (tstr, 6, |
| + cpp_char_column_policy (8, cpp_wcwidth))); |
| + ASSERT_EQ (1, |
| + cpp_display_column_to_byte_column |
| + (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth))); |
| } |
| |
| /* Verify that cpp_byte_column_to_display_column can go past the end, |
| @@ -3660,13 +3668,13 @@ void test_cpp_utf8 () |
| /* 111122223456 |
| Byte columns. */ |
| |
| - ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop)); |
| + ASSERT_EQ (5, cpp_display_width (str, 6, policy)); |
| ASSERT_EQ (105, |
| - cpp_byte_column_to_display_column (str, 6, 106, def_tabstop)); |
| + cpp_byte_column_to_display_column (str, 6, 106, policy)); |
| ASSERT_EQ (10000, |
| - cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop)); |
| + cpp_byte_column_to_display_column (NULL, 0, 10000, policy)); |
| ASSERT_EQ (0, |
| - cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop)); |
| + cpp_byte_column_to_display_column (NULL, 10000, 0, policy)); |
| } |
| |
| /* Verify that cpp_display_column_to_byte_column can go past the end, |
| @@ -3680,25 +3688,25 @@ void test_cpp_utf8 () |
| /* 000000000000000000000000000000000111111 |
| 111122223333444456666777788889999012345 |
| Byte columns. */ |
| - ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop)); |
| + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy)); |
| ASSERT_EQ (15, |
| - cpp_display_column_to_byte_column (str, 15, 11, def_tabstop)); |
| + cpp_display_column_to_byte_column (str, 15, 11, policy)); |
| ASSERT_EQ (115, |
| - cpp_display_column_to_byte_column (str, 15, 111, def_tabstop)); |
| + cpp_display_column_to_byte_column (str, 15, 111, policy)); |
| ASSERT_EQ (10000, |
| - cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop)); |
| + cpp_display_column_to_byte_column (NULL, 0, 10000, policy)); |
| ASSERT_EQ (0, |
| - cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop)); |
| + cpp_display_column_to_byte_column (NULL, 10000, 0, policy)); |
| |
| /* Verify that we do not interrupt a UTF-8 sequence. */ |
| - ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop)); |
| + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy)); |
| |
| for (int byte_col = 1; byte_col <= 15; ++byte_col) |
| { |
| const int disp_col |
| - = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop); |
| + = cpp_byte_column_to_display_column (str, 15, byte_col, policy); |
| const int byte_col2 |
| - = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop); |
| + = cpp_display_column_to_byte_column (str, 15, disp_col, policy); |
| |
| /* If we ask for the display column in the middle of a UTF-8 |
| sequence, it will return the length of the partial sequence, |
| diff --git a/gcc/input.h b/gcc/input.h |
| --- a/gcc/input.h 2021-07-27 23:55:07.328287915 -0700 |
| +++ b/gcc/input.h 2021-12-14 01:16:01.553943061 -0800 |
| @@ -39,8 +39,11 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER |
| extern bool is_location_from_builtin_token (location_t); |
| extern expanded_location expand_location (location_t); |
| |
| -extern int location_compute_display_column (expanded_location exploc, |
| - int tabstop); |
| +class cpp_char_column_policy; |
| + |
| +extern int |
| +location_compute_display_column (expanded_location exploc, |
| + const cpp_char_column_policy &policy); |
| |
| /* A class capturing the bounds of a buffer, to allow for run-time |
| bounds-checking in a checked build. */ |
| diff --git a/gcc/opts.c b/gcc/opts.c |
| --- a/gcc/opts.c 2021-07-27 23:55:07.364288417 -0700 |
| +++ b/gcc/opts.c 2021-12-14 01:16:01.553943061 -0800 |
| @@ -2573,6 +2573,10 @@ common_handle_option (struct gcc_options |
| dc->column_origin = value; |
| break; |
| |
| + case OPT_fdiagnostics_escape_format_: |
| + dc->escape_format = (enum diagnostics_escape_format)value; |
| + break; |
| + |
| case OPT_fdiagnostics_show_cwe: |
| dc->show_cwe = value; |
| break; |
| diff --git a/gcc/selftest.c b/gcc/selftest.c |
| --- a/gcc/selftest.c 2021-07-27 23:55:07.500290315 -0700 |
| +++ b/gcc/selftest.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -193,6 +193,21 @@ temp_source_file::temp_source_file (cons |
| fclose (out); |
| } |
| |
| +/* As above, but with a size, to allow for NUL bytes in CONTENT. */ |
| + |
| +temp_source_file::temp_source_file (const location &loc, |
| + const char *suffix, |
| + const char *content, |
| + size_t sz) |
| +: named_temp_file (suffix) |
| +{ |
| + FILE *out = fopen (get_filename (), "w"); |
| + if (!out) |
| + fail_formatted (loc, "unable to open tempfile: %s", get_filename ()); |
| + fwrite (content, sz, 1, out); |
| + fclose (out); |
| +} |
| + |
| /* Avoid introducing locale-specific differences in the results |
| by hardcoding open_quote and close_quote. */ |
| |
| diff --git a/gcc/selftest.h b/gcc/selftest.h |
| --- a/gcc/selftest.h 2021-07-27 23:55:07.500290315 -0700 |
| +++ b/gcc/selftest.h 2021-12-14 01:16:01.557942991 -0800 |
| @@ -112,6 +112,8 @@ class temp_source_file : public named_te |
| public: |
| temp_source_file (const location &loc, const char *suffix, |
| const char *content); |
| + temp_source_file (const location &loc, const char *suffix, |
| + const char *content, size_t sz); |
| }; |
| |
| /* RAII-style class for avoiding introducing locale-specific differences |
| diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c |
| --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-07-27 23:55:07.596291654 -0700 |
| +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| /* { dg-regexp "\"kind\": \"error\"" } */ |
| /* { dg-regexp "\"column-origin\": 1" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| /* { dg-regexp "\"message\": \"#error message\"" } */ |
| |
| /* { dg-regexp "\"caret\": \{" } */ |
| diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c |
| --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-07-27 23:55:07.596291654 -0700 |
| +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| /* { dg-regexp "\"kind\": \"warning\"" } */ |
| /* { dg-regexp "\"column-origin\": 1" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| /* { dg-regexp "\"message\": \"#warning message\"" } */ |
| /* { dg-regexp "\"option\": \"-Wcpp\"" } */ |
| /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ |
| diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c |
| --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-07-27 23:55:07.596291654 -0700 |
| +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| /* { dg-regexp "\"kind\": \"error\"" } */ |
| /* { dg-regexp "\"column-origin\": 1" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| /* { dg-regexp "\"message\": \"#warning message\"" } */ |
| /* { dg-regexp "\"option\": \"-Werror=cpp\"" } */ |
| /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */ |
| diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c |
| --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-07-27 23:55:07.596291654 -0700 |
| +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -19,6 +19,7 @@ int test (void) |
| |
| /* { dg-regexp "\"kind\": \"note\"" } */ |
| /* { dg-regexp "\"message\": \"...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'\"" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| |
| /* { dg-regexp "\"caret\": \{" } */ |
| /* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */ |
| @@ -39,6 +40,7 @@ int test (void) |
| /* { dg-regexp "\"kind\": \"warning\"" } */ |
| /* { dg-regexp "\"column-origin\": 1" } */ |
| /* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| /* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */ |
| /* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */ |
| |
| diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c |
| --- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-07-27 23:55:07.596291654 -0700 |
| +++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -14,6 +14,7 @@ int test (struct s *ptr) |
| |
| /* { dg-regexp "\"kind\": \"error\"" } */ |
| /* { dg-regexp "\"column-origin\": 1" } */ |
| +/* { dg-regexp "\"escape-source\": false" } */ |
| /* { dg-regexp "\"message\": \".*\"" } */ |
| |
| /* Verify fix-it hints. */ |
| diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c |
| --- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1969-12-31 16:00:00.000000000 -0800 |
| +++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -0,0 +1,21 @@ |
| +// { dg-do preprocess } |
| +// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=bytes" } |
| +/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ |
| + |
| +/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. |
| + U+0F42 TIBETAN LETTER GA: ག |
| + U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ |
| + |
| + The UTF-8 encoding of U+0F43 TIBETAN LETTER GHA is: E0 BD 83. */ |
| + |
| +foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| +/* { dg-begin-multiline-output "" } |
| + foo before_\u0F43_after bar |
| + ^~~~~~~~~~~~~~~~~~~ |
| + { dg-end-multiline-output "" } */ |
| + |
| +foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| +/* { dg-begin-multiline-output "" } |
| + foo before_<e0><bd><83>_after bar |
| + ^~~~~~~~~~~~~~~~~~~~~~~~~ |
| + { dg-end-multiline-output "" } */ |
| diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c |
| --- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1969-12-31 16:00:00.000000000 -0800 |
| +++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -0,0 +1,19 @@ |
| +// { dg-do preprocess } |
| +// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=unicode" } |
| +/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */ |
| + |
| +/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e. |
| + U+0F42 TIBETAN LETTER GA: ག |
| + U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ */ |
| + |
| +foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| +/* { dg-begin-multiline-output "" } |
| + foo before_\u0F43_after bar |
| + ^~~~~~~~~~~~~~~~~~~ |
| + { dg-end-multiline-output "" } */ |
| + |
| +foo before_གྷ_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." } |
| +/* { dg-begin-multiline-output "" } |
| + foo before_<U+0F43>_after bar |
| + ^~~~~~~~~~~~~~~~~~~~~ |
| + { dg-end-multiline-output "" } */ |
| diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 |
| --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-07-27 23:55:08.472303878 -0700 |
| +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| ! { dg-regexp "\"kind\": \"error\"" } |
| ! { dg-regexp "\"column-origin\": 1" } |
| +! { dg-regexp "\"escape-source\": false" } |
| ! { dg-regexp "\"message\": \"#error message\"" } |
| |
| ! { dg-regexp "\"caret\": \{" } |
| diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 |
| --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-07-27 23:55:08.472303878 -0700 |
| +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| ! { dg-regexp "\"kind\": \"warning\"" } |
| ! { dg-regexp "\"column-origin\": 1" } |
| +! { dg-regexp "\"escape-source\": false" } |
| ! { dg-regexp "\"message\": \"#warning message\"" } |
| ! { dg-regexp "\"option\": \"-Wcpp\"" } |
| ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } |
| diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 |
| --- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-07-27 23:55:08.472303878 -0700 |
| +++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-14 01:16:01.557942991 -0800 |
| @@ -9,6 +9,7 @@ |
| |
| ! { dg-regexp "\"kind\": \"error\"" } |
| ! { dg-regexp "\"column-origin\": 1" } |
| +! { dg-regexp "\"escape-source\": false" } |
| ! { dg-regexp "\"message\": \"#warning message\"" } |
| ! { dg-regexp "\"option\": \"-Werror=cpp\"" } |
| ! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" } |
| diff --git a/libcpp/charset.c b/libcpp/charset.c |
| --- a/libcpp/charset.c 2021-07-27 23:55:08.712307227 -0700 |
| +++ b/libcpp/charset.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -1552,12 +1552,14 @@ convert_escape (cpp_reader *pfile, const |
| "unknown escape sequence: '\\%c'", (int) c); |
| else |
| { |
| + encoding_rich_location rich_loc (pfile); |
| + |
| /* diagnostic.c does not support "%03o". When it does, this |
| code can use %03o directly in the diagnostic again. */ |
| char buf[32]; |
| sprintf(buf, "%03o", (int) c); |
| - cpp_error (pfile, CPP_DL_PEDWARN, |
| - "unknown escape sequence: '\\%s'", buf); |
| + cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc, |
| + "unknown escape sequence: '\\%s'", buf); |
| } |
| } |
| |
| @@ -2280,14 +2282,16 @@ cpp_string_location_reader::get_next () |
| } |
| |
| cpp_display_width_computation:: |
| -cpp_display_width_computation (const char *data, int data_length, int tabstop) : |
| +cpp_display_width_computation (const char *data, int data_length, |
| + const cpp_char_column_policy &policy) : |
| m_begin (data), |
| m_next (m_begin), |
| m_bytes_left (data_length), |
| - m_tabstop (tabstop), |
| + m_policy (policy), |
| m_display_cols (0) |
| { |
| - gcc_assert (m_tabstop > 0); |
| + gcc_assert (policy.m_tabstop > 0); |
| + gcc_assert (policy.m_width_cb); |
| } |
| |
| |
| @@ -2299,19 +2303,28 @@ cpp_display_width_computation (const cha |
| point to a valid UTF-8-encoded sequence, then it will be treated as a single |
| byte with display width 1. m_cur_display_col is the current display column, |
| relative to which tab stops should be expanded. Returns the display width of |
| - the codepoint just processed. */ |
| + the codepoint just processed. |
| + If OUT is non-NULL, it is populated. */ |
| |
| int |
| -cpp_display_width_computation::process_next_codepoint () |
| +cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out) |
| { |
| cppchar_t c; |
| int next_width; |
| |
| + if (out) |
| + out->m_start_byte = m_next; |
| + |
| if (*m_next == '\t') |
| { |
| ++m_next; |
| --m_bytes_left; |
| - next_width = m_tabstop - (m_display_cols % m_tabstop); |
| + next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop); |
| + if (out) |
| + { |
| + out->m_ch = '\t'; |
| + out->m_valid_ch = true; |
| + } |
| } |
| else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c) |
| != 0) |
| @@ -2321,14 +2334,24 @@ cpp_display_width_computation::process_n |
| of one. */ |
| ++m_next; |
| --m_bytes_left; |
| - next_width = 1; |
| + next_width = m_policy.m_undecoded_byte_width; |
| + if (out) |
| + out->m_valid_ch = false; |
| } |
| else |
| { |
| /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */ |
| - next_width = cpp_wcwidth (c); |
| + next_width = m_policy.m_width_cb (c); |
| + if (out) |
| + { |
| + out->m_ch = c; |
| + out->m_valid_ch = true; |
| + } |
| } |
| |
| + if (out) |
| + out->m_next_byte = m_next; |
| + |
| m_display_cols += next_width; |
| return next_width; |
| } |
| @@ -2344,7 +2367,7 @@ cpp_display_width_computation::advance_d |
| const int start = m_display_cols; |
| const int target = start + n; |
| while (m_display_cols < target && !done ()) |
| - process_next_codepoint (); |
| + process_next_codepoint (NULL); |
| return m_display_cols - start; |
| } |
| |
| @@ -2352,29 +2375,33 @@ cpp_display_width_computation::advance_d |
| how many display columns are occupied by the first COLUMN bytes. COLUMN |
| may exceed DATA_LENGTH, in which case the phantom bytes at the end are |
| treated as if they have display width 1. Tabs are expanded to the next tab |
| - stop, relative to the start of DATA. */ |
| + stop, relative to the start of DATA, and non-printable-ASCII characters |
| + will be escaped as per POLICY. */ |
| |
| int |
| cpp_byte_column_to_display_column (const char *data, int data_length, |
| - int column, int tabstop) |
| + int column, |
| + const cpp_char_column_policy &policy) |
| { |
| const int offset = MAX (0, column - data_length); |
| - cpp_display_width_computation dw (data, column - offset, tabstop); |
| + cpp_display_width_computation dw (data, column - offset, policy); |
| while (!dw.done ()) |
| - dw.process_next_codepoint (); |
| + dw.process_next_codepoint (NULL); |
| return dw.display_cols_processed () + offset; |
| } |
| |
| /* For the string of length DATA_LENGTH bytes that begins at DATA, compute |
| the least number of bytes that will result in at least DISPLAY_COL display |
| columns. The return value may exceed DATA_LENGTH if the entire string does |
| - not occupy enough display columns. */ |
| + not occupy enough display columns. Non-printable-ASCII characters |
| + will be escaped as per POLICY. */ |
| |
| int |
| cpp_display_column_to_byte_column (const char *data, int data_length, |
| - int display_col, int tabstop) |
| + int display_col, |
| + const cpp_char_column_policy &policy) |
| { |
| - cpp_display_width_computation dw (data, data_length, tabstop); |
| + cpp_display_width_computation dw (data, data_length, policy); |
| const int avail_display = dw.advance_display_cols (display_col); |
| return dw.bytes_processed () + MAX (0, display_col - avail_display); |
| } |
| diff --git a/libcpp/errors.c b/libcpp/errors.c |
| --- a/libcpp/errors.c 2021-07-27 23:55:08.712307227 -0700 |
| +++ b/libcpp/errors.c 2021-12-14 01:16:01.557942991 -0800 |
| @@ -27,6 +27,31 @@ along with this program; see the file CO |
| #include "cpplib.h" |
| #include "internal.h" |
| |
| +/* Get a location_t for the current location in PFILE, |
| + generally that of the previously lexed token. */ |
| + |
| +location_t |
| +cpp_diagnostic_get_current_location (cpp_reader *pfile) |
| +{ |
| + if (CPP_OPTION (pfile, traditional)) |
| + { |
| + if (pfile->state.in_directive) |
| + return pfile->directive_line; |
| + else |
| + return pfile->line_table->highest_line; |
| + } |
| + /* We don't want to refer to a token before the beginning of the |
| + current run -- that is invalid. */ |
| + else if (pfile->cur_token == pfile->cur_run->base) |
| + { |
| + return 0; |
| + } |
| + else |
| + { |
| + return pfile->cur_token[-1].src_loc; |
| + } |
| +} |
| + |
| /* Print a diagnostic at the given location. */ |
| |
| ATTRIBUTE_FPTR_PRINTF(5,0) |
| @@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum |
| enum cpp_warning_reason reason, |
| const char *msgid, va_list *ap) |
| { |
| - location_t src_loc; |
| - |
| - if (CPP_OPTION (pfile, traditional)) |
| - { |
| - if (pfile->state.in_directive) |
| - src_loc = pfile->directive_line; |
| - else |
| - src_loc = pfile->line_table->highest_line; |
| - } |
| - /* We don't want to refer to a token before the beginning of the |
| - current run -- that is invalid. */ |
| - else if (pfile->cur_token == pfile->cur_run->base) |
| - { |
| - src_loc = 0; |
| - } |
| - else |
| - { |
| - src_loc = pfile->cur_token[-1].src_loc; |
| - } |
| + location_t src_loc = cpp_diagnostic_get_current_location (pfile); |
| rich_location richloc (pfile->line_table, src_loc); |
| return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap); |
| } |
| @@ -142,6 +149,43 @@ cpp_warning_syshdr (cpp_reader * pfile, |
| |
| va_end (ap); |
| return ret; |
| +} |
| + |
| +/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */ |
| + |
| +bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason, |
| + rich_location *richloc, const char *msgid, ...) |
| +{ |
| + va_list ap; |
| + bool ret; |
| + |
| + va_start (ap, msgid); |
| + |
| + ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc, |
| + msgid, &ap); |
| + |
| + va_end (ap); |
| + return ret; |
| + |
| +} |
| + |
| +/* As cpp_pedwarning above, but use RICHLOC as the location of the |
| + diagnostic. */ |
| + |
| +bool |
| +cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason, |
| + rich_location *richloc, const char *msgid, ...) |
| +{ |
| + va_list ap; |
| + bool ret; |
| + |
| + va_start (ap, msgid); |
| + |
| + ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc, |
| + msgid, &ap); |
| + |
| + va_end (ap); |
| + return ret; |
| } |
| |
| /* Print a diagnostic at a specific location. */ |
| diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h |
| --- a/libcpp/include/cpplib.h 2021-12-13 23:23:05.768437079 -0800 |
| +++ b/libcpp/include/cpplib.h 2021-12-14 01:20:16.189507386 -0800 |
| @@ -1275,6 +1275,14 @@ extern bool cpp_warning_syshdr (cpp_read |
| const char *msgid, ...) |
| ATTRIBUTE_PRINTF_3; |
| |
| +/* As their counterparts above, but use RICHLOC. */ |
| +extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason, |
| + rich_location *richloc, const char *msgid, ...) |
| + ATTRIBUTE_PRINTF_4; |
| +extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason, |
| + rich_location *richloc, const char *msgid, ...) |
| + ATTRIBUTE_PRINTF_4; |
| + |
| /* Output a diagnostic with "MSGID: " preceding the |
| error string of errno. No location is printed. */ |
| extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level, |
| @@ -1435,42 +1443,95 @@ extern const char * cpp_get_userdef_suff |
| |
| /* In charset.c */ |
| |
| +/* The result of attempting to decode a run of UTF-8 bytes. */ |
| + |
| +struct cpp_decoded_char |
| +{ |
| + const char *m_start_byte; |
| + const char *m_next_byte; |
| + |
| + bool m_valid_ch; |
| + cppchar_t m_ch; |
| +}; |
| + |
| +/* Information for mapping between code points and display columns. |
| + |
| + This is a tabstop value, along with a callback for getting the |
| + widths of characters. Normally this callback is cpp_wcwidth, but we |
| + support other schemes for escaping non-ASCII unicode as a series of |
| + ASCII chars when printing the user's source code in diagnostic-show-locus.c. |
| + |
| + For example, consider: |
| + - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80) |
| + - the Unicode character U+1F642 "SLIGHTLY SMILING FACE" |
| + (UTF-8: 0xF0 0x9F 0x99 0x82) |
| + - the byte 0xBF (a stray trailing byte of a UTF-8 character) |
| + Normally U+03C0 would occupy one display column, U+1F642 |
| + would occupy two display columns, and the stray byte would be |
| + printed verbatim as one display column. |
| + |
| + However when escaping them as unicode code points as "<U+03C0>" |
| + and "<U+1F642>" they occupy 8 and 9 display columns respectively, |
| + and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>" |
| + they occupy 8 and 16 display columns respectively. In both cases |
| + the stray byte is escaped to <BF> as 4 display columns. */ |
| + |
| +struct cpp_char_column_policy |
| +{ |
| + cpp_char_column_policy (int tabstop, |
| + int (*width_cb) (cppchar_t c)) |
| + : m_tabstop (tabstop), |
| + m_undecoded_byte_width (1), |
| + m_width_cb (width_cb) |
| + {} |
| + |
| + int m_tabstop; |
| + /* Width in display columns of a stray byte that isn't decodable |
| + as UTF-8. */ |
| + int m_undecoded_byte_width; |
| + int (*m_width_cb) (cppchar_t c); |
| +}; |
| + |
| /* A class to manage the state while converting a UTF-8 sequence to cppchar_t |
| and computing the display width one character at a time. */ |
| class cpp_display_width_computation { |
| public: |
| cpp_display_width_computation (const char *data, int data_length, |
| - int tabstop); |
| + const cpp_char_column_policy &policy); |
| const char *next_byte () const { return m_next; } |
| int bytes_processed () const { return m_next - m_begin; } |
| int bytes_left () const { return m_bytes_left; } |
| bool done () const { return !bytes_left (); } |
| int display_cols_processed () const { return m_display_cols; } |
| |
| - int process_next_codepoint (); |
| + int process_next_codepoint (cpp_decoded_char *out); |
| int advance_display_cols (int n); |
| |
| private: |
| const char *const m_begin; |
| const char *m_next; |
| size_t m_bytes_left; |
| - const int m_tabstop; |
| + const cpp_char_column_policy &m_policy; |
| int m_display_cols; |
| }; |
| |
| /* Convenience functions that are simple use cases for class |
| cpp_display_width_computation. Tab characters will be expanded to spaces |
| - as determined by TABSTOP. */ |
| + as determined by POLICY.m_tabstop, and non-printable-ASCII characters |
| + will be escaped as per POLICY. */ |
| + |
| int cpp_byte_column_to_display_column (const char *data, int data_length, |
| - int column, int tabstop); |
| + int column, |
| + const cpp_char_column_policy &policy); |
| inline int cpp_display_width (const char *data, int data_length, |
| - int tabstop) |
| + const cpp_char_column_policy &policy) |
| { |
| return cpp_byte_column_to_display_column (data, data_length, data_length, |
| - tabstop); |
| + policy); |
| } |
| int cpp_display_column_to_byte_column (const char *data, int data_length, |
| - int display_col, int tabstop); |
| + int display_col, |
| + const cpp_char_column_policy &policy); |
| int cpp_wcwidth (cppchar_t c); |
| |
| #endif /* ! LIBCPP_CPPLIB_H */ |
| diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h |
| --- a/libcpp/include/line-map.h 2021-07-27 23:55:08.716307283 -0700 |
| +++ b/libcpp/include/line-map.h 2021-12-14 01:16:01.557942991 -0800 |
| @@ -1781,6 +1781,18 @@ class rich_location |
| const diagnostic_path *get_path () const { return m_path; } |
| void set_path (const diagnostic_path *path) { m_path = path; } |
| |
| + /* A flag for hinting that the diagnostic involves character encoding |
| + issues, and thus that it will be helpful to the user if we show some |
| + representation of how the characters in the pertinent source lines |
| + are encoded. |
| + The default is false (i.e. do not escape). |
| + When set to true, non-ASCII bytes in the pertinent source lines will |
| + be escaped in a manner controlled by the user-supplied option |
| + -fdiagnostics-escape-format=, so that the user can better understand |
| + what's going on with the encoding in their source file. */ |
| + bool escape_on_output_p () const { return m_escape_on_output; } |
| + void set_escape_on_output (bool flag) { m_escape_on_output = flag; } |
| + |
| private: |
| bool reject_impossible_fixit (location_t where); |
| void stop_supporting_fixits (); |
| @@ -1807,6 +1819,7 @@ protected: |
| bool m_fixits_cannot_be_auto_applied; |
| |
| const diagnostic_path *m_path; |
| + bool m_escape_on_output; |
| }; |
| |
| /* A struct for the result of range_label::get_text: a NUL-terminated buffer |
| diff --git a/libcpp/internal.h b/libcpp/internal.h |
| --- a/libcpp/internal.h 2021-12-13 23:23:05.768437079 -0800 |
| +++ b/libcpp/internal.h 2021-12-14 01:16:01.557942991 -0800 |
| @@ -776,6 +776,9 @@ extern void _cpp_do_file_change (cpp_rea |
| extern void _cpp_pop_buffer (cpp_reader *); |
| extern char *_cpp_bracket_include (cpp_reader *); |
| |
| +/* In errors.c */ |
| +extern location_t cpp_diagnostic_get_current_location (cpp_reader *); |
| + |
| /* In traditional.c. */ |
| extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool); |
| extern bool _cpp_read_logical_line_trad (cpp_reader *); |
| @@ -942,6 +945,26 @@ int linemap_get_expansion_line (class li |
| const char* linemap_get_expansion_filename (class line_maps *, |
| location_t); |
| |
| +/* A subclass of rich_location for emitting a diagnostic |
| + at the current location of the reader, but flagging |
| + it with set_escape_on_output (true). */ |
| +class encoding_rich_location : public rich_location |
| +{ |
| + public: |
| + encoding_rich_location (cpp_reader *pfile) |
| + : rich_location (pfile->line_table, |
| + cpp_diagnostic_get_current_location (pfile)) |
| + { |
| + set_escape_on_output (true); |
| + } |
| + |
| + encoding_rich_location (cpp_reader *pfile, location_t loc) |
| + : rich_location (pfile->line_table, loc) |
| + { |
| + set_escape_on_output (true); |
| + } |
| +}; |
| + |
| #ifdef __cplusplus |
| } |
| #endif |
| diff --git a/libcpp/lex.c b/libcpp/lex.c |
| --- a/libcpp/lex.c 2021-12-14 01:14:48.435225968 -0800 |
| +++ b/libcpp/lex.c 2021-12-14 01:24:37.220995816 -0800 |
| @@ -1774,7 +1774,11 @@ skip_whitespace (cpp_reader *pfile, cppc |
| while (is_nvspace (c)); |
| |
| if (saw_NUL) |
| - cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); |
| + { |
| + encoding_rich_location rich_loc (pfile); |
| + cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc, |
| + "null character(s) ignored"); |
| + } |
| |
| buffer->cur--; |
| } |
| @@ -1803,6 +1807,28 @@ warn_about_normalization (cpp_reader *pf |
| if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) |
| && !pfile->state.skipping) |
| { |
| + location_t loc = token->src_loc; |
| + |
| + /* If possible, create a location range for the token. */ |
| + if (loc >= RESERVED_LOCATION_COUNT |
| + && token->type != CPP_EOF |
| + /* There must be no line notes to process. */ |
| + && (!(pfile->buffer->cur |
| + >= pfile->buffer->notes[pfile->buffer->cur_note].pos |
| + && !pfile->overlaid_buffer))) |
| + { |
| + source_range tok_range; |
| + tok_range.m_start = loc; |
| + tok_range.m_finish |
| + = linemap_position_for_column (pfile->line_table, |
| + CPP_BUF_COLUMN (pfile->buffer, |
| + pfile->buffer->cur)); |
| + loc = COMBINE_LOCATION_DATA (pfile->line_table, |
| + loc, tok_range, NULL); |
| + } |
| + |
| + encoding_rich_location rich_loc (pfile, loc); |
| + |
| /* Make sure that the token is printed using UCNs, even |
| if we'd otherwise happily print UTF-8. */ |
| unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); |
| @@ -1810,11 +1836,11 @@ warn_about_normalization (cpp_reader *pf |
| |
| sz = cpp_spell_token (pfile, token, buf, false) - buf; |
| if (NORMALIZE_STATE_RESULT (s) == normalized_C) |
| - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, |
| - "`%.*s' is not in NFKC", (int) sz, buf); |
| + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, |
| + "`%.*s' is not in NFKC", (int) sz, buf); |
| else |
| - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, |
| - "`%.*s' is not in NFC", (int) sz, buf); |
| + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, |
| + "`%.*s' is not in NFC", (int) sz, buf); |
| free (buf); |
| } |
| } |
| diff --git a/libcpp/line-map.c b/libcpp/line-map.c |
| --- a/libcpp/line-map.c 2021-07-27 23:55:08.716307283 -0700 |
| +++ b/libcpp/line-map.c 2021-12-14 01:16:01.561942921 -0800 |
| @@ -2086,7 +2086,8 @@ rich_location::rich_location (line_maps |
| m_fixit_hints (), |
| m_seen_impossible_fixit (false), |
| m_fixits_cannot_be_auto_applied (false), |
| - m_path (NULL) |
| + m_path (NULL), |
| + m_escape_on_output (false) |
| { |
| add_range (loc, SHOW_RANGE_WITH_CARET, label); |
| } |