Blame - meta-arm/meta-arm-toolchain/recipes-devtools/gcc/gcc-arm-11.3/0001-CVE-2021-42574.patch - stefanberger/openbmc

blob: 4d680ccc8f6157337f55980e65eb18eab53831ac [file] [log] [blame]

Brad Bishop	bec4ebc	2022-08-03 09:55:16 -0400	[diff] [blame]	1	From bd5e882cf6e0def3dd1bc106075d59a303fe0d1e Mon Sep 17 00:00:00 2001
				2	From: David Malcolm <dmalcolm@redhat.com>
				3	Date: Mon, 18 Oct 2021 18:55:31 -0400
				4	Subject: [PATCH] diagnostics: escape non-ASCII source bytes for certain
				5	diagnostics
				6	MIME-Version: 1.0
				7	Content-Type: text/plain; charset=utf8
				8	Content-Transfer-Encoding: 8bit
				9
				10	This patch adds support to GCC's diagnostic subsystem for escaping certain
				11	bytes and Unicode characters when quoting source code.
				12
				13	Specifically, this patch adds a new flag rich_location::m_escape_on_output
				14	which is a hint from a diagnostic that non-ASCII bytes in the pertinent
				15	lines of the user's source code should be escaped when printed.
				16
				17	The patch sets this for the following diagnostics:
				18	- when complaining about stray bytes in the program (when these
				19	are non-printable)
				20	- when complaining about "null character(s) ignored";
				21	- for -Wnormalized= (and generate source ranges for such warnings)
				22
				23	The escaping is controlled by a new option:
				24	-fdiagnostics-escape-format=[unicode|bytes]
				25
				26	For example, consider a diagnostic involving a source line containing the
				27	string "before" followed by the Unicode character U+03C0 ("GREEK SMALL
				28	LETTER PI", with UTF-8 encoding 0xCF 0x80) followed by the byte 0xBF
				29	(a stray UTF-8 trailing byte), followed by the string "after", where the
				30	diagnostic highlights the U+03C0 character.
				31
				32	By default, this line will be printed verbatim to the user when
				33	reporting a diagnostic at it, as:
				34
				35	beforeπXafter
				36	^
				37
				38	(using X for the stray byte to avoid putting invalid UTF-8 in this
				39	commit message)
				40
				41	If the diagnostic sets the "escape" flag, it will be printed as:
				42
				43	before<U+03C0><BF>after
				44	^~~~~~~~
				45
				46	with -fdiagnostics-escape-format=unicode (the default), or as:
				47
				48	before<CF><80><BF>after
				49	^~~~~~~~
				50
				51	if the user supplies -fdiagnostics-escape-format=bytes.
				52
				53	This only affects how the source is printed; it does not affect
				54	how column numbers are printed (as per -fdiagnostics-column-unit=
				55	and -fdiagnostics-column-origin=).
				56
				57	gcc/c-family/ChangeLog:
				58	* c-lex.c (c_lex_with_flags): When complaining about non-printable
				59	CPP_OTHER tokens, set the "escape on output" flag.
				60
				61	gcc/ChangeLog:
				62	* common.opt (fdiagnostics-escape-format=): New.
				63	(diagnostics_escape_format): New enum.
				64	(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE): New enum value.
				65	(DIAGNOSTICS_ESCAPE_FORMAT_BYTES): Likewise.
				66	* diagnostic-format-json.cc (json_end_diagnostic): Add
				67	"escape-source" attribute.
				68	* diagnostic-show-locus.c
				69	(exploc_with_display_col::exploc_with_display_col): Replace
				70	"tabstop" param with a cpp_char_column_policy and add an "aspect"
				71	param. Use these to compute m_display_col accordingly.
				72	(struct char_display_policy): New struct.
				73	(layout::m_policy): New field.
				74	(layout::m_escape_on_output): New field.
				75	(def_policy): New function.
				76	(make_range): Update for changes to exploc_with_display_col ctor.
				77	(default_print_decoded_ch): New.
				78	(width_per_escaped_byte): New.
				79	(escape_as_bytes_width): New.
				80	(escape_as_bytes_print): New.
				81	(escape_as_unicode_width): New.
				82	(escape_as_unicode_print): New.
				83	(make_policy): New.
				84	(layout::layout): Initialize new fields. Update m_exploc ctor
				85	call for above change to ctor.
				86	(layout::maybe_add_location_range): Update for changes to
				87	exploc_with_display_col ctor.
				88	(layout::calculate_x_offset_display): Update for change to
				89	cpp_display_width.
				90	(layout::print_source_line): Pass policy
				91	to cpp_display_width_computation. Capture cpp_decoded_char when
				92	calling process_next_codepoint. Move printing of source code to
				93	m_policy.m_print_cb.
				94	(line_label::line_label): Pass in policy rather than context.
				95	(layout::print_any_labels): Update for change to line_label ctor.
				96	(get_affected_range): Pass in policy rather than context, updating
				97	calls to location_compute_display_column accordingly.
				98	(get_printed_columns): Likewise, also for cpp_display_width.
				99	(correction::correction): Pass in policy rather than tabstop.
				100	(correction::compute_display_cols): Pass m_policy rather than
				101	m_tabstop to cpp_display_width.
				102	(correction::m_tabstop): Replace with...
				103	(correction::m_policy): ...this.
				104	(line_corrections::line_corrections): Pass in policy rather than
				105	context.
				106	(line_corrections::m_context): Replace with...
				107	(line_corrections::m_policy): ...this.
				108	(line_corrections::add_hint): Update to use m_policy rather than
				109	m_context.
				110	(line_corrections::add_hint): Likewise.
				111	(layout::print_trailing_fixits): Likewise.
				112	(selftest::test_display_widths): New.
				113	(selftest::test_layout_x_offset_display_utf8): Update to use
				114	policy rather than tabstop.
				115	(selftest::test_one_liner_labels_utf8): Add test of escaping
				116	source lines.
				117	(selftest::test_diagnostic_show_locus_one_liner_utf8): Update to
				118	use policy rather than tabstop.
				119	(selftest::test_overlapped_fixit_printing): Likewise.
				120	(selftest::test_overlapped_fixit_printing_utf8): Likewise.
				121	(selftest::test_overlapped_fixit_printing_2): Likewise.
				122	(selftest::test_tab_expansion): Likewise.
				123	(selftest::test_escaping_bytes_1): New.
				124	(selftest::test_escaping_bytes_2): New.
				125	(selftest::diagnostic_show_locus_c_tests): Call the new tests.
				126	* diagnostic.c (diagnostic_initialize): Initialize
				127	context->escape_format.
				128	(convert_column_unit): Update to use default character width policy.
				129	(selftest::test_diagnostic_get_location_text): Likewise.
				130	* diagnostic.h (enum diagnostics_escape_format): New enum.
				131	(diagnostic_context::escape_format): New field.
				132	* doc/invoke.texi (-fdiagnostics-escape-format=): New option.
				133	(-fdiagnostics-format=): Add "escape-source" attribute to examples
				134	of JSON output, and document it.
				135	* input.c (location_compute_display_column): Pass in "policy"
				136	rather than "tabstop", passing to
				137	cpp_byte_column_to_display_column.
				138	(selftest::test_cpp_utf8): Update to use cpp_char_column_policy.
				139	* input.h (class cpp_char_column_policy): New forward decl.
				140	(location_compute_display_column): Pass in "policy" rather than
				141	"tabstop".
				142	* opts.c (common_handle_option): Handle
				143	OPT_fdiagnostics_escape_format_.
				144	* selftest.c (temp_source_file::temp_source_file): New ctor
				145	overload taking a size_t.
				146	* selftest.h (temp_source_file::temp_source_file): Likewise.
				147
				148	gcc/testsuite/ChangeLog:
				149	* c-c++-common/diagnostic-format-json-1.c: Add regexp to consume
				150	"escape-source" attribute.
				151	* c-c++-common/diagnostic-format-json-2.c: Likewise.
				152	* c-c++-common/diagnostic-format-json-3.c: Likewise.
				153	* c-c++-common/diagnostic-format-json-4.c: Likewise, twice.
				154	* c-c++-common/diagnostic-format-json-5.c: Likewise.
				155	* gcc.dg/cpp/warn-normalized-4-bytes.c: New test.
				156	* gcc.dg/cpp/warn-normalized-4-unicode.c: New test.
				157	* gcc.dg/encoding-issues-bytes.c: New test.
				158	* gcc.dg/encoding-issues-unicode.c: New test.
				159	* gfortran.dg/diagnostic-format-json-1.F90: Add regexp to consume
				160	"escape-source" attribute.
				161	* gfortran.dg/diagnostic-format-json-2.F90: Likewise.
				162	* gfortran.dg/diagnostic-format-json-3.F90: Likewise.
				163
				164	libcpp/ChangeLog:
				165	* charset.c (convert_escape): Use encoding_rich_location when
				166	complaining about nonprintable unknown escape sequences.
				167	(cpp_display_width_computation::cpp_display_width_computation):
				168	Pass in policy rather than tabstop.
				169	(cpp_display_width_computation::process_next_codepoint): Add "out"
				170	param and populate *out if non-NULL.
				171	(cpp_display_width_computation::advance_display_cols): Pass NULL
				172	to process_next_codepoint.
				173	(cpp_byte_column_to_display_column): Pass in policy rather than
				174	tabstop. Pass NULL to process_next_codepoint.
				175	(cpp_display_column_to_byte_column): Pass in policy rather than
				176	tabstop.
				177	* errors.c (cpp_diagnostic_get_current_location): New function,
				178	splitting out the logic from...
				179	(cpp_diagnostic): ...here.
				180	(cpp_warning_at): New function.
				181	(cpp_pedwarning_at): New function.
				182	* include/cpplib.h (cpp_warning_at): New decl for rich_location.
				183	(cpp_pedwarning_at): Likewise.
				184	(struct cpp_decoded_char): New.
				185	(struct cpp_char_column_policy): New.
				186	(cpp_display_width_computation::cpp_display_width_computation):
				187	Replace "tabstop" param with "policy".
				188	(cpp_display_width_computation::process_next_codepoint): Add "out"
				189	param.
				190	(cpp_display_width_computation::m_tabstop): Replace with...
				191	(cpp_display_width_computation::m_policy): ...this.
				192	(cpp_byte_column_to_display_column): Replace "tabstop" param with
				193	"policy".
				194	(cpp_display_width): Likewise.
				195	(cpp_display_column_to_byte_column): Likewise.
				196	* include/line-map.h (rich_location::escape_on_output_p): New.
				197	(rich_location::set_escape_on_output): New.
				198	(rich_location::m_escape_on_output): New.
				199	* internal.h (cpp_diagnostic_get_current_location): New decl.
				200	(class encoding_rich_location): New.
				201	* lex.c (skip_whitespace): Use encoding_rich_location when
				202	complaining about null characters.
				203	(warn_about_normalization): Generate a source range when
				204	complaining about improperly normalized tokens, rather than just a
				205	point, and use encoding_rich_location so that the source code
				206	is escaped on printing.
				207	* line-map.c (rich_location::rich_location): Initialize
				208	m_escape_on_output.
				209
				210	Signed-off-by: David Malcolm <dmalcolm@redhat.com>
				211
				212	CVE: CVE-2021-42574
				213	Upstream-Status: Backport [https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bd5e882cf6e0def3dd1bc106075d59a303fe0d1e]
				214	Signed-off-by: Pgowda <pgowda.cve@gmail.com>
				215
				216	---
				217	gcc/c-family/c-lex.c \| 6 +-
				218	gcc/common.opt \| 13 +
				219	gcc/diagnostic-format-json.cc \| 3 +
				220	gcc/diagnostic-show-locus.c \| 580 +++++++++++++++---
				221	gcc/diagnostic.c \| 10 +-
				222	gcc/diagnostic.h \| 18 +
				223	gcc/doc/invoke.texi \| 43 +-
				224	gcc/input.c \| 62 +-
				225	gcc/input.h \| 7 +-
				226	gcc/opts.c \| 4 +
				227	gcc/selftest.c \| 15 +
				228	gcc/selftest.h \| 2 +
				229	.../c-c++-common/diagnostic-format-json-1.c \| 1 +
				230	.../c-c++-common/diagnostic-format-json-2.c \| 1 +
				231	.../c-c++-common/diagnostic-format-json-3.c \| 1 +
				232	.../c-c++-common/diagnostic-format-json-4.c \| 2 +
				233	.../c-c++-common/diagnostic-format-json-5.c \| 1 +
				234	.../gcc.dg/cpp/warn-normalized-4-bytes.c \| 21 +
				235	.../gcc.dg/cpp/warn-normalized-4-unicode.c \| 19 +
				236	gcc/testsuite/gcc.dg/encoding-issues-bytes.c \| Bin 0 -> 595 bytes
				237	.../gcc.dg/encoding-issues-unicode.c \| Bin 0 -> 613 bytes
				238	.../gfortran.dg/diagnostic-format-json-1.F90 \| 1 +
				239	.../gfortran.dg/diagnostic-format-json-2.F90 \| 1 +
				240	.../gfortran.dg/diagnostic-format-json-3.F90 \| 1 +
				241	libcpp/charset.c \| 63 +-
				242	libcpp/errors.c \| 82 ++-
				243	libcpp/include/cpplib.h \| 76 ++-
				244	libcpp/include/line-map.h \| 13 +
				245	libcpp/internal.h \| 23 +
				246	libcpp/lex.c \| 38 +-
				247	libcpp/line-map.c \| 3 +-
				248	31 files changed, 942 insertions(+), 168 deletions(-)
				249	create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c
				250	create mode 100644 gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c
				251	create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-bytes.c
				252	create mode 100644 gcc/testsuite/gcc.dg/encoding-issues-unicode.c
				253
				254	diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
				255	--- a/gcc/c-family/c-lex.c 2021-07-27 23:55:06.980283060 -0700
				256	+++ b/gcc/c-family/c-lex.c 2021-12-14 01:16:01.541943272 -0800
				257	@@ -603,7 +603,11 @@ c_lex_with_flags (tree *value, location_
				258	else if (ISGRAPH (c))
				259	error_at (*loc, "stray %qc in program", (int) c);
				260	else
				261	- error_at (*loc, "stray %<\\%o%> in program", (int) c);
				262	+ {
				263	+ rich_location rich_loc (line_table, *loc);
				264	+ rich_loc.set_escape_on_output (true);
				265	+ error_at (&rich_loc, "stray %<\\%o%> in program", (int) c);
				266	+ }
				267	}
				268	goto retry;
				269
				270	diff --git a/gcc/common.opt b/gcc/common.opt
				271	--- a/gcc/common.opt 2021-12-13 22:08:44.939137107 -0800
				272	+++ b/gcc/common.opt 2021-12-14 01:16:01.541943272 -0800
				273	@@ -1348,6 +1348,10 @@ fdiagnostics-format=
				274	Common Joined RejectNegative Enum(diagnostics_output_format)
				275	-fdiagnostics-format=[text\|json] Select output format.
				276
				277	+fdiagnostics-escape-format=
				278	+Common Joined RejectNegative Enum(diagnostics_escape_format)
				279	+-fdiagnostics-escape-format=[unicode|bytes] Select how to escape non-printable-ASCII bytes in the source for diagnostics that suggest it.
				280	+
				281	; Required for these enum values.
				282	SourceInclude
				283	diagnostic.h
				284	@@ -1362,6 +1366,15 @@ EnumValue
				285	Enum(diagnostics_column_unit) String(byte) Value(DIAGNOSTICS_COLUMN_UNIT_BYTE)
				286
				287	Enum
				288	+Name(diagnostics_escape_format) Type(int)
				289	+
				290	+EnumValue
				291	+Enum(diagnostics_escape_format) String(unicode) Value(DIAGNOSTICS_ESCAPE_FORMAT_UNICODE)
				292	+
				293	+EnumValue
				294	+Enum(diagnostics_escape_format) String(bytes) Value(DIAGNOSTICS_ESCAPE_FORMAT_BYTES)
				295	+
				296	+Enum
				297	Name(diagnostics_output_format) Type(int)
				298
				299	EnumValue
				300	diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c
				301	--- a/gcc/diagnostic.c 2021-07-27 23:55:07.232286576 -0700
				302	+++ b/gcc/diagnostic.c 2021-12-14 01:16:01.545943202 -0800
				303	@@ -230,6 +230,7 @@ diagnostic_initialize (diagnostic_contex
				304	context->column_unit = DIAGNOSTICS_COLUMN_UNIT_DISPLAY;
				305	context->column_origin = 1;
				306	context->tabstop = 8;
				307	+ context->escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
				308	context->edit_context_ptr = NULL;
				309	context->diagnostic_group_nesting_depth = 0;
				310	context->diagnostic_group_emission_count = 0;
				311	@@ -382,7 +383,10 @@ convert_column_unit (enum diagnostics_co
				312	gcc_unreachable ();
				313
				314	case DIAGNOSTICS_COLUMN_UNIT_DISPLAY:
				315	- return location_compute_display_column (s, tabstop);
				316	+ {
				317	+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
				318	+ return location_compute_display_column (s, policy);
				319	+ }
				320
				321	case DIAGNOSTICS_COLUMN_UNIT_BYTE:
				322	return s.column;
				323	@@ -2275,8 +2279,8 @@ test_diagnostic_get_location_text ()
				324	const char *const content = "smile \xf0\x9f\x98\x82\n";
				325	const int line_bytes = strlen (content) - 1;
				326	const int def_tabstop = 8;
				327	- const int display_width = cpp_display_width (content, line_bytes,
				328	- def_tabstop);
				329	+ const cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
				330	+ const int display_width = cpp_display_width (content, line_bytes, policy);
				331	ASSERT_EQ (line_bytes - 2, display_width);
				332	temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
				333	const char *const fname = tmp.get_filename ();
				334	diff --git a/gcc/diagnostic-format-json.cc b/gcc/diagnostic-format-json.cc
				335	--- a/gcc/diagnostic-format-json.cc 2021-07-27 23:55:07.232286576 -0700
				336	+++ b/gcc/diagnostic-format-json.cc 2021-12-14 01:16:01.541943272 -0800
				337	@@ -264,6 +264,9 @@ json_end_diagnostic (diagnostic_context
				338	json::value *path_value = context->make_json_for_path (context, path);
				339	diag_obj->set ("path", path_value);
				340	}
				341	+
				342	+ diag_obj->set ("escape-source",
				343	+ new json::literal (richloc->escape_on_output_p ()));
				344	}
				345
				346	/* No-op implementation of "begin_group_cb" for JSON output. */
				347	diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h
				348	--- a/gcc/diagnostic.h 2021-07-27 23:55:07.236286632 -0700
				349	+++ b/gcc/diagnostic.h 2021-12-14 01:16:01.545943202 -0800
				350	@@ -38,6 +38,20 @@ enum diagnostics_column_unit
				351	DIAGNOSTICS_COLUMN_UNIT_BYTE
				352	};
				353
				354	+/* An enum for controlling how to print non-ASCII characters/bytes when
				355	+ a diagnostic suggests escaping the source code on output. */
				356	+
				357	+enum diagnostics_escape_format
				358	+{
				359	+ /* Escape non-ASCII Unicode characters in the form <U+XXXX> and
				360	+ non-UTF-8 bytes in the form <XX>. */
				361	+ DIAGNOSTICS_ESCAPE_FORMAT_UNICODE,
				362	+
				363	+ /* Escape non-ASCII bytes in the form <XX> (thus showing the underlying
				364	+ encoding of non-ASCII Unicode characters). */
				365	+ DIAGNOSTICS_ESCAPE_FORMAT_BYTES
				366	+};
				367	+
				368	/* Enum for overriding the standard output format. */
				369
				370	enum diagnostics_output_format
				371	@@ -320,6 +334,10 @@ struct diagnostic_context
				372	/* The size of the tabstop for tab expansion. */
				373	int tabstop;
				374
				375	+ /* How should non-ASCII/non-printable bytes be escaped when
				376	+ a diagnostic suggests escaping the source code on output. */
				377	+ enum diagnostics_escape_format escape_format;
				378	+
				379	/* If non-NULL, an edit_context to which fix-it hints should be
				380	applied, for generating patches. */
				381	edit_context *edit_context_ptr;
				382	diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c
				383	--- a/gcc/diagnostic-show-locus.c 2021-07-27 23:55:07.232286576 -0700
				384	+++ b/gcc/diagnostic-show-locus.c 2021-12-14 01:16:01.545943202 -0800
				385	@@ -175,10 +175,26 @@ enum column_unit {
				386	class exploc_with_display_col : public expanded_location
				387	{
				388	public:
				389	- exploc_with_display_col (const expanded_location &exploc, int tabstop)
				390	- : expanded_location (exploc),
				391	- m_display_col (location_compute_display_column (exploc, tabstop))
				392	- {}
				393	+ exploc_with_display_col (const expanded_location &exploc,
				394	+ const cpp_char_column_policy &policy,
				395	+ enum location_aspect aspect)
				396	+ : expanded_location (exploc),
				397	+ m_display_col (location_compute_display_column (exploc, policy))
				398	+ {
				399	+ if (exploc.column > 0)
				400	+ {
				401	+ /* m_display_col is now the final column of the byte.
				402	+ If escaping has happened, we may want the first column instead. */
				403	+ if (aspect != LOCATION_ASPECT_FINISH)
				404	+ {
				405	+ expanded_location prev_exploc (exploc);
				406	+ prev_exploc.column--;
				407	+ int prev_display_col
				408	+ = (location_compute_display_column (prev_exploc, policy));
				409	+ m_display_col = prev_display_col + 1;
				410	+ }
				411	+ }
				412	+ }
				413
				414	int m_display_col;
				415	};
				416	@@ -313,6 +329,31 @@ test_line_span ()
				417
				418	#endif /* #if CHECKING_P */
				419
				420	+/* A bundle of information containing how to print unicode
				421	+ characters and bytes when quoting source code.
				422	+
				423	+ Provides a unified place to support escaping some subset
				424	+ of characters to some format.
				425	+
				426	+ Extends char_column_policy; printing is split out to avoid
				427	+ libcpp having to know about pretty_printer. */
				428	+
				429	+struct char_display_policy : public cpp_char_column_policy
				430	+{
				431	+ public:
				432	+ char_display_policy (int tabstop,
				433	+ int (*width_cb) (cppchar_t c),
				434	+ void (*print_cb) (pretty_printer *pp,
				435	+ const cpp_decoded_char &cp))
				436	+ : cpp_char_column_policy (tabstop, width_cb),
				437	+ m_print_cb (print_cb)
				438	+ {
				439	+ }
				440	+
				441	+ void (*m_print_cb) (pretty_printer *pp,
				442	+ const cpp_decoded_char &cp);
				443	+};
				444	+
				445	/* A class to control the overall layout when printing a diagnostic.
				446
				447	The layout is determined within the constructor.
				448	@@ -345,6 +386,8 @@ class layout
				449
				450	void print_line (linenum_type row);
				451
				452	+ void on_bad_codepoint (const char *ptr, cppchar_t ch, size_t ch_sz);
				453	+
				454	private:
				455	bool will_show_line_p (linenum_type row) const;
				456	void print_leading_fixits (linenum_type row);
				457	@@ -386,6 +429,7 @@ class layout
				458	private:
				459	diagnostic_context *m_context;
				460	pretty_printer *m_pp;
				461	+ char_display_policy m_policy;
				462	location_t m_primary_loc;
				463	exploc_with_display_col m_exploc;
				464	colorizer m_colorizer;
				465	@@ -398,6 +442,7 @@ class layout
				466	auto_vec <line_span> m_line_spans;
				467	int m_linenum_width;
				468	int m_x_offset_display;
				469	+ bool m_escape_on_output;
				470	};
				471
				472	/* Implementation of "class colorizer". */
				473	@@ -646,6 +691,11 @@ layout_range::intersects_line_p (linenum
				474	/* Default for when we don't care what the tab expansion is set to. */
				475	static const int def_tabstop = 8;
				476
				477	+static cpp_char_column_policy def_policy ()
				478	+{
				479	+ return cpp_char_column_policy (8, cpp_wcwidth);
				480	+}
				481	+
				482	/* Create some expanded locations for testing layout_range. The filename
				483	member of the explocs is set to the empty string. This member will only be
				484	inspected by the calls to location_compute_display_column() made from the
				485	@@ -662,10 +712,13 @@ make_range (int start_line, int start_co
				486	= {"", start_line, start_col, NULL, false};
				487	const expanded_location finish_exploc
				488	= {"", end_line, end_col, NULL, false};
				489	- return layout_range (exploc_with_display_col (start_exploc, def_tabstop),
				490	- exploc_with_display_col (finish_exploc, def_tabstop),
				491	+ return layout_range (exploc_with_display_col (start_exploc, def_policy (),
				492	+ LOCATION_ASPECT_START),
				493	+ exploc_with_display_col (finish_exploc, def_policy (),
				494	+ LOCATION_ASPECT_FINISH),
				495	SHOW_RANGE_WITHOUT_CARET,
				496	- exploc_with_display_col (start_exploc, def_tabstop),
				497	+ exploc_with_display_col (start_exploc, def_policy (),
				498	+ LOCATION_ASPECT_CARET),
				499	0, NULL);
				500	}
				501
				502	@@ -959,6 +1012,164 @@ fixit_cmp (const void *p_a, const void *
				503	return hint_a->get_start_loc () - hint_b->get_start_loc ();
				504	}
				505
				506	+/* Callbacks for use when not escaping the source. */
				507	+
				508	+/* The default callback for char_column_policy::m_width_cb is cpp_wcwidth. */
				509	+
				510	+/* Callback for char_display_policy::m_print_cb for printing source chars
				511	+ when not escaping the source. */
				512	+
				513	+static void
				514	+default_print_decoded_ch (pretty_printer *pp,
				515	+ const cpp_decoded_char &decoded_ch)
				516	+{
				517	+ for (const char *ptr = decoded_ch.m_start_byte;
				518	+ ptr != decoded_ch.m_next_byte; ptr++)
				519	+ {
				520	+ if (*ptr == '\0' || *ptr == '\r')
				521	+ {
				522	+ pp_space (pp);
				523	+ continue;
				524	+ }
				525	+
				526	+ pp_character (pp, *ptr);
				527	+ }
				528	+}
				529	+
				530	+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
				531	+
				532	+static const int width_per_escaped_byte = 4;
				533	+
				534	+/* Callback for char_column_policy::m_width_cb for determining the
				535	+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
				536	+
				537	+static int
				538	+escape_as_bytes_width (cppchar_t ch)
				539	+{
				540	+ if (ch < 0x80 && ISPRINT (ch))
				541	+ return cpp_wcwidth (ch);
				542	+ else
				543	+ {
				544	+ if (ch <= 0x7F) return 1 * width_per_escaped_byte;
				545	+ if (ch <= 0x7FF) return 2 * width_per_escaped_byte;
				546	+ if (ch <= 0xFFFF) return 3 * width_per_escaped_byte;
				547	+ return 4 * width_per_escaped_byte;
				548	+ }
				549	+}
				550	+
				551	+/* Callback for char_display_policy::m_print_cb for printing source chars
				552	+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_BYTES. */
				553	+
				554	+static void
				555	+escape_as_bytes_print (pretty_printer *pp,
				556	+ const cpp_decoded_char &decoded_ch)
				557	+{
				558	+ if (!decoded_ch.m_valid_ch)
				559	+ {
				560	+ for (const char *iter = decoded_ch.m_start_byte;
				561	+ iter != decoded_ch.m_next_byte; ++iter)
				562	+ {
				563	+ char buf[16];
				564	+ sprintf (buf, "<%02x>", (unsigned char)*iter);
				565	+ pp_string (pp, buf);
				566	+ }
				567	+ return;
				568	+ }
				569	+
				570	+ cppchar_t ch = decoded_ch.m_ch;
				571	+ if (ch < 0x80 && ISPRINT (ch))
				572	+ pp_character (pp, ch);
				573	+ else
				574	+ {
				575	+ for (const char *iter = decoded_ch.m_start_byte;
				576	+ iter < decoded_ch.m_next_byte; ++iter)
				577	+ {
				578	+ char buf[16];
				579	+ sprintf (buf, "<%02x>", (unsigned char)*iter);
				580	+ pp_string (pp, buf);
				581	+ }
				582	+ }
				583	+}
				584	+
				585	+/* Callbacks for use with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
				586	+
				587	+/* Callback for char_column_policy::m_width_cb for determining the
				588	+ display width when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
				589	+
				590	+static int
				591	+escape_as_unicode_width (cppchar_t ch)
				592	+{
				593	+ if (ch < 0x80 && ISPRINT (ch))
				594	+ return cpp_wcwidth (ch);
				595	+ else
				596	+ {
				597	+ // Width of "<U+%04x>"
				598	+ if (ch > 0xfffff)
				599	+ return 10;
				600	+ else if (ch > 0xffff)
				601	+ return 9;
				602	+ else
				603	+ return 8;
				604	+ }
				605	+}
				606	+
				607	+/* Callback for char_display_policy::m_print_cb for printing source chars
				608	+ when escaping with DIAGNOSTICS_ESCAPE_FORMAT_UNICODE. */
				609	+
				610	+static void
				611	+escape_as_unicode_print (pretty_printer *pp,
				612	+ const cpp_decoded_char &decoded_ch)
				613	+{
				614	+ if (!decoded_ch.m_valid_ch)
				615	+ {
				616	+ escape_as_bytes_print (pp, decoded_ch);
				617	+ return;
				618	+ }
				619	+
				620	+ cppchar_t ch = decoded_ch.m_ch;
				621	+ if (ch < 0x80 && ISPRINT (ch))
				622	+ pp_character (pp, ch);
				623	+ else
				624	+ {
				625	+ char buf[16];
				626	+ sprintf (buf, "<U+%04X>", ch);
				627	+ pp_string (pp, buf);
				628	+ }
				629	+}
				630	+
				631	+/* Populate a char_display_policy based on DC and RICHLOC. */
				632	+
				633	+static char_display_policy
				634	+make_policy (const diagnostic_context &dc,
				635	+ const rich_location &richloc)
				636	+{
				637	+ /* The default is to not escape non-ASCII bytes. */
				638	+ char_display_policy result
				639	+ (dc.tabstop, cpp_wcwidth, default_print_decoded_ch);
				640	+
				641	+ /* If the diagnostic suggests escaping non-ASCII bytes, then
				642	+ use policy from user-supplied options. */
				643	+ if (richloc.escape_on_output_p ())
				644	+ {
				645	+ result.m_undecoded_byte_width = width_per_escaped_byte;
				646	+ switch (dc.escape_format)
				647	+ {
				648	+ default:
				649	+ gcc_unreachable ();
				650	+ case DIAGNOSTICS_ESCAPE_FORMAT_UNICODE:
				651	+ result.m_width_cb = escape_as_unicode_width;
				652	+ result.m_print_cb = escape_as_unicode_print;
				653	+ break;
				654	+ case DIAGNOSTICS_ESCAPE_FORMAT_BYTES:
				655	+ result.m_width_cb = escape_as_bytes_width;
				656	+ result.m_print_cb = escape_as_bytes_print;
				657	+ break;
				658	+ }
				659	+ }
				660	+
				661	+ return result;
				662	+}
				663	+
				664	/* Implementation of class layout. */
				665
				666	/* Constructor for class layout.
				667	@@ -975,8 +1186,10 @@ layout::layout (diagnostic_context * con
				668	diagnostic_t diagnostic_kind)
				669	: m_context (context),
				670	m_pp (context->printer),
				671	+ m_policy (make_policy (context, richloc)),
				672	m_primary_loc (richloc->get_range (0)->m_loc),
				673	- m_exploc (richloc->get_expanded_location (0), context->tabstop),
				674	+ m_exploc (richloc->get_expanded_location (0), m_policy,
				675	+ LOCATION_ASPECT_CARET),
				676	m_colorizer (context, diagnostic_kind),
				677	m_colorize_source_p (context->colorize_source_p),
				678	m_show_labels_p (context->show_labels_p),
				679	@@ -986,7 +1199,8 @@ layout::layout (diagnostic_context * con
				680	m_fixit_hints (richloc->get_num_fixit_hints ()),
				681	m_line_spans (1 + richloc->get_num_locations ()),
				682	m_linenum_width (0),
				683	- m_x_offset_display (0)
				684	+ m_x_offset_display (0),
				685	+ m_escape_on_output (richloc->escape_on_output_p ())
				686	{
				687	for (unsigned int idx = 0; idx < richloc->get_num_locations (); idx++)
				688	{
				689	@@ -1072,10 +1286,13 @@ layout::maybe_add_location_range (const
				690
				691	/* Everything is now known to be in the correct source file,
				692	but it may require further sanitization. */
				693	- layout_range ri (exploc_with_display_col (start, m_context->tabstop),
				694	- exploc_with_display_col (finish, m_context->tabstop),
				695	+ layout_range ri (exploc_with_display_col (start, m_policy,
				696	+ LOCATION_ASPECT_START),
				697	+ exploc_with_display_col (finish, m_policy,
				698	+ LOCATION_ASPECT_FINISH),
				699	loc_range->m_range_display_kind,
				700	- exploc_with_display_col (caret, m_context->tabstop),
				701	+ exploc_with_display_col (caret, m_policy,
				702	+ LOCATION_ASPECT_CARET),
				703	original_idx, loc_range->m_label);
				704
				705	/* If we have a range that finishes before it starts (perhaps
				706	@@ -1409,7 +1626,7 @@ layout::calculate_x_offset_display ()
				707	= get_line_bytes_without_trailing_whitespace (line.get_buffer (),
				708	line.length ());
				709	int eol_display_column
				710	- = cpp_display_width (line.get_buffer (), line_bytes, m_context->tabstop);
				711	+ = cpp_display_width (line.get_buffer (), line_bytes, m_policy);
				712	if (caret_display_column > eol_display_column
				713	\|\| !caret_display_column)
				714	{
				715	@@ -1488,7 +1705,7 @@ layout::print_source_line (linenum_type
				716	/* This object helps to keep track of which display column we are at, which is
				717	necessary for computing the line bounds in display units, for doing
				718	tab expansion, and for implementing m_x_offset_display. */
				719	- cpp_display_width_computation dw (line, line_bytes, m_context->tabstop);
				720	+ cpp_display_width_computation dw (line, line_bytes, m_policy);
				721
				722	/* Skip the first m_x_offset_display display columns. In case the leading
				723	portion that will be skipped ends with a character with wcwidth > 1, then
				724	@@ -1536,7 +1753,8 @@ layout::print_source_line (linenum_type
				725	tabs and replacing some control bytes with spaces as necessary. */
				726	const char *c = dw.next_byte ();
				727	const int start_disp_col = dw.display_cols_processed () + 1;
				728	- const int this_display_width = dw.process_next_codepoint ();
				729	+ cpp_decoded_char cp;
				730	+ const int this_display_width = dw.process_next_codepoint (&cp);
				731	if (*c == '\t')
				732	{
				733	/* The returned display width is the number of spaces into which the
				734	@@ -1545,15 +1763,6 @@ layout::print_source_line (linenum_type
				735	pp_space (m_pp);
				736	continue;
				737	}
				738	- if (*c == '\0' \|\| *c == '\r')
				739	- {
				740	- /* cpp_wcwidth() promises to return 1 for all control bytes, and we
				741	- want to output these as a single space too, so this case is
				742	- actually the same as the '\t' case. */
				743	- gcc_assert (this_display_width == 1);
				744	- pp_space (m_pp);
				745	- continue;
				746	- }
				747
				748	/* We have a (possibly multibyte) character to output; update the line
				749	bounds if it is not whitespace. */
				750	@@ -1565,7 +1774,8 @@ layout::print_source_line (linenum_type
				751	}
				752
				753	/* Output the character. */
				754	- while (c != dw.next_byte ()) pp_character (m_pp, *c++);
				755	+ m_policy.m_print_cb (m_pp, cp);
				756	+ c = dw.next_byte ();
				757	}
				758	print_newline ();
				759	return lbounds;
				760	@@ -1664,14 +1874,14 @@ layout::print_annotation_line (linenum_t
				761	class line_label
				762	{
				763	public:
				764	- line_label (diagnostic_context *context, int state_idx, int column,
				765	+ line_label (const cpp_char_column_policy &policy,
				766	+ int state_idx, int column,
				767	label_text text)
				768	: m_state_idx (state_idx), m_column (column),
				769	m_text (text), m_label_line (0), m_has_vbar (true)
				770	{
				771	const int bytes = strlen (text.m_buffer);
				772	- m_display_width
				773	- = cpp_display_width (text.m_buffer, bytes, context->tabstop);
				774	+ m_display_width = cpp_display_width (text.m_buffer, bytes, policy);
				775	}
				776
				777	/* Sorting is primarily by column, then by state index. */
				778	@@ -1731,7 +1941,7 @@ layout::print_any_labels (linenum_type r
				779	if (text.m_buffer == NULL)
				780	continue;
				781
				782	- labels.safe_push (line_label (m_context, i, disp_col, text));
				783	+ labels.safe_push (line_label (m_policy, i, disp_col, text));
				784	}
				785	}
				786
				787	@@ -2011,7 +2221,7 @@ public:
				788
				789	/* Get the range of bytes or display columns that HINT would affect. */
				790	static column_range
				791	-get_affected_range (diagnostic_context *context,
				792	+get_affected_range (const cpp_char_column_policy &policy,
				793	const fixit_hint *hint, enum column_unit col_unit)
				794	{
				795	expanded_location exploc_start = expand_location (hint->get_start_loc ());
				796	@@ -2022,13 +2232,11 @@ get_affected_range (diagnostic_context *
				797	int finish_column;
				798	if (col_unit == CU_DISPLAY_COLS)
				799	{
				800	- start_column
				801	- = location_compute_display_column (exploc_start, context->tabstop);
				802	+ start_column = location_compute_display_column (exploc_start, policy);
				803	if (hint->insertion_p ())
				804	finish_column = start_column - 1;
				805	else
				806	- finish_column
				807	- = location_compute_display_column (exploc_finish, context->tabstop);
				808	+ finish_column = location_compute_display_column (exploc_finish, policy);
				809	}
				810	else
				811	{
				812	@@ -2041,12 +2249,13 @@ get_affected_range (diagnostic_context *
				813	/* Get the range of display columns that would be printed for HINT. */
				814
				815	static column_range
				816	-get_printed_columns (diagnostic_context *context, const fixit_hint *hint)
				817	+get_printed_columns (const cpp_char_column_policy &policy,
				818	+ const fixit_hint *hint)
				819	{
				820	expanded_location exploc = expand_location (hint->get_start_loc ());
				821	- int start_column = location_compute_display_column (exploc, context->tabstop);
				822	+ int start_column = location_compute_display_column (exploc, policy);
				823	int hint_width = cpp_display_width (hint->get_string (), hint->get_length (),
				824	- context->tabstop);
				825	+ policy);
				826	int final_hint_column = start_column + hint_width - 1;
				827	if (hint->insertion_p ())
				828	{
				829	@@ -2056,8 +2265,7 @@ get_printed_columns (diagnostic_context
				830	{
				831	exploc = expand_location (hint->get_next_loc ());
				832	--exploc.column;
				833	- int finish_column
				834	- = location_compute_display_column (exploc, context->tabstop);
				835	+ int finish_column = location_compute_display_column (exploc, policy);
				836	return column_range (start_column,
				837	MAX (finish_column, final_hint_column));
				838	}
				839	@@ -2075,13 +2283,13 @@ public:
				840	column_range affected_columns,
				841	column_range printed_columns,
				842	const char *new_text, size_t new_text_len,
				843	- int tabstop)
				844	+ const cpp_char_column_policy &policy)
				845	: m_affected_bytes (affected_bytes),
				846	m_affected_columns (affected_columns),
				847	m_printed_columns (printed_columns),
				848	m_text (xstrdup (new_text)),
				849	m_byte_length (new_text_len),
				850	- m_tabstop (tabstop),
				851	+ m_policy (policy),
				852	m_alloc_sz (new_text_len + 1)
				853	{
				854	compute_display_cols ();
				855	@@ -2099,7 +2307,7 @@ public:
				856
				857	void compute_display_cols ()
				858	{
				859	- m_display_cols = cpp_display_width (m_text, m_byte_length, m_tabstop);
				860	+ m_display_cols = cpp_display_width (m_text, m_byte_length, m_policy);
				861	}
				862
				863	void overwrite (int dst_offset, const char_span &src_span)
				864	@@ -2127,7 +2335,7 @@ public:
				865	char *m_text;
				866	size_t m_byte_length; /* Not including null-terminator. */
				867	int m_display_cols;
				868	- int m_tabstop;
				869	+ const cpp_char_column_policy &m_policy;
				870	size_t m_alloc_sz;
				871	};
				872
				873	@@ -2163,15 +2371,16 @@ correction::ensure_terminated ()
				874	class line_corrections
				875	{
				876	public:
				877	- line_corrections (diagnostic_context *context, const char *filename,
				878	+ line_corrections (const char_display_policy &policy,
				879	+ const char *filename,
				880	linenum_type row)
				881	- : m_context (context), m_filename (filename), m_row (row)
				882	+ : m_policy (policy), m_filename (filename), m_row (row)
				883	{}
				884	~line_corrections ();
				885
				886	void add_hint (const fixit_hint *hint);
				887
				888	- diagnostic_context *m_context;
				889	+ const char_display_policy &m_policy;
				890	const char *m_filename;
				891	linenum_type m_row;
				892	auto_vec <correction *> m_corrections;
				893	@@ -2217,10 +2426,10 @@ source_line::source_line (const char *fi
				894	void
				895	line_corrections::add_hint (const fixit_hint *hint)
				896	{
				897	- column_range affected_bytes = get_affected_range (m_context, hint, CU_BYTES);
				898	- column_range affected_columns = get_affected_range (m_context, hint,
				899	+ column_range affected_bytes = get_affected_range (m_policy, hint, CU_BYTES);
				900	+ column_range affected_columns = get_affected_range (m_policy, hint,
				901	CU_DISPLAY_COLS);
				902	- column_range printed_columns = get_printed_columns (m_context, hint);
				903	+ column_range printed_columns = get_printed_columns (m_policy, hint);
				904
				905	/* Potentially consolidate. */
				906	if (!m_corrections.is_empty ())
				907	@@ -2289,7 +2498,7 @@ line_corrections::add_hint (const fixit_
				908	printed_columns,
				909	hint->get_string (),
				910	hint->get_length (),
				911	- m_context->tabstop));
				912	+ m_policy));
				913	}
				914
				915	/* If there are any fixit hints on source line ROW, print them.
				916	@@ -2303,7 +2512,7 @@ layout::print_trailing_fixits (linenum_t
				917	{
				918	/* Build a list of correction instances for the line,
				919	potentially consolidating hints (for the sake of readability). */
				920	- line_corrections corrections (m_context, m_exploc.file, row);
				921	+ line_corrections corrections (m_policy, m_exploc.file, row);
				922	for (unsigned int i = 0; i < m_fixit_hints.length (); i++)
				923	{
				924	const fixit_hint *hint = m_fixit_hints[i];
				925	@@ -2646,6 +2855,59 @@ namespace selftest {
				926
				927	/* Selftests for diagnostic_show_locus. */
				928
				929	+/* Verify that cpp_display_width correctly handles escaping. */
				930	+
				931	+static void
				932	+test_display_widths ()
				933	+{
				934	+ gcc_rich_location richloc (UNKNOWN_LOCATION);
				935	+
				936	+ /* U+03C0 "GREEK SMALL LETTER PI". */
				937	+ const char *pi = "\xCF\x80";
				938	+ /* U+1F642 "SLIGHTLY SMILING FACE". */
				939	+ const char *emoji = "\xF0\x9F\x99\x82";
				940	+ /* Stray trailing byte of a UTF-8 character. */
				941	+ const char *stray = "\xBF";
				942	+ /* U+10FFFF. */
				943	+ const char *max_codepoint = "\xF4\x8F\xBF\xBF";
				944	+
				945	+ /* No escaping. */
				946	+ {
				947	+ test_diagnostic_context dc;
				948	+ char_display_policy policy (make_policy (dc, richloc));
				949	+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 1);
				950	+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 2);
				951	+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 1);
				952	+ /* Don't check width of U+10FFFF; it's in a private use plane. */
				953	+ }
				954	+
				955	+ richloc.set_escape_on_output (true);
				956	+
				957	+ {
				958	+ test_diagnostic_context dc;
				959	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
				960	+ char_display_policy policy (make_policy (dc, richloc));
				961	+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8);
				962	+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 9);
				963	+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4);
				964	+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint),
				965	+ policy),
				966	+ strlen ("<U+10FFFF>"));
				967	+ }
				968	+
				969	+ {
				970	+ test_diagnostic_context dc;
				971	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
				972	+ char_display_policy policy (make_policy (dc, richloc));
				973	+ ASSERT_EQ (cpp_display_width (pi, strlen (pi), policy), 8);
				974	+ ASSERT_EQ (cpp_display_width (emoji, strlen (emoji), policy), 16);
				975	+ ASSERT_EQ (cpp_display_width (stray, strlen (stray), policy), 4);
				976	+ ASSERT_EQ (cpp_display_width (max_codepoint, strlen (max_codepoint),
				977	+ policy),
				978	+ 16);
				979	+ }
				980	+}
				981	+
				982	/* For precise tests of the layout, make clear where the source line will
				983	start. test_left_margin sets the total byte count from the left side of the
				984	screen to the start of source lines, after the line number and the separator,
				985	@@ -2715,10 +2977,10 @@ test_layout_x_offset_display_utf8 (const
				986	char_span lspan = location_get_source_line (tmp.get_filename (), 1);
				987	ASSERT_EQ (line_display_cols,
				988	cpp_display_width (lspan.get_buffer (), lspan.length (),
				989	- def_tabstop));
				990	+ def_policy ()));
				991	ASSERT_EQ (line_display_cols,
				992	location_compute_display_column (expand_location (line_end),
				993	- def_tabstop));
				994	+ def_policy ()));
				995	ASSERT_EQ (0, memcmp (lspan.get_buffer () + (emoji_col - 1),
				996	"\xf0\x9f\x98\x82\xf0\x9f\x98\x82", 8));
				997
				998	@@ -2866,12 +3128,13 @@ test_layout_x_offset_display_tab (const
				999	ASSERT_EQ ('\t', *(lspan.get_buffer () + (tab_col - 1)));
				1000	for (int tabstop = 1; tabstop != num_tabstops; ++tabstop)
				1001	{
				1002	+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
				1003	ASSERT_EQ (line_bytes + extra_width[tabstop],
				1004	cpp_display_width (lspan.get_buffer (), lspan.length (),
				1005	- tabstop));
				1006	+ policy));
				1007	ASSERT_EQ (line_bytes + extra_width[tabstop],
				1008	location_compute_display_column (expand_location (line_end),
				1009	- tabstop));
				1010	+ policy));
				1011	}
				1012
				1013	/* Check that the tab is expanded to the expected number of spaces. */
				1014	@@ -4003,6 +4266,43 @@ test_one_liner_labels_utf8 ()
				1015	" bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n",
				1016	pp_formatted_text (dc.printer));
				1017	}
				1018	+
				1019	+ /* Example of escaping the source lines. */
				1020	+ {
				1021	+ text_range_label label0 ("label 0\xf0\x9f\x98\x82");
				1022	+ text_range_label label1 ("label 1\xcf\x80");
				1023	+ text_range_label label2 ("label 2\xcf\x80");
				1024	+ gcc_rich_location richloc (foo, &label0);
				1025	+ richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1);
				1026	+ richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2);
				1027	+ richloc.set_escape_on_output (true);
				1028	+
				1029	+ {
				1030	+ test_diagnostic_context dc;
				1031	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
				1032	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1033	+ ASSERT_STREQ (" <U+1F602>_foo = <U+03C0>_bar.<U+1F602>_field<U+03C0>;\n"
				1034	+ " ^~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~\n"
				1035	+ " \| \| \|\n"
				1036	+ " \| \| label 2\xcf\x80\n"
				1037	+ " \| label 1\xcf\x80\n"
				1038	+ " label 0\xf0\x9f\x98\x82\n",
				1039	+ pp_formatted_text (dc.printer));
				1040	+ }
				1041	+ {
				1042	+ test_diagnostic_context dc;
				1043	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
				1044	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1045	+ ASSERT_STREQ
				1046	+ (" <f0><9f><98><82>_foo = <cf><80>_bar.<f0><9f><98><82>_field<cf><80>;\n"
				1047	+ " ^~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
				1048	+ " \| \| \|\n"
				1049	+ " \| \| label 2\xcf\x80\n"
				1050	+ " \| label 1\xcf\x80\n"
				1051	+ " label 0\xf0\x9f\x98\x82\n",
				1052	+ pp_formatted_text (dc.printer));
				1053	+ }
				1054	+ }
				1055	}
				1056
				1057	/* Make sure that colorization codes don't interrupt a multibyte
				1058	@@ -4057,9 +4357,9 @@ test_diagnostic_show_locus_one_liner_utf
				1059
				1060	char_span lspan = location_get_source_line (tmp.get_filename (), 1);
				1061	ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length (),
				1062	- def_tabstop));
				1063	+ def_policy ()));
				1064	ASSERT_EQ (25, location_compute_display_column (expand_location (line_end),
				1065	- def_tabstop));
				1066	+ def_policy ()));
				1067
				1068	test_one_liner_simple_caret_utf8 ();
				1069	test_one_liner_caret_and_range_utf8 ();
				1070	@@ -4445,30 +4745,31 @@ test_overlapped_fixit_printing (const li
				1071	pp_formatted_text (dc.printer));
				1072
				1073	/* Unit-test the line_corrections machinery. */
				1074	+ char_display_policy policy (make_policy (dc, richloc));
				1075	ASSERT_EQ (3, richloc.get_num_fixit_hints ());
				1076	const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
				1077	ASSERT_EQ (column_range (12, 12),
				1078	- get_affected_range (&dc, hint_0, CU_BYTES));
				1079	+ get_affected_range (policy, hint_0, CU_BYTES));
				1080	ASSERT_EQ (column_range (12, 12),
				1081	- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
				1082	- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
				1083	+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS));
				1084	+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0));
				1085	const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
				1086	ASSERT_EQ (column_range (18, 18),
				1087	- get_affected_range (&dc, hint_1, CU_BYTES));
				1088	+ get_affected_range (policy, hint_1, CU_BYTES));
				1089	ASSERT_EQ (column_range (18, 18),
				1090	- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
				1091	- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
				1092	+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS));
				1093	+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1));
				1094	const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
				1095	ASSERT_EQ (column_range (29, 28),
				1096	- get_affected_range (&dc, hint_2, CU_BYTES));
				1097	+ get_affected_range (policy, hint_2, CU_BYTES));
				1098	ASSERT_EQ (column_range (29, 28),
				1099	- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
				1100	- ASSERT_EQ (column_range (29, 29), get_printed_columns (&dc, hint_2));
				1101	+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS));
				1102	+ ASSERT_EQ (column_range (29, 29), get_printed_columns (policy, hint_2));
				1103
				1104	/* Add each hint in turn to a line_corrections instance,
				1105	and verify that they are consolidated into one correction instance
				1106	as expected. */
				1107	- line_corrections lc (&dc, tmp.get_filename (), 1);
				1108	+ line_corrections lc (policy, tmp.get_filename (), 1);
				1109
				1110	/* The first replace hint by itself. */
				1111	lc.add_hint (hint_0);
				1112	@@ -4660,30 +4961,31 @@ test_overlapped_fixit_printing_utf8 (con
				1113	pp_formatted_text (dc.printer));
				1114
				1115	/* Unit-test the line_corrections machinery. */
				1116	+ char_display_policy policy (make_policy (dc, richloc));
				1117	ASSERT_EQ (3, richloc.get_num_fixit_hints ());
				1118	const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
				1119	ASSERT_EQ (column_range (14, 14),
				1120	- get_affected_range (&dc, hint_0, CU_BYTES));
				1121	+ get_affected_range (policy, hint_0, CU_BYTES));
				1122	ASSERT_EQ (column_range (12, 12),
				1123	- get_affected_range (&dc, hint_0, CU_DISPLAY_COLS));
				1124	- ASSERT_EQ (column_range (12, 22), get_printed_columns (&dc, hint_0));
				1125	+ get_affected_range (policy, hint_0, CU_DISPLAY_COLS));
				1126	+ ASSERT_EQ (column_range (12, 22), get_printed_columns (policy, hint_0));
				1127	const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
				1128	ASSERT_EQ (column_range (22, 22),
				1129	- get_affected_range (&dc, hint_1, CU_BYTES));
				1130	+ get_affected_range (policy, hint_1, CU_BYTES));
				1131	ASSERT_EQ (column_range (18, 18),
				1132	- get_affected_range (&dc, hint_1, CU_DISPLAY_COLS));
				1133	- ASSERT_EQ (column_range (18, 20), get_printed_columns (&dc, hint_1));
				1134	+ get_affected_range (policy, hint_1, CU_DISPLAY_COLS));
				1135	+ ASSERT_EQ (column_range (18, 20), get_printed_columns (policy, hint_1));
				1136	const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
				1137	ASSERT_EQ (column_range (35, 34),
				1138	- get_affected_range (&dc, hint_2, CU_BYTES));
				1139	+ get_affected_range (policy, hint_2, CU_BYTES));
				1140	ASSERT_EQ (column_range (30, 29),
				1141	- get_affected_range (&dc, hint_2, CU_DISPLAY_COLS));
				1142	- ASSERT_EQ (column_range (30, 30), get_printed_columns (&dc, hint_2));
				1143	+ get_affected_range (policy, hint_2, CU_DISPLAY_COLS));
				1144	+ ASSERT_EQ (column_range (30, 30), get_printed_columns (policy, hint_2));
				1145
				1146	/* Add each hint in turn to a line_corrections instance,
				1147	and verify that they are consolidated into one correction instance
				1148	as expected. */
				1149	- line_corrections lc (&dc, tmp.get_filename (), 1);
				1150	+ line_corrections lc (policy, tmp.get_filename (), 1);
				1151
				1152	/* The first replace hint by itself. */
				1153	lc.add_hint (hint_0);
				1154	@@ -4877,15 +5179,16 @@ test_overlapped_fixit_printing_2 (const
				1155	richloc.add_fixit_insert_before (col_21, "}");
				1156
				1157	/* These fixits should be accepted; they can't be consolidated. */
				1158	+ char_display_policy policy (make_policy (dc, richloc));
				1159	ASSERT_EQ (2, richloc.get_num_fixit_hints ());
				1160	const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
				1161	ASSERT_EQ (column_range (23, 22),
				1162	- get_affected_range (&dc, hint_0, CU_BYTES));
				1163	- ASSERT_EQ (column_range (23, 23), get_printed_columns (&dc, hint_0));
				1164	+ get_affected_range (policy, hint_0, CU_BYTES));
				1165	+ ASSERT_EQ (column_range (23, 23), get_printed_columns (policy, hint_0));
				1166	const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
				1167	ASSERT_EQ (column_range (21, 20),
				1168	- get_affected_range (&dc, hint_1, CU_BYTES));
				1169	- ASSERT_EQ (column_range (21, 21), get_printed_columns (&dc, hint_1));
				1170	+ get_affected_range (policy, hint_1, CU_BYTES));
				1171	+ ASSERT_EQ (column_range (21, 21), get_printed_columns (policy, hint_1));
				1172
				1173	/* Verify that they're printed correctly. */
				1174	diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1175	@@ -5152,10 +5455,11 @@ test_tab_expansion (const line_table_cas
				1176	....................123 45678901234 56789012345 columns */
				1177
				1178	const int tabstop = 8;
				1179	+ cpp_char_column_policy policy (tabstop, cpp_wcwidth);
				1180	const int first_non_ws_byte_col = 7;
				1181	const int right_quote_byte_col = 15;
				1182	const int last_byte_col = 25;
				1183	- ASSERT_EQ (35, cpp_display_width (content, last_byte_col, tabstop));
				1184	+ ASSERT_EQ (35, cpp_display_width (content, last_byte_col, policy));
				1185
				1186	temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
				1187	line_table_test ltt (case_);
				1188	@@ -5198,6 +5502,114 @@ test_tab_expansion (const line_table_cas
				1189	}
				1190	}
				1191
				1192	+/* Verify that the escaping machinery can cope with a variety of different
				1193	+ invalid bytes. */
				1194	+
				1195	+static void
				1196	+test_escaping_bytes_1 (const line_table_case &case_)
				1197	+{
				1198	+ const char content[] = "before\0\1\2\3\r\x80\xff""after\n";
				1199	+ const size_t sz = sizeof (content);
				1200	+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz);
				1201	+ line_table_test ltt (case_);
				1202	+ const line_map_ordinary *ord_map = linemap_check_ordinary
				1203	+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0));
				1204	+ linemap_line_start (line_table, 1, 100);
				1205	+
				1206	+ location_t finish
				1207	+ = linemap_position_for_line_and_column (line_table, ord_map, 1,
				1208	+ strlen (content));
				1209	+
				1210	+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS)
				1211	+ return;
				1212	+
				1213	+ /* Locations of the NUL and \r bytes. */
				1214	+ location_t nul_loc
				1215	+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 7);
				1216	+ location_t r_loc
				1217	+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 11);
				1218	+ gcc_rich_location richloc (nul_loc);
				1219	+ richloc.add_range (r_loc);
				1220	+
				1221	+ {
				1222	+ test_diagnostic_context dc;
				1223	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1224	+ ASSERT_STREQ (" before \1\2\3 \x80\xff""after\n"
				1225	+ " ^ ~\n",
				1226	+ pp_formatted_text (dc.printer));
				1227	+ }
				1228	+ richloc.set_escape_on_output (true);
				1229	+ {
				1230	+ test_diagnostic_context dc;
				1231	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
				1232	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1233	+ ASSERT_STREQ
				1234	+ (" before<U+0000><U+0001><U+0002><U+0003><U+000D><80><ff>after\n"
				1235	+ " ^~~~~~~~ ~~~~~~~~\n",
				1236	+ pp_formatted_text (dc.printer));
				1237	+ }
				1238	+ {
				1239	+ test_diagnostic_context dc;
				1240	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
				1241	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1242	+ ASSERT_STREQ (" before<00><01><02><03><0d><80><ff>after\n"
				1243	+ " ^~~~ ~~~~\n",
				1244	+ pp_formatted_text (dc.printer));
				1245	+ }
				1246	+}
				1247	+
				1248	+/* As above, but verify that we handle the initial byte of a line
				1249	+ correctly. */
				1250	+
				1251	+static void
				1252	+test_escaping_bytes_2 (const line_table_case &case_)
				1253	+{
				1254	+ const char content[] = "\0after\n";
				1255	+ const size_t sz = sizeof (content);
				1256	+ temp_source_file tmp (SELFTEST_LOCATION, ".c", content, sz);
				1257	+ line_table_test ltt (case_);
				1258	+ const line_map_ordinary *ord_map = linemap_check_ordinary
				1259	+ (linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 0));
				1260	+ linemap_line_start (line_table, 1, 100);
				1261	+
				1262	+ location_t finish
				1263	+ = linemap_position_for_line_and_column (line_table, ord_map, 1,
				1264	+ strlen (content));
				1265	+
				1266	+ if (finish > LINE_MAP_MAX_LOCATION_WITH_COLS)
				1267	+ return;
				1268	+
				1269	+ /* Location of the NUL byte. */
				1270	+ location_t nul_loc
				1271	+ = linemap_position_for_line_and_column (line_table, ord_map, 1, 1);
				1272	+ gcc_rich_location richloc (nul_loc);
				1273	+
				1274	+ {
				1275	+ test_diagnostic_context dc;
				1276	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1277	+ ASSERT_STREQ (" after\n"
				1278	+ " ^\n",
				1279	+ pp_formatted_text (dc.printer));
				1280	+ }
				1281	+ richloc.set_escape_on_output (true);
				1282	+ {
				1283	+ test_diagnostic_context dc;
				1284	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_UNICODE;
				1285	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1286	+ ASSERT_STREQ (" <U+0000>after\n"
				1287	+ " ^~~~~~~~\n",
				1288	+ pp_formatted_text (dc.printer));
				1289	+ }
				1290	+ {
				1291	+ test_diagnostic_context dc;
				1292	+ dc.escape_format = DIAGNOSTICS_ESCAPE_FORMAT_BYTES;
				1293	+ diagnostic_show_locus (&dc, &richloc, DK_ERROR);
				1294	+ ASSERT_STREQ (" <00>after\n"
				1295	+ " ^~~~\n",
				1296	+ pp_formatted_text (dc.printer));
				1297	+ }
				1298	+}
				1299	+
				1300	/* Verify that line numbers are correctly printed for the case of
				1301	a multiline range in which the width of the line numbers changes
				1302	(e.g. from "9" to "10"). */
				1303	@@ -5254,6 +5666,8 @@ diagnostic_show_locus_c_tests ()
				1304	test_layout_range_for_single_line ();
				1305	test_layout_range_for_multiple_lines ();
				1306
				1307	+ test_display_widths ();
				1308	+
				1309	for_each_line_table_case (test_layout_x_offset_display_utf8);
				1310	for_each_line_table_case (test_layout_x_offset_display_tab);
				1311
				1312	@@ -5274,6 +5688,8 @@ diagnostic_show_locus_c_tests ()
				1313	for_each_line_table_case (test_fixit_replace_containing_newline);
				1314	for_each_line_table_case (test_fixit_deletion_affecting_newline);
				1315	for_each_line_table_case (test_tab_expansion);
				1316	+ for_each_line_table_case (test_escaping_bytes_1);
				1317	+ for_each_line_table_case (test_escaping_bytes_2);
				1318
				1319	test_line_numbers_multiline_range ();
				1320	}
				1321	diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
				1322	--- a/gcc/doc/invoke.texi 2021-12-13 23:23:05.764437151 -0800
				1323	+++ b/gcc/doc/invoke.texi 2021-12-14 01:16:01.553943061 -0800
				1324	@@ -312,7 +312,8 @@ Objective-C and Objective-C++ Dialects}.
				1325	-fdiagnostics-show-path-depths @gol
				1326	-fno-show-column @gol
				1327	-fdiagnostics-column-unit=@r{[}display@r{\|}byte@r{]} @gol
				1328	--fdiagnostics-column-origin=@var{origin}}
				1329	+-fdiagnostics-column-origin=@var{origin} @gol
				1330	+-fdiagnostics-escape-format=@r{[}unicode@r{\|}bytes@r{]}}
				1331
				1332	@item Warning Options
				1333	@xref{Warning Options,,Options to Request or Suppress Warnings}.
				1334	@@ -5083,6 +5084,38 @@ first column. The default value of 1 co
				1335	behavior and to the GNU style guide. Some utilities may perform better with an
				1336	origin of 0; any non-negative value may be specified.
				1337
				1338	+@item -fdiagnostics-escape-format=@var{FORMAT}
				1339	+@opindex fdiagnostics-escape-format
				1340	+When GCC prints pertinent source lines for a diagnostic it normally attempts
				1341	+to print the source bytes directly. However, some diagnostics relate to encoding
				1342	+issues in the source file, such as malformed UTF-8, or issues with Unicode
				1343	+normalization. These diagnostics are flagged so that GCC will escape bytes
				1344	+that are not printable ASCII when printing their pertinent source lines.
				1345	+
				1346	+This option controls how such bytes should be escaped.
				1347	+
				1348	+The default @var{FORMAT}, @samp{unicode}, displays Unicode characters that
				1349	+are not printable ASCII in the form @samp{<U+XXXX>}, and bytes that do not
				1350	+correspond to a Unicode character validly encoded in UTF-8 will be
				1351	+displayed as hexadecimal in the form @samp{<XX>}.
				1352	+
				1353	+For example, a source line containing the string @samp{before} followed by the
				1354	+Unicode character U+03C0 (``GREEK SMALL LETTER PI'', with UTF-8 encoding
				1355	+0xCF 0x80) followed by the byte 0xBF (a stray UTF-8 trailing byte), followed by
				1356	+the string @samp{after} will be printed for such a diagnostic as:
				1357	+
				1358	+@smallexample
				1359	+ before<U+03C0><BF>after
				1360	+@end smallexample
				1361	+
				1362	+Setting @var{FORMAT} to @samp{bytes} will display all non-printable-ASCII bytes
				1363	+in the form @samp{<XX>}, thus showing the underlying encoding of non-ASCII
				1364	+Unicode characters. For the example above, the following will be printed:
				1365	+
				1366	+@smallexample
				1367	+ before<CF><80><BF>after
				1368	+@end smallexample
				1369	+
				1370	@item -fdiagnostics-format=@var{FORMAT}
				1371	@opindex fdiagnostics-format
				1372	Select a different format for printing diagnostics.
				1373	@@ -5150,9 +5183,11 @@ might be printed in JSON form (after for
				1374	@}
				1375	@}
				1376	],
				1377	+ "escape-source": false,
				1378	"message": "...this statement, but the latter is @dots{}"
				1379	@}
				1380	]
				1381	+ "escape-source": false,
				1382	"column-origin": 1,
				1383	@},
				1384	@dots{}
				1385	@@ -5239,6 +5274,7 @@ of the expression, which have labels. I
				1386	"label": "T @{aka struct t@}"
				1387	@}
				1388	],
				1389	+ "escape-source": false,
				1390	"message": "invalid operands to binary + @dots{}"
				1391	@}
				1392	@end smallexample
				1393	@@ -5292,6 +5328,7 @@ might be printed in JSON form as:
				1394	@}
				1395	@}
				1396	],
				1397	+ "escape-source": false,
				1398	"message": "\u2018struct s\u2019 has no member named @dots{}"
				1399	@}
				1400	@end smallexample
				1401	@@ -5349,6 +5386,10 @@ For example, the intraprocedural example
				1402	]
				1403	@end smallexample
				1404
				1405	+Diagnostics have a boolean attribute @code{escape-source}, hinting whether
				1406	+non-ASCII bytes should be escaped when printing the pertinent lines of
				1407	+source code (@code{true} for diagnostics involving source encoding issues).
				1408	+
				1409	@end table
				1410
				1411	@node Warning Options
				1412	diff --git a/gcc/input.c b/gcc/input.c
				1413	--- a/gcc/input.c 2021-07-27 23:55:07.328287915 -0700
				1414	+++ b/gcc/input.c 2021-12-14 01:16:01.553943061 -0800
				1415	@@ -913,7 +913,8 @@ make_location (location_t caret, source_
				1416	source line in order to calculate the display width. If that cannot be done
				1417	for any reason, then returns the byte column as a fallback. */
				1418	int
				1419	-location_compute_display_column (expanded_location exploc, int tabstop)
				1420	+location_compute_display_column (expanded_location exploc,
				1421	+ const cpp_char_column_policy &policy)
				1422	{
				1423	if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
				1424	return exploc.column;
				1425	@@ -921,7 +922,7 @@ location_compute_display_column (expande
				1426	/* If line is NULL, this function returns exploc.column which is the
				1427	desired fallback. */
				1428	return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
				1429	- exploc.column, tabstop);
				1430	+ exploc.column, policy);
				1431	}
				1432
				1433	/* Dump statistics to stderr about the memory usage of the line_table
				1434	@@ -3611,43 +3612,50 @@ test_line_offset_overflow ()
				1435	void test_cpp_utf8 ()
				1436	{
				1437	const int def_tabstop = 8;
				1438	+ cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
				1439	+
				1440	/* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
				1441	{
				1442	- int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
				1443	+ int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
				1444	ASSERT_EQ (8, w_bad);
				1445	- int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
				1446	+ int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
				1447	ASSERT_EQ (5, w_ctrl);
				1448	}
				1449
				1450	/* Verify that wcwidth of valid UTF-8 is as expected. */
				1451	{
				1452	- const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
				1453	+ const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
				1454	ASSERT_EQ (1, w_pi);
				1455	- const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
				1456	+ const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
				1457	ASSERT_EQ (2, w_emoji);
				1458	const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
				1459	- def_tabstop);
				1460	+ policy);
				1461	ASSERT_EQ (1, w_umlaut_precomposed);
				1462	const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
				1463	- def_tabstop);
				1464	+ policy);
				1465	ASSERT_EQ (1, w_umlaut_combining);
				1466	- const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
				1467	+ const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
				1468	ASSERT_EQ (2, w_han);
				1469	- const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
				1470	+ const int w_ascii = cpp_display_width ("GCC", 3, policy);
				1471	ASSERT_EQ (3, w_ascii);
				1472	const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
				1473	"\x9f! \xe4\xb8\xba y\xcc\x88",
				1474	- 24, def_tabstop);
				1475	+ 24, policy);
				1476	ASSERT_EQ (18, w_mixed);
				1477	}
				1478
				1479	/* Verify that display width properly expands tabs. */
				1480	{
				1481	const char *tstr = "\tabc\td";
				1482	- ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
				1483	- ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
				1484	- ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
				1485	- ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
				1486	+ ASSERT_EQ (6, cpp_display_width (tstr, 6,
				1487	+ cpp_char_column_policy (1, cpp_wcwidth)));
				1488	+ ASSERT_EQ (10, cpp_display_width (tstr, 6,
				1489	+ cpp_char_column_policy (3, cpp_wcwidth)));
				1490	+ ASSERT_EQ (17, cpp_display_width (tstr, 6,
				1491	+ cpp_char_column_policy (8, cpp_wcwidth)));
				1492	+ ASSERT_EQ (1,
				1493	+ cpp_display_column_to_byte_column
				1494	+ (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
				1495	}
				1496
				1497	/* Verify that cpp_byte_column_to_display_column can go past the end,
				1498	@@ -3660,13 +3668,13 @@ void test_cpp_utf8 ()
				1499	/* 111122223456
				1500	Byte columns. */
				1501
				1502	- ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
				1503	+ ASSERT_EQ (5, cpp_display_width (str, 6, policy));
				1504	ASSERT_EQ (105,
				1505	- cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
				1506	+ cpp_byte_column_to_display_column (str, 6, 106, policy));
				1507	ASSERT_EQ (10000,
				1508	- cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
				1509	+ cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
				1510	ASSERT_EQ (0,
				1511	- cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
				1512	+ cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
				1513	}
				1514
				1515	/* Verify that cpp_display_column_to_byte_column can go past the end,
				1516	@@ -3680,25 +3688,25 @@ void test_cpp_utf8 ()
				1517	/* 000000000000000000000000000000000111111
				1518	111122223333444456666777788889999012345
				1519	Byte columns. */
				1520	- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
				1521	+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
				1522	ASSERT_EQ (15,
				1523	- cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
				1524	+ cpp_display_column_to_byte_column (str, 15, 11, policy));
				1525	ASSERT_EQ (115,
				1526	- cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
				1527	+ cpp_display_column_to_byte_column (str, 15, 111, policy));
				1528	ASSERT_EQ (10000,
				1529	- cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
				1530	+ cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
				1531	ASSERT_EQ (0,
				1532	- cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
				1533	+ cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
				1534
				1535	/* Verify that we do not interrupt a UTF-8 sequence. */
				1536	- ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
				1537	+ ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
				1538
				1539	for (int byte_col = 1; byte_col <= 15; ++byte_col)
				1540	{
				1541	const int disp_col
				1542	- = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
				1543	+ = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
				1544	const int byte_col2
				1545	- = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
				1546	+ = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
				1547
				1548	/* If we ask for the display column in the middle of a UTF-8
				1549	sequence, it will return the length of the partial sequence,
				1550	diff --git a/gcc/input.h b/gcc/input.h
				1551	--- a/gcc/input.h 2021-07-27 23:55:07.328287915 -0700
				1552	+++ b/gcc/input.h 2021-12-14 01:16:01.553943061 -0800
				1553	@@ -39,8 +39,11 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESER
				1554	extern bool is_location_from_builtin_token (location_t);
				1555	extern expanded_location expand_location (location_t);
				1556
				1557	-extern int location_compute_display_column (expanded_location exploc,
				1558	- int tabstop);
				1559	+class cpp_char_column_policy;
				1560	+
				1561	+extern int
				1562	+location_compute_display_column (expanded_location exploc,
				1563	+ const cpp_char_column_policy &policy);
				1564
				1565	/* A class capturing the bounds of a buffer, to allow for run-time
				1566	bounds-checking in a checked build. */
				1567	diff --git a/gcc/opts.c b/gcc/opts.c
				1568	--- a/gcc/opts.c 2021-07-27 23:55:07.364288417 -0700
				1569	+++ b/gcc/opts.c 2021-12-14 01:16:01.553943061 -0800
				1570	@@ -2573,6 +2573,10 @@ common_handle_option (struct gcc_options
				1571	dc->column_origin = value;
				1572	break;
				1573
				1574	+ case OPT_fdiagnostics_escape_format_:
				1575	+ dc->escape_format = (enum diagnostics_escape_format)value;
				1576	+ break;
				1577	+
				1578	case OPT_fdiagnostics_show_cwe:
				1579	dc->show_cwe = value;
				1580	break;
				1581	diff --git a/gcc/selftest.c b/gcc/selftest.c
				1582	--- a/gcc/selftest.c 2021-07-27 23:55:07.500290315 -0700
				1583	+++ b/gcc/selftest.c 2021-12-14 01:16:01.557942991 -0800
				1584	@@ -193,6 +193,21 @@ temp_source_file::temp_source_file (cons
				1585	fclose (out);
				1586	}
				1587
				1588	+/* As above, but with a size, to allow for NUL bytes in CONTENT. */
				1589	+
				1590	+temp_source_file::temp_source_file (const location &loc,
				1591	+ const char *suffix,
				1592	+ const char *content,
				1593	+ size_t sz)
				1594	+: named_temp_file (suffix)
				1595	+{
				1596	+ FILE *out = fopen (get_filename (), "w");
				1597	+ if (!out)
				1598	+ fail_formatted (loc, "unable to open tempfile: %s", get_filename ());
				1599	+ fwrite (content, sz, 1, out);
				1600	+ fclose (out);
				1601	+}
				1602	+
				1603	/* Avoid introducing locale-specific differences in the results
				1604	by hardcoding open_quote and close_quote. */
				1605
				1606	diff --git a/gcc/selftest.h b/gcc/selftest.h
				1607	--- a/gcc/selftest.h 2021-07-27 23:55:07.500290315 -0700
				1608	+++ b/gcc/selftest.h 2021-12-14 01:16:01.557942991 -0800
				1609	@@ -112,6 +112,8 @@ class temp_source_file : public named_te
				1610	public:
				1611	temp_source_file (const location &loc, const char *suffix,
				1612	const char *content);
				1613	+ temp_source_file (const location &loc, const char *suffix,
				1614	+ const char *content, size_t sz);
				1615	};
				1616
				1617	/* RAII-style class for avoiding introducing locale-specific differences
				1618	diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c
				1619	--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-07-27 23:55:07.596291654 -0700
				1620	+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-1.c 2021-12-14 01:16:01.557942991 -0800
				1621	@@ -9,6 +9,7 @@
				1622
				1623	/* { dg-regexp "\"kind\": \"error\"" } */
				1624	/* { dg-regexp "\"column-origin\": 1" } */
				1625	+/* { dg-regexp "\"escape-source\": false" } */
				1626	/* { dg-regexp "\"message\": \"#error message\"" } */
				1627
				1628	/* { dg-regexp "\"caret\": \{" } */
				1629	diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c
				1630	--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-07-27 23:55:07.596291654 -0700
				1631	+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-2.c 2021-12-14 01:16:01.557942991 -0800
				1632	@@ -9,6 +9,7 @@
				1633
				1634	/* { dg-regexp "\"kind\": \"warning\"" } */
				1635	/* { dg-regexp "\"column-origin\": 1" } */
				1636	+/* { dg-regexp "\"escape-source\": false" } */
				1637	/* { dg-regexp "\"message\": \"#warning message\"" } */
				1638	/* { dg-regexp "\"option\": \"-Wcpp\"" } */
				1639	/* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
				1640	diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c
				1641	--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-07-27 23:55:07.596291654 -0700
				1642	+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-3.c 2021-12-14 01:16:01.557942991 -0800
				1643	@@ -9,6 +9,7 @@
				1644
				1645	/* { dg-regexp "\"kind\": \"error\"" } */
				1646	/* { dg-regexp "\"column-origin\": 1" } */
				1647	+/* { dg-regexp "\"escape-source\": false" } */
				1648	/* { dg-regexp "\"message\": \"#warning message\"" } */
				1649	/* { dg-regexp "\"option\": \"-Werror=cpp\"" } */
				1650	/* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wcpp\"" } */
				1651	diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c
				1652	--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-07-27 23:55:07.596291654 -0700
				1653	+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-4.c 2021-12-14 01:16:01.557942991 -0800
				1654	@@ -19,6 +19,7 @@ int test (void)
				1655
				1656	/* { dg-regexp "\"kind\": \"note\"" } */
				1657	/* { dg-regexp "\"message\": \"...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'\"" } */
				1658	+/* { dg-regexp "\"escape-source\": false" } */
				1659
				1660	/* { dg-regexp "\"caret\": \{" } */
				1661	/* { dg-regexp "\"file\": \"\[^\n\r\"\]*diagnostic-format-json-4.c\"" } */
				1662	@@ -39,6 +40,7 @@ int test (void)
				1663	/* { dg-regexp "\"kind\": \"warning\"" } */
				1664	/* { dg-regexp "\"column-origin\": 1" } */
				1665	/* { dg-regexp "\"message\": \"this 'if' clause does not guard...\"" } */
				1666	+/* { dg-regexp "\"escape-source\": false" } */
				1667	/* { dg-regexp "\"option\": \"-Wmisleading-indentation\"" } */
				1668	/* { dg-regexp "\"option_url\": \"https:\[^\n\r\"\]*#index-Wmisleading-indentation\"" } */
				1669
				1670	diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c
				1671	--- a/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-07-27 23:55:07.596291654 -0700
				1672	+++ b/gcc/testsuite/c-c++-common/diagnostic-format-json-5.c 2021-12-14 01:16:01.557942991 -0800
				1673	@@ -14,6 +14,7 @@ int test (struct s *ptr)
				1674
				1675	/* { dg-regexp "\"kind\": \"error\"" } */
				1676	/* { dg-regexp "\"column-origin\": 1" } */
				1677	+/* { dg-regexp "\"escape-source\": false" } */
				1678	/* { dg-regexp "\"message\": \".*\"" } */
				1679
				1680	/* Verify fix-it hints. */
				1681	diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c
				1682	--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 1969-12-31 16:00:00.000000000 -0800
				1683	+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-bytes.c 2021-12-14 01:16:01.557942991 -0800
				1684	@@ -0,0 +1,21 @@
				1685	+// { dg-do preprocess }
				1686	+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=bytes" }
				1687	+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */
				1688	+
				1689	+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e.
				1690	+ U+0F42 TIBETAN LETTER GA: ག
				1691	+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ
				1692	+
				1693	+ The UTF-8 encoding of U+0F43 TIBETAN LETTER GHA is: E0 BD 83. */
				1694	+
				1695	+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
				1696	+/* { dg-begin-multiline-output "" }
				1697	+ foo before_\u0F43_after bar
				1698	+ ^~~~~~~~~~~~~~~~~~~
				1699	+ { dg-end-multiline-output "" } */
				1700	+
				1701	+foo before_à½_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
				1702	+/* { dg-begin-multiline-output "" }
				1703	+ foo before_<e0><bd><83>_after bar
				1704	+ ^~~~~~~~~~~~~~~~~~~~~~~~~
				1705	+ { dg-end-multiline-output "" } */
				1706	diff --git a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c
				1707	--- a/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 1969-12-31 16:00:00.000000000 -0800
				1708	+++ b/gcc/testsuite/gcc.dg/cpp/warn-normalized-4-unicode.c 2021-12-14 01:16:01.557942991 -0800
				1709	@@ -0,0 +1,19 @@
				1710	+// { dg-do preprocess }
				1711	+// { dg-options "-std=gnu99 -Werror=normalized=nfc -fdiagnostics-show-caret -fdiagnostics-escape-format=unicode" }
				1712	+/* { dg-message "some warnings being treated as errors" "" {target "*-*-*"} 0 } */
				1713	+
				1714	+/* གྷ = U+0F43 TIBETAN LETTER GHA, which has decomposition "0F42 0FB7" i.e.
				1715	+ U+0F42 TIBETAN LETTER GA: ག
				1716	+ U+0FB7 TIBETAN SUBJOINED LETTER HA: ྷ */
				1717	+
				1718	+foo before_\u0F43_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
				1719	+/* { dg-begin-multiline-output "" }
				1720	+ foo before_\u0F43_after bar
				1721	+ ^~~~~~~~~~~~~~~~~~~
				1722	+ { dg-end-multiline-output "" } */
				1723	+
				1724	+foo before_à½_after bar // { dg-error "`before_.U00000f43_after' is not in NFC .-Werror=normalized=." }
				1725	+/* { dg-begin-multiline-output "" }
				1726	+ foo before_<U+0F43>_after bar
				1727	+ ^~~~~~~~~~~~~~~~~~~~~
				1728	+ { dg-end-multiline-output "" } */
				1729	diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90
				1730	--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-07-27 23:55:08.472303878 -0700
				1731	+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-1.F90 2021-12-14 01:16:01.557942991 -0800
				1732	@@ -9,6 +9,7 @@
				1733
				1734	! { dg-regexp "\"kind\": \"error\"" }
				1735	! { dg-regexp "\"column-origin\": 1" }
				1736	+! { dg-regexp "\"escape-source\": false" }
				1737	! { dg-regexp "\"message\": \"#error message\"" }
				1738
				1739	! { dg-regexp "\"caret\": \{" }
				1740	diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90
				1741	--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-07-27 23:55:08.472303878 -0700
				1742	+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-2.F90 2021-12-14 01:16:01.557942991 -0800
				1743	@@ -9,6 +9,7 @@
				1744
				1745	! { dg-regexp "\"kind\": \"warning\"" }
				1746	! { dg-regexp "\"column-origin\": 1" }
				1747	+! { dg-regexp "\"escape-source\": false" }
				1748	! { dg-regexp "\"message\": \"#warning message\"" }
				1749	! { dg-regexp "\"option\": \"-Wcpp\"" }
				1750	! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
				1751	diff --git a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90
				1752	--- a/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-07-27 23:55:08.472303878 -0700
				1753	+++ b/gcc/testsuite/gfortran.dg/diagnostic-format-json-3.F90 2021-12-14 01:16:01.557942991 -0800
				1754	@@ -9,6 +9,7 @@
				1755
				1756	! { dg-regexp "\"kind\": \"error\"" }
				1757	! { dg-regexp "\"column-origin\": 1" }
				1758	+! { dg-regexp "\"escape-source\": false" }
				1759	! { dg-regexp "\"message\": \"#warning message\"" }
				1760	! { dg-regexp "\"option\": \"-Werror=cpp\"" }
				1761	! { dg-regexp "\"option_url\": \"\[^\n\r\"\]*#index-Wcpp\"" }
				1762	diff --git a/libcpp/charset.c b/libcpp/charset.c
				1763	--- a/libcpp/charset.c 2021-07-27 23:55:08.712307227 -0700
				1764	+++ b/libcpp/charset.c 2021-12-14 01:16:01.557942991 -0800
				1765	@@ -1552,12 +1552,14 @@ convert_escape (cpp_reader *pfile, const
				1766	"unknown escape sequence: '\\%c'", (int) c);
				1767	else
				1768	{
				1769	+ encoding_rich_location rich_loc (pfile);
				1770	+
				1771	/* diagnostic.c does not support "%03o". When it does, this
				1772	code can use %03o directly in the diagnostic again. */
				1773	char buf[32];
				1774	sprintf(buf, "%03o", (int) c);
				1775	- cpp_error (pfile, CPP_DL_PEDWARN,
				1776	- "unknown escape sequence: '\\%s'", buf);
				1777	+ cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc,
				1778	+ "unknown escape sequence: '\\%s'", buf);
				1779	}
				1780	}
				1781
				1782	@@ -2280,14 +2282,16 @@ cpp_string_location_reader::get_next ()
				1783	}
				1784
				1785	cpp_display_width_computation::
				1786	-cpp_display_width_computation (const char *data, int data_length, int tabstop) :
				1787	+cpp_display_width_computation (const char *data, int data_length,
				1788	+ const cpp_char_column_policy &policy) :
				1789	m_begin (data),
				1790	m_next (m_begin),
				1791	m_bytes_left (data_length),
				1792	- m_tabstop (tabstop),
				1793	+ m_policy (policy),
				1794	m_display_cols (0)
				1795	{
				1796	- gcc_assert (m_tabstop > 0);
				1797	+ gcc_assert (policy.m_tabstop > 0);
				1798	+ gcc_assert (policy.m_width_cb);
				1799	}
				1800
				1801
				1802	@@ -2299,19 +2303,28 @@ cpp_display_width_computation (const cha
				1803	point to a valid UTF-8-encoded sequence, then it will be treated as a single
				1804	byte with display width 1. m_cur_display_col is the current display column,
				1805	relative to which tab stops should be expanded. Returns the display width of
				1806	- the codepoint just processed. */
				1807	+ the codepoint just processed.
				1808	+ If OUT is non-NULL, it is populated. */
				1809
				1810	int
				1811	-cpp_display_width_computation::process_next_codepoint ()
				1812	+cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out)
				1813	{
				1814	cppchar_t c;
				1815	int next_width;
				1816
				1817	+ if (out)
				1818	+ out->m_start_byte = m_next;
				1819	+
				1820	if (*m_next == '\t')
				1821	{
				1822	++m_next;
				1823	--m_bytes_left;
				1824	- next_width = m_tabstop - (m_display_cols % m_tabstop);
				1825	+ next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop);
				1826	+ if (out)
				1827	+ {
				1828	+ out->m_ch = '\t';
				1829	+ out->m_valid_ch = true;
				1830	+ }
				1831	}
				1832	else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c)
				1833	!= 0)
				1834	@@ -2321,14 +2334,24 @@ cpp_display_width_computation::process_n
				1835	of one. */
				1836	++m_next;
				1837	--m_bytes_left;
				1838	- next_width = 1;
				1839	+ next_width = m_policy.m_undecoded_byte_width;
				1840	+ if (out)
				1841	+ out->m_valid_ch = false;
				1842	}
				1843	else
				1844	{
				1845	/* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */
				1846	- next_width = cpp_wcwidth (c);
				1847	+ next_width = m_policy.m_width_cb (c);
				1848	+ if (out)
				1849	+ {
				1850	+ out->m_ch = c;
				1851	+ out->m_valid_ch = true;
				1852	+ }
				1853	}
				1854
				1855	+ if (out)
				1856	+ out->m_next_byte = m_next;
				1857	+
				1858	m_display_cols += next_width;
				1859	return next_width;
				1860	}
				1861	@@ -2344,7 +2367,7 @@ cpp_display_width_computation::advance_d
				1862	const int start = m_display_cols;
				1863	const int target = start + n;
				1864	while (m_display_cols < target && !done ())
				1865	- process_next_codepoint ();
				1866	+ process_next_codepoint (NULL);
				1867	return m_display_cols - start;
				1868	}
				1869
				1870	@@ -2352,29 +2375,33 @@ cpp_display_width_computation::advance_d
				1871	how many display columns are occupied by the first COLUMN bytes. COLUMN
				1872	may exceed DATA_LENGTH, in which case the phantom bytes at the end are
				1873	treated as if they have display width 1. Tabs are expanded to the next tab
				1874	- stop, relative to the start of DATA. */
				1875	+ stop, relative to the start of DATA, and non-printable-ASCII characters
				1876	+ will be escaped as per POLICY. */
				1877
				1878	int
				1879	cpp_byte_column_to_display_column (const char *data, int data_length,
				1880	- int column, int tabstop)
				1881	+ int column,
				1882	+ const cpp_char_column_policy &policy)
				1883	{
				1884	const int offset = MAX (0, column - data_length);
				1885	- cpp_display_width_computation dw (data, column - offset, tabstop);
				1886	+ cpp_display_width_computation dw (data, column - offset, policy);
				1887	while (!dw.done ())
				1888	- dw.process_next_codepoint ();
				1889	+ dw.process_next_codepoint (NULL);
				1890	return dw.display_cols_processed () + offset;
				1891	}
				1892
				1893	/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
				1894	the least number of bytes that will result in at least DISPLAY_COL display
				1895	columns. The return value may exceed DATA_LENGTH if the entire string does
				1896	- not occupy enough display columns. */
				1897	+ not occupy enough display columns. Non-printable-ASCII characters
				1898	+ will be escaped as per POLICY. */
				1899
				1900	int
				1901	cpp_display_column_to_byte_column (const char *data, int data_length,
				1902	- int display_col, int tabstop)
				1903	+ int display_col,
				1904	+ const cpp_char_column_policy &policy)
				1905	{
				1906	- cpp_display_width_computation dw (data, data_length, tabstop);
				1907	+ cpp_display_width_computation dw (data, data_length, policy);
				1908	const int avail_display = dw.advance_display_cols (display_col);
				1909	return dw.bytes_processed () + MAX (0, display_col - avail_display);
				1910	}
				1911	diff --git a/libcpp/errors.c b/libcpp/errors.c
				1912	--- a/libcpp/errors.c 2021-07-27 23:55:08.712307227 -0700
				1913	+++ b/libcpp/errors.c 2021-12-14 01:16:01.557942991 -0800
				1914	@@ -27,6 +27,31 @@ along with this program; see the file CO
				1915	#include "cpplib.h"
				1916	#include "internal.h"
				1917
				1918	+/* Get a location_t for the current location in PFILE,
				1919	+ generally that of the previously lexed token. */
				1920	+
				1921	+location_t
				1922	+cpp_diagnostic_get_current_location (cpp_reader *pfile)
				1923	+{
				1924	+ if (CPP_OPTION (pfile, traditional))
				1925	+ {
				1926	+ if (pfile->state.in_directive)
				1927	+ return pfile->directive_line;
				1928	+ else
				1929	+ return pfile->line_table->highest_line;
				1930	+ }
				1931	+ /* We don't want to refer to a token before the beginning of the
				1932	+ current run -- that is invalid. */
				1933	+ else if (pfile->cur_token == pfile->cur_run->base)
				1934	+ {
				1935	+ return 0;
				1936	+ }
				1937	+ else
				1938	+ {
				1939	+ return pfile->cur_token[-1].src_loc;
				1940	+ }
				1941	+}
				1942	+
				1943	/* Print a diagnostic at the given location. */
				1944
				1945	ATTRIBUTE_FPTR_PRINTF(5,0)
				1946	@@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum
				1947	enum cpp_warning_reason reason,
				1948	const char *msgid, va_list *ap)
				1949	{
				1950	- location_t src_loc;
				1951	-
				1952	- if (CPP_OPTION (pfile, traditional))
				1953	- {
				1954	- if (pfile->state.in_directive)
				1955	- src_loc = pfile->directive_line;
				1956	- else
				1957	- src_loc = pfile->line_table->highest_line;
				1958	- }
				1959	- /* We don't want to refer to a token before the beginning of the
				1960	- current run -- that is invalid. */
				1961	- else if (pfile->cur_token == pfile->cur_run->base)
				1962	- {
				1963	- src_loc = 0;
				1964	- }
				1965	- else
				1966	- {
				1967	- src_loc = pfile->cur_token[-1].src_loc;
				1968	- }
				1969	+ location_t src_loc = cpp_diagnostic_get_current_location (pfile);
				1970	rich_location richloc (pfile->line_table, src_loc);
				1971	return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap);
				1972	}
				1973	@@ -142,6 +149,43 @@ cpp_warning_syshdr (cpp_reader * pfile,
				1974
				1975	va_end (ap);
				1976	return ret;
				1977	+}
				1978	+
				1979	+/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */
				1980	+
				1981	+bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason,
				1982	+ rich_location *richloc, const char *msgid, ...)
				1983	+{
				1984	+ va_list ap;
				1985	+ bool ret;
				1986	+
				1987	+ va_start (ap, msgid);
				1988	+
				1989	+ ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc,
				1990	+ msgid, &ap);
				1991	+
				1992	+ va_end (ap);
				1993	+ return ret;
				1994	+
				1995	+}
				1996	+
				1997	+/* As cpp_pedwarning above, but use RICHLOC as the location of the
				1998	+ diagnostic. */
				1999	+
				2000	+bool
				2001	+cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason,
				2002	+ rich_location *richloc, const char *msgid, ...)
				2003	+{
				2004	+ va_list ap;
				2005	+ bool ret;
				2006	+
				2007	+ va_start (ap, msgid);
				2008	+
				2009	+ ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc,
				2010	+ msgid, &ap);
				2011	+
				2012	+ va_end (ap);
				2013	+ return ret;
				2014	}
				2015
				2016	/* Print a diagnostic at a specific location. */
				2017	diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
				2018	--- a/libcpp/include/cpplib.h 2021-12-13 23:23:05.768437079 -0800
				2019	+++ b/libcpp/include/cpplib.h 2021-12-14 01:20:16.189507386 -0800
				2020	@@ -1275,6 +1275,14 @@ extern bool cpp_warning_syshdr (cpp_read
				2021	const char *msgid, ...)
				2022	ATTRIBUTE_PRINTF_3;
				2023
				2024	+/* As their counterparts above, but use RICHLOC. */
				2025	+extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason,
				2026	+ rich_location *richloc, const char *msgid, ...)
				2027	+ ATTRIBUTE_PRINTF_4;
				2028	+extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason,
				2029	+ rich_location *richloc, const char *msgid, ...)
				2030	+ ATTRIBUTE_PRINTF_4;
				2031	+
				2032	/* Output a diagnostic with "MSGID: " preceding the
				2033	error string of errno. No location is printed. */
				2034	extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level,
				2035	@@ -1435,42 +1443,95 @@ extern const char * cpp_get_userdef_suff
				2036
				2037	/* In charset.c */
				2038
				2039	+/* The result of attempting to decode a run of UTF-8 bytes. */
				2040	+
				2041	+struct cpp_decoded_char
				2042	+{
				2043	+ const char *m_start_byte;
				2044	+ const char *m_next_byte;
				2045	+
				2046	+ bool m_valid_ch;
				2047	+ cppchar_t m_ch;
				2048	+};
				2049	+
				2050	+/* Information for mapping between code points and display columns.
				2051	+
				2052	+ This is a tabstop value, along with a callback for getting the
				2053	+ widths of characters. Normally this callback is cpp_wcwidth, but we
				2054	+ support other schemes for escaping non-ASCII unicode as a series of
				2055	+ ASCII chars when printing the user's source code in diagnostic-show-locus.c
				2056	+
				2057	+ For example, consider:
				2058	+ - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80)
				2059	+ - the Unicode character U+1F642 "SLIGHTLY SMILING FACE"
				2060	+ (UTF-8: 0xF0 0x9F 0x99 0x82)
				2061	+ - the byte 0xBF (a stray trailing byte of a UTF-8 character)
				2062	+ Normally U+03C0 would occupy one display column, U+1F642
				2063	+ would occupy two display columns, and the stray byte would be
				2064	+ printed verbatim as one display column.
				2065	+
				2066	+ However when escaping them as unicode code points as "<U+03C0>"
				2067	+ and "<U+1F642>" they occupy 8 and 9 display columns respectively,
				2068	+ and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>"
				2069	+ they occupy 8 and 16 display columns respectively. In both cases
				2070	+ the stray byte is escaped to <BF> as 4 display columns. */
				2071	+
				2072	+struct cpp_char_column_policy
				2073	+{
				2074	+ cpp_char_column_policy (int tabstop,
				2075	+ int (*width_cb) (cppchar_t c))
				2076	+ : m_tabstop (tabstop),
				2077	+ m_undecoded_byte_width (1),
				2078	+ m_width_cb (width_cb)
				2079	+ {}
				2080	+
				2081	+ int m_tabstop;
				2082	+ /* Width in display columns of a stray byte that isn't decodable
				2083	+ as UTF-8. */
				2084	+ int m_undecoded_byte_width;
				2085	+ int (*m_width_cb) (cppchar_t c);
				2086	+};
				2087	+
				2088	/* A class to manage the state while converting a UTF-8 sequence to cppchar_t
				2089	and computing the display width one character at a time. */
				2090	class cpp_display_width_computation {
				2091	public:
				2092	cpp_display_width_computation (const char *data, int data_length,
				2093	- int tabstop);
				2094	+ const cpp_char_column_policy &policy);
				2095	const char *next_byte () const { return m_next; }
				2096	int bytes_processed () const { return m_next - m_begin; }
				2097	int bytes_left () const { return m_bytes_left; }
				2098	bool done () const { return !bytes_left (); }
				2099	int display_cols_processed () const { return m_display_cols; }
				2100
				2101	- int process_next_codepoint ();
				2102	+ int process_next_codepoint (cpp_decoded_char *out);
				2103	int advance_display_cols (int n);
				2104
				2105	private:
				2106	const char *const m_begin;
				2107	const char *m_next;
				2108	size_t m_bytes_left;
				2109	- const int m_tabstop;
				2110	+ const cpp_char_column_policy &m_policy;
				2111	int m_display_cols;
				2112	};
				2113
				2114	/* Convenience functions that are simple use cases for class
				2115	cpp_display_width_computation. Tab characters will be expanded to spaces
				2116	- as determined by TABSTOP. */
				2117	+ as determined by POLICY.m_tabstop, and non-printable-ASCII characters
				2118	+ will be escaped as per POLICY. */
				2119	+
				2120	int cpp_byte_column_to_display_column (const char *data, int data_length,
				2121	- int column, int tabstop);
				2122	+ int column,
				2123	+ const cpp_char_column_policy &policy);
				2124	inline int cpp_display_width (const char *data, int data_length,
				2125	- int tabstop)
				2126	+ const cpp_char_column_policy &policy)
				2127	{
				2128	return cpp_byte_column_to_display_column (data, data_length, data_length,
				2129	- tabstop);
				2130	+ policy);
				2131	}
				2132	int cpp_display_column_to_byte_column (const char *data, int data_length,
				2133	- int display_col, int tabstop);
				2134	+ int display_col,
				2135	+ const cpp_char_column_policy &policy);
				2136	int cpp_wcwidth (cppchar_t c);
				2137
				2138	#endif /* ! LIBCPP_CPPLIB_H */
				2139	diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h
				2140	--- a/libcpp/include/line-map.h 2021-07-27 23:55:08.716307283 -0700
				2141	+++ b/libcpp/include/line-map.h 2021-12-14 01:16:01.557942991 -0800
				2142	@@ -1781,6 +1781,18 @@ class rich_location
				2143	const diagnostic_path *get_path () const { return m_path; }
				2144	void set_path (const diagnostic_path *path) { m_path = path; }
				2145
				2146	+ /* A flag for hinting that the diagnostic involves character encoding
				2147	+ issues, and thus that it will be helpful to the user if we show some
				2148	+ representation of how the characters in the pertinent source lines
				2149	+ are encoded.
				2150	+ The default is false (i.e. do not escape).
				2151	+ When set to true, non-ASCII bytes in the pertinent source lines will
				2152	+ be escaped in a manner controlled by the user-supplied option
				2153	+ -fdiagnostics-escape-format=, so that the user can better understand
				2154	+ what's going on with the encoding in their source file. */
				2155	+ bool escape_on_output_p () const { return m_escape_on_output; }
				2156	+ void set_escape_on_output (bool flag) { m_escape_on_output = flag; }
				2157	+
				2158	private:
				2159	bool reject_impossible_fixit (location_t where);
				2160	void stop_supporting_fixits ();
				2161	@@ -1807,6 +1819,7 @@ protected:
				2162	bool m_fixits_cannot_be_auto_applied;
				2163
				2164	const diagnostic_path *m_path;
				2165	+ bool m_escape_on_output;
				2166	};
				2167
				2168	/* A struct for the result of range_label::get_text: a NUL-terminated buffer
				2169	diff --git a/libcpp/internal.h b/libcpp/internal.h
				2170	--- a/libcpp/internal.h 2021-12-13 23:23:05.768437079 -0800
				2171	+++ b/libcpp/internal.h 2021-12-14 01:16:01.557942991 -0800
				2172	@@ -776,6 +776,9 @@ extern void _cpp_do_file_change (cpp_rea
				2173	extern void _cpp_pop_buffer (cpp_reader *);
				2174	extern char *_cpp_bracket_include (cpp_reader *);
				2175
				2176	+/* In errors.c */
				2177	+extern location_t cpp_diagnostic_get_current_location (cpp_reader *);
				2178	+
				2179	/* In traditional.c. */
				2180	extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool);
				2181	extern bool _cpp_read_logical_line_trad (cpp_reader *);
				2182	@@ -942,6 +945,26 @@ int linemap_get_expansion_line (class li
				2183	const char* linemap_get_expansion_filename (class line_maps *,
				2184	location_t);
				2185
				2186	+/* A subclass of rich_location for emitting a diagnostic
				2187	+ at the current location of the reader, but flagging
				2188	+ it with set_escape_on_output (true). */
				2189	+class encoding_rich_location : public rich_location
				2190	+{
				2191	+ public:
				2192	+ encoding_rich_location (cpp_reader *pfile)
				2193	+ : rich_location (pfile->line_table,
				2194	+ cpp_diagnostic_get_current_location (pfile))
				2195	+ {
				2196	+ set_escape_on_output (true);
				2197	+ }
				2198	+
				2199	+ encoding_rich_location (cpp_reader *pfile, location_t loc)
				2200	+ : rich_location (pfile->line_table, loc)
				2201	+ {
				2202	+ set_escape_on_output (true);
				2203	+ }
				2204	+};
				2205	+
				2206	#ifdef __cplusplus
				2207	}
				2208	#endif
				2209	diff --git a/libcpp/lex.c b/libcpp/lex.c
				2210	--- a/libcpp/lex.c 2021-12-14 01:14:48.435225968 -0800
				2211	+++ b/libcpp/lex.c 2021-12-14 01:24:37.220995816 -0800
				2212	@@ -1774,7 +1774,11 @@ skip_whitespace (cpp_reader *pfile, cppc
				2213	while (is_nvspace (c));
				2214
				2215	if (saw_NUL)
				2216	- cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
				2217	+ {
				2218	+ encoding_rich_location rich_loc (pfile);
				2219	+ cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
				2220	+ "null character(s) ignored");
				2221	+ }
				2222
				2223	buffer->cur--;
				2224	}
				2225	@@ -1803,6 +1807,28 @@ warn_about_normalization (cpp_reader *pf
				2226	if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
				2227	&& !pfile->state.skipping)
				2228	{
				2229	+ location_t loc = token->src_loc;
				2230	+
				2231	+ /* If possible, create a location range for the token. */
				2232	+ if (loc >= RESERVED_LOCATION_COUNT
				2233	+ && token->type != CPP_EOF
				2234	+ /* There must be no line notes to process. */
				2235	+ && (!(pfile->buffer->cur
				2236	+ >= pfile->buffer->notes[pfile->buffer->cur_note].pos
				2237	+ && !pfile->overlaid_buffer)))
				2238	+ {
				2239	+ source_range tok_range;
				2240	+ tok_range.m_start = loc;
				2241	+ tok_range.m_finish
				2242	+ = linemap_position_for_column (pfile->line_table,
				2243	+ CPP_BUF_COLUMN (pfile->buffer,
				2244	+ pfile->buffer->cur));
				2245	+ loc = COMBINE_LOCATION_DATA (pfile->line_table,
				2246	+ loc, tok_range, NULL);
				2247	+ }
				2248	+
				2249	+ encoding_rich_location rich_loc (pfile, loc);
				2250	+
				2251	/* Make sure that the token is printed using UCNs, even
				2252	if we'd otherwise happily print UTF-8. */
				2253	unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
				2254	@@ -1810,11 +1836,11 @@ warn_about_normalization (cpp_reader *pf
				2255
				2256	sz = cpp_spell_token (pfile, token, buf, false) - buf;
				2257	if (NORMALIZE_STATE_RESULT (s) == normalized_C)
				2258	- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
				2259	- "`%.*s' is not in NFKC", (int) sz, buf);
				2260	+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
				2261	+ "`%.*s' is not in NFKC", (int) sz, buf);
				2262	else
				2263	- cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
				2264	- "`%.*s' is not in NFC", (int) sz, buf);
				2265	+ cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
				2266	+ "`%.*s' is not in NFC", (int) sz, buf);
				2267	free (buf);
				2268	}
				2269	}
				2270	diff --git a/libcpp/line-map.c b/libcpp/line-map.c
				2271	--- a/libcpp/line-map.c 2021-07-27 23:55:08.716307283 -0700
				2272	+++ b/libcpp/line-map.c 2021-12-14 01:16:01.561942921 -0800
				2273	@@ -2086,7 +2086,8 @@ rich_location::rich_location (line_maps
				2274	m_fixit_hints (),
				2275	m_seen_impossible_fixit (false),
				2276	m_fixits_cannot_be_auto_applied (false),
				2277	- m_path (NULL)
				2278	+ m_path (NULL),
				2279	+ m_escape_on_output (false)
				2280	{
				2281	add_range (loc, SHOW_RANGE_WITH_CARET, label);
				2282	}