Andrew Geissler | 595f630 | 2022-01-24 19:11:47 +0000 | [diff] [blame] | 1 | From b3aa80b45c4f46029efeb204bb9f2d2c4278a0e5 Mon Sep 17 00:00:00 2001 |
| 2 | From: Nick Clifton <nickc@redhat.com> |
| 3 | Date: Tue, 9 Nov 2021 13:25:42 +0000 |
| 4 | Subject: [PATCH] Add --unicode option to control how unicode characters are |
| 5 | handled by display tools. |
| 6 | |
| 7 | * nm.c: Add --unicode option to control how unicode characters are |
| 8 | handled. |
| 9 | * objdump.c: Likewise. |
| 10 | * readelf.c: Likewise. |
| 11 | * strings.c: Likewise. |
| 12 | * binutils.texi: Document the new feature. |
| 13 | * NEWS: Document the new feature. |
| 14 | * testsuite/binutils-all/unicode.exp: New file. |
| 15 | * testsuite/binutils-all/nm.hex.unicode |
| 16 | * testsuite/binutils-all/strings.escape.unicode |
| 17 | * testsuite/binutils-all/objdump.highlight.unicode |
| 18 | * testsuite/binutils-all/readelf.invalid.unicode |
| 19 | |
| 20 | CVE: CVE-2021-42574 |
| 21 | Upstream-Status: Backport [https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=b3aa80b45c4f46029efeb204bb9f2d2c4278a0e5] |
| 22 | |
| 23 | RP: Added tweak uint -> unsigned int partial backport of |
| 24 | https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=795588aec4f894206863c938bd6d716895886009 |
| 25 | |
| 26 | Signed-off-by: pgowda <pgowda.cve@gmail.com> |
| 27 | --- |
| 28 | binutils/ChangeLog | 15 + |
| 29 | binutils/NEWS | 9 + |
| 30 | binutils/doc/binutils.texi | 78 ++++ |
| 31 | binutils/nm.c | 228 ++++++++++- |
| 32 | binutils/objdump.c | 235 ++++++++++-- |
| 33 | binutils/readelf.c | 190 +++++++++- |
| 34 | binutils/strings.c | 757 ++++++++++++++++++++++++++++++++++--- |
| 35 | 7 files changed, 1409 insertions(+), 103 deletions(-) |
| 36 | |
| 37 | diff --git a/binutils/ChangeLog b/binutils/ChangeLog |
| 38 | --- a/binutils/ChangeLog 2021-12-19 19:00:27.038540406 -0800 |
| 39 | +++ b/binutils/ChangeLog 2021-12-19 19:28:42.733565078 -0800 |
| 40 | @@ -1,3 +1,18 @@ |
| 41 | +2021-11-09 Nick Clifton <nickc@redhat.com> |
| 42 | + |
| 43 | + * nm.c: Add --unicode option to control how unicode characters are |
| 44 | + handled. |
| 45 | + * objdump.c: Likewise. |
| 46 | + * readelf.c: Likewise. |
| 47 | + * strings.c: Likewise. |
| 48 | + * binutils.texi: Document the new feature. |
| 49 | + * NEWS: Document the new feature. |
| 50 | + * testsuite/binutils-all/unicode.exp: New file. |
| 51 | + * testsuite/binutils-all/nm.hex.unicode |
| 52 | + * testsuite/binutils-all/strings.escape.unicode |
| 53 | + * testsuite/binutils-all/objdump.highlight.unicode |
| 54 | + * testsuite/binutils-all/readelf.invalid.unicode |
| 55 | + |
| 56 | 2021-07-16 Nick Clifton <nickc@redhat.com> |
| 57 | |
| 58 | * po/sv.po: Updated Swedish translation. |
| 59 | diff --git a/binutils/doc/binutils.texi b/binutils/doc/binutils.texi |
| 60 | --- a/binutils/doc/binutils.texi 2021-12-19 19:00:27.042540338 -0800 |
| 61 | +++ b/binutils/doc/binutils.texi 2021-12-19 19:27:56.526354667 -0800 |
| 62 | @@ -812,6 +812,7 @@ nm [@option{-A}|@option{-o}|@option{--pr |
| 63 | [@option{-s}|@option{--print-armap}] |
| 64 | [@option{-t} @var{radix}|@option{--radix=}@var{radix}] |
| 65 | [@option{-u}|@option{--undefined-only}] |
| 66 | + [@option{-U} @var{method}] [@option{--unicode=}@var{method}] |
| 67 | [@option{-V}|@option{--version}] |
| 68 | [@option{-X 32_64}] |
| 69 | [@option{--defined-only}] |
| 70 | @@ -1132,6 +1133,21 @@ Use @var{radix} as the radix for printin |
| 71 | @cindex undefined symbols |
| 72 | Display only undefined symbols (those external to each object file). |
| 73 | |
| 74 | +@item -U @var{[d|i|l|e|x|h]} |
| 75 | +@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]} |
| 76 | +Controls the display of UTF-8 encoded multibyte characters in strings. |
| 77 | +The default (@option{--unicode=default}) is to give them no special |
| 78 | +treatment. The @option{--unicode=locale} option displays the sequence |
| 79 | +in the current locale, which may or may not support them. The options |
| 80 | +@option{--unicode=hex} and @option{--unicode=invalid} display them as |
| 81 | +hex byte sequences enclosed by angle brackets or curly braces respectively. |
| 82 | + |
| 83 | +The @option{--unicode=escape} option displays them as escape sequences |
| 84 | +(@var{\uxxxx}) and the @option{--unicode=highlight} option displays |
| 85 | +them as escape sequences highlighted in red (if supported by the |
| 86 | +output device). The colouring is intended to draw attention to the |
| 87 | +presence of unicode sequences where they might not be expected. |
| 88 | + |
| 89 | @item -V |
| 90 | @itemx --version |
| 91 | Show the version number of @command{nm} and exit. |
| 92 | @@ -2247,6 +2263,7 @@ objdump [@option{-a}|@option{--archive-h |
| 93 | [@option{--prefix-strip=}@var{level}] |
| 94 | [@option{--insn-width=}@var{width}] |
| 95 | [@option{--visualize-jumps[=color|=extended-color|=off]} |
| 96 | + [@option{-U} @var{method}] [@option{--unicode=}@var{method}] |
| 97 | [@option{-V}|@option{--version}] |
| 98 | [@option{-H}|@option{--help}] |
| 99 | @var{objfile}@dots{} |
| 100 | @@ -2921,6 +2938,21 @@ When displaying symbols include those wh |
| 101 | special in some way and which would not normally be of interest to the |
| 102 | user. |
| 103 | |
| 104 | +@item -U @var{[d|i|l|e|x|h]} |
| 105 | +@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]} |
| 106 | +Controls the display of UTF-8 encoded multibyte characters in strings. |
| 107 | +The default (@option{--unicode=default}) is to give them no special |
| 108 | +treatment. The @option{--unicode=locale} option displays the sequence |
| 109 | +in the current locale, which may or may not support them. The options |
| 110 | +@option{--unicode=hex} and @option{--unicode=invalid} display them as |
| 111 | +hex byte sequences enclosed by angle brackets or curly braces respectively. |
| 112 | + |
| 113 | +The @option{--unicode=escape} option displays them as escape sequences |
| 114 | +(@var{\uxxxx}) and the @option{--unicode=highlight} option displays |
| 115 | +them as escape sequences highlighted in red (if supported by the |
| 116 | +output device). The colouring is intended to draw attention to the |
| 117 | +presence of unicode sequences where they might not be expected. |
| 118 | + |
| 119 | @item -V |
| 120 | @itemx --version |
| 121 | Print the version number of @command{objdump} and exit. |
| 122 | @@ -3197,6 +3229,7 @@ strings [@option{-afovV}] [@option{-}@va |
| 123 | [@option{-n} @var{min-len}] [@option{--bytes=}@var{min-len}] |
| 124 | [@option{-t} @var{radix}] [@option{--radix=}@var{radix}] |
| 125 | [@option{-e} @var{encoding}] [@option{--encoding=}@var{encoding}] |
| 126 | + [@option{-U} @var{method}] [@option{--unicode=}@var{method}] |
| 127 | [@option{-}] [@option{--all}] [@option{--print-file-name}] |
| 128 | [@option{-T} @var{bfdname}] [@option{--target=}@var{bfdname}] |
| 129 | [@option{-w}] [@option{--include-all-whitespace}] |
| 130 | @@ -3288,6 +3321,28 @@ single-8-bit-byte characters, @samp{b} = |
| 131 | littleendian. Useful for finding wide character strings. (@samp{l} |
| 132 | and @samp{b} apply to, for example, Unicode UTF-16/UCS-2 encodings). |
| 133 | |
| 134 | +@item -U @var{[d|i|l|e|x|h]} |
| 135 | +@itemx --unicode=@var{[default|invalid|locale|escape|hex|highlight]} |
| 136 | +Controls the display of UTF-8 encoded multibyte characters in strings. |
| 137 | +The default (@option{--unicode=default}) is to give them no special |
| 138 | +treatment, and instead rely upon the setting of the |
| 139 | +@option{--encoding} option. The other values for this option |
| 140 | +automatically enable @option{--encoding=S}. |
| 141 | + |
| 142 | +The @option{--unicode=invalid} option treats them as non-graphic |
| 143 | +characters and hence not part of a valid string. All the remaining |
| 144 | +options treat them as valid string characters. |
| 145 | + |
| 146 | +The @option{--unicode=locale} option displays them in the current |
| 147 | +locale, which may or may not support UTF-8 encoding. The |
| 148 | +@option{--unicode=hex} option displays them as hex byte sequences |
| 149 | +enclosed between @var{<>} characters. The @option{--unicode=escape} |
| 150 | +option displays them as escape sequences (@var{\uxxxx}) and the |
| 151 | +@option{--unicode=highlight} option displays them as escape sequences |
| 152 | +highlighted in red (if supported by the output device). The colouring |
| 153 | +is intended to draw attention to the presence of unicode sequences |
| 154 | +where they might not be expected. |
| 155 | + |
| 156 | @item -T @var{bfdname} |
| 157 | @itemx --target=@var{bfdname} |
| 158 | @cindex object code format |
| 159 | @@ -4796,6 +4851,7 @@ readelf [@option{-a}|@option{--all}] |
| 160 | [@option{--demangle@var{=style}}|@option{--no-demangle}] |
| 161 | [@option{--quiet}] |
| 162 | [@option{--recurse-limit}|@option{--no-recurse-limit}] |
| 163 | + [@option{-U} @var{method}|@option{--unicode=}@var{method}] |
| 164 | [@option{-n}|@option{--notes}] |
| 165 | [@option{-r}|@option{--relocs}] |
| 166 | [@option{-u}|@option{--unwind}] |
| 167 | @@ -4962,6 +5018,28 @@ necessary in order to demangle truly com |
| 168 | that if the recursion limit is disabled then stack exhaustion is |
| 169 | possible and any bug reports about such an event will be rejected. |
| 170 | |
| 171 | +@item -U @var{[d|i|l|e|x|h]} |
| 172 | +@itemx --unicode=[default|invalid|locale|escape|hex|highlight] |
| 173 | +Controls the display of non-ASCII characters in identifier names. |
| 174 | +The default (@option{--unicode=locale} or @option{--unicode=default}) is |
| 175 | +to treat them as multibyte characters and display them in the current |
| 176 | +locale. All other versions of this option treat the bytes as UTF-8 |
| 177 | +encoded values and attempt to interpret them. If they cannot be |
| 178 | +interpreted or if the @option{--unicode=invalid} option is used then |
| 179 | +they are displayed as a sequence of hex bytes, enclosed in curly |
| 180 | +braces. |
| 181 | + |
| 182 | +Using the @option{--unicode=escape} option will display the characters |
| 183 | +as unicode escape sequences (@var{\uxxxx}). Using the |
| 184 | +@option{--unicode=hex} option will display the characters as hex byte |
| 185 | +sequences enclosed between angle brackets. |
| 186 | + |
| 187 | +Using the @option{--unicode=highlight} option will display the characters as |
| 188 | +unicode escape sequences but it will also highlight them in red, |
| 189 | +assuming that colouring is supported by the output device. The |
| 190 | +colouring is intended to draw attention to the presence of unicode |
| 191 | +sequences when they might not be expected. |
| 192 | + |
| 193 | @item -e |
| 194 | @itemx --headers |
| 195 | Display all the headers in the file. Equivalent to @option{-h -l -S}. |
| 196 | diff --git a/binutils/NEWS b/binutils/NEWS |
| 197 | --- a/binutils/NEWS 2021-12-19 19:00:27.038540406 -0800 |
| 198 | +++ b/binutils/NEWS 2021-12-19 19:30:04.764162972 -0800 |
| 199 | @@ -1,5 +1,14 @@ |
| 200 | -*- text -*- |
| 201 | |
| 202 | +* Tools which display symbols or strings (readelf, strings, nm, objdump) |
| 203 | + have a new command line option which controls how unicode characters are |
| 204 | + handled. By default they are treated as normal for the tool. Using |
| 205 | + --unicode=locale will display them according to the current locale. |
| 206 | + Using --unicode=hex will display them as hex byte values, whilst |
| 207 | + --unicode=escape will display them as escape sequences. In addition |
| 208 | + using --unicode=highlight will display them as unicode escape sequences |
| 209 | + highlighted in red (if supported by the output device). |
| 210 | + |
| 211 | Changes in 2.37: |
| 212 | |
| 213 | * The readelf tool has a new command line option which can be used to specify |
| 214 | diff --git a/binutils/nm.c b/binutils/nm.c |
| 215 | --- a/binutils/nm.c 2021-12-19 19:00:27.046540270 -0800 |
| 216 | +++ b/binutils/nm.c 2021-12-19 19:36:34.797491555 -0800 |
| 217 | @@ -38,6 +38,11 @@ |
| 218 | #include "bucomm.h" |
| 219 | #include "plugin-api.h" |
| 220 | #include "plugin.h" |
| 221 | +#include "safe-ctype.h" |
| 222 | + |
| 223 | +#ifndef streq |
| 224 | +#define streq(a,b) (strcmp ((a),(b)) == 0) |
| 225 | +#endif |
| 226 | |
| 227 | /* When sorting by size, we use this structure to hold the size and a |
| 228 | pointer to the minisymbol. */ |
| 229 | @@ -216,6 +221,18 @@ static const char *plugin_target = NULL; |
| 230 | static bfd *lineno_cache_bfd; |
| 231 | static bfd *lineno_cache_rel_bfd; |
| 232 | |
| 233 | +typedef enum unicode_display_type |
| 234 | +{ |
| 235 | + unicode_default = 0, |
| 236 | + unicode_locale, |
| 237 | + unicode_escape, |
| 238 | + unicode_hex, |
| 239 | + unicode_highlight, |
| 240 | + unicode_invalid |
| 241 | +} unicode_display_type; |
| 242 | + |
| 243 | +static unicode_display_type unicode_display = unicode_default; |
| 244 | + |
| 245 | enum long_option_values |
| 246 | { |
| 247 | OPTION_TARGET = 200, |
| 248 | @@ -260,6 +277,7 @@ static struct option long_options[] = |
| 249 | {"target", required_argument, 0, OPTION_TARGET}, |
| 250 | {"defined-only", no_argument, &defined_only, 1}, |
| 251 | {"undefined-only", no_argument, &undefined_only, 1}, |
| 252 | + {"unicode", required_argument, NULL, 'U'}, |
| 253 | {"version", no_argument, &show_version, 1}, |
| 254 | {"with-symbol-versions", no_argument, &with_symbol_versions, 1}, |
| 255 | {"without-symbol-versions", no_argument, &with_symbol_versions, 0}, |
| 256 | @@ -313,6 +331,8 @@ usage (FILE *stream, int status) |
| 257 | -t, --radix=RADIX Use RADIX for printing symbol values\n\ |
| 258 | --target=BFDNAME Specify the target object format as BFDNAME\n\ |
| 259 | -u, --undefined-only Display only undefined symbols\n\ |
| 260 | + -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\ |
| 261 | + --unicode={default|show|invalid|hex|escape|highlight}\n\ |
| 262 | --with-symbol-versions Display version strings after symbol names\n\ |
| 263 | -X 32_64 (ignored)\n\ |
| 264 | @FILE Read options from FILE\n\ |
| 265 | @@ -432,6 +452,187 @@ get_coff_symbol_type (const struct inter |
| 266 | return bufp; |
| 267 | } |
| 268 | |
| 269 | +/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT. |
| 270 | + The conversion format is controlled by the unicode_display variable. |
| 271 | + Returns the number of characters added to OUT. |
| 272 | + Returns the number of bytes consumed from IN in CONSUMED. |
| 273 | + Always consumes at least one byte and displays at least one character. */ |
| 274 | + |
| 275 | +static unsigned int |
| 276 | +display_utf8 (const unsigned char * in, char * out, unsigned int * consumed) |
| 277 | +{ |
| 278 | + char * orig_out = out; |
| 279 | + unsigned int nchars = 0; |
| 280 | + unsigned int j; |
| 281 | + |
| 282 | + if (unicode_display == unicode_default) |
| 283 | + goto invalid; |
| 284 | + |
| 285 | + if (in[0] < 0xc0) |
| 286 | + goto invalid; |
| 287 | + |
| 288 | + if ((in[1] & 0xc0) != 0x80) |
| 289 | + goto invalid; |
| 290 | + |
| 291 | + if ((in[0] & 0x20) == 0) |
| 292 | + { |
| 293 | + nchars = 2; |
| 294 | + goto valid; |
| 295 | + } |
| 296 | + |
| 297 | + if ((in[2] & 0xc0) != 0x80) |
| 298 | + goto invalid; |
| 299 | + |
| 300 | + if ((in[0] & 0x10) == 0) |
| 301 | + { |
| 302 | + nchars = 3; |
| 303 | + goto valid; |
| 304 | + } |
| 305 | + |
| 306 | + if ((in[3] & 0xc0) != 0x80) |
| 307 | + goto invalid; |
| 308 | + |
| 309 | + nchars = 4; |
| 310 | + |
| 311 | + valid: |
| 312 | + switch (unicode_display) |
| 313 | + { |
| 314 | + case unicode_locale: |
| 315 | + /* Copy the bytes into the output buffer as is. */ |
| 316 | + memcpy (out, in, nchars); |
| 317 | + out += nchars; |
| 318 | + break; |
| 319 | + |
| 320 | + case unicode_invalid: |
| 321 | + case unicode_hex: |
| 322 | + out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{'); |
| 323 | + out += sprintf (out, "0x"); |
| 324 | + for (j = 0; j < nchars; j++) |
| 325 | + out += sprintf (out, "%02x", in [j]); |
| 326 | + out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}'); |
| 327 | + break; |
| 328 | + |
| 329 | + case unicode_highlight: |
| 330 | + if (isatty (1)) |
| 331 | + out += sprintf (out, "\x1B[31;47m"); /* Red. */ |
| 332 | + /* Fall through. */ |
| 333 | + case unicode_escape: |
| 334 | + switch (nchars) |
| 335 | + { |
| 336 | + case 2: |
| 337 | + out += sprintf (out, "\\u%02x%02x", |
| 338 | + ((in[0] & 0x1c) >> 2), |
| 339 | + ((in[0] & 0x03) << 6) | (in[1] & 0x3f)); |
| 340 | + break; |
| 341 | + |
| 342 | + case 3: |
| 343 | + out += sprintf (out, "\\u%02x%02x", |
| 344 | + ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2), |
| 345 | + ((in[1] & 0x03) << 6) | ((in[2] & 0x3f))); |
| 346 | + break; |
| 347 | + |
| 348 | + case 4: |
| 349 | + out += sprintf (out, "\\u%02x%02x%02x", |
| 350 | + ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2), |
| 351 | + ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2), |
| 352 | + ((in[2] & 0x03) << 6) | ((in[3] & 0x3f))); |
| 353 | + break; |
| 354 | + default: |
| 355 | + /* URG. */ |
| 356 | + break; |
| 357 | + } |
| 358 | + |
| 359 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 360 | + out += sprintf (out, "\033[0m"); /* Default colour. */ |
| 361 | + break; |
| 362 | + |
| 363 | + default: |
| 364 | + /* URG */ |
| 365 | + break; |
| 366 | + } |
| 367 | + |
| 368 | + * consumed = nchars; |
| 369 | + return out - orig_out; |
| 370 | + |
| 371 | + invalid: |
| 372 | + /* Not a valid UTF-8 sequence. */ |
| 373 | + *out = *in; |
| 374 | + * consumed = 1; |
| 375 | + return 1; |
| 376 | +} |
| 377 | + |
| 378 | +/* Convert any UTF-8 encoded characters in IN into the form specified by |
| 379 | + unicode_display. Also converts control characters. Returns a static |
| 380 | + buffer if conversion was necessary. |
| 381 | + Code stolen from objdump.c:sanitize_string(). */ |
| 382 | + |
| 383 | +static const char * |
| 384 | +convert_utf8 (const char * in) |
| 385 | +{ |
| 386 | + static char * buffer = NULL; |
| 387 | + static size_t buffer_len = 0; |
| 388 | + const char * original = in; |
| 389 | + char * out; |
| 390 | + |
| 391 | + /* Paranoia. */ |
| 392 | + if (in == NULL) |
| 393 | + return ""; |
| 394 | + |
| 395 | + /* See if any conversion is necessary. |
| 396 | + In the majority of cases it will not be needed. */ |
| 397 | + do |
| 398 | + { |
| 399 | + unsigned char c = *in++; |
| 400 | + |
| 401 | + if (c == 0) |
| 402 | + return original; |
| 403 | + |
| 404 | + if (ISCNTRL (c)) |
| 405 | + break; |
| 406 | + |
| 407 | + if (unicode_display != unicode_default && c >= 0xc0) |
| 408 | + break; |
| 409 | + } |
| 410 | + while (1); |
| 411 | + |
| 412 | + /* Copy the input, translating as needed. */ |
| 413 | + in = original; |
| 414 | + if (buffer_len < (strlen (in) * 9)) |
| 415 | + { |
| 416 | + free ((void *) buffer); |
| 417 | + buffer_len = strlen (in) * 9; |
| 418 | + buffer = xmalloc (buffer_len + 1); |
| 419 | + } |
| 420 | + |
| 421 | + out = buffer; |
| 422 | + do |
| 423 | + { |
| 424 | + unsigned char c = *in++; |
| 425 | + |
| 426 | + if (c == 0) |
| 427 | + break; |
| 428 | + |
| 429 | + if (ISCNTRL (c)) |
| 430 | + { |
| 431 | + *out++ = '^'; |
| 432 | + *out++ = c + 0x40; |
| 433 | + } |
| 434 | + else if (unicode_display != unicode_default && c >= 0xc0) |
| 435 | + { |
| 436 | + unsigned int num_consumed; |
| 437 | + |
| 438 | + out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed); |
| 439 | + in += num_consumed - 1; |
| 440 | + } |
| 441 | + else |
| 442 | + *out++ = c; |
| 443 | + } |
| 444 | + while (1); |
| 445 | + |
| 446 | + *out = 0; |
| 447 | + return buffer; |
| 448 | +} |
| 449 | + |
| 450 | /* Print symbol name NAME, read from ABFD, with printf format FORM, |
| 451 | demangling it if requested. */ |
| 452 | |
| 453 | @@ -444,6 +645,7 @@ print_symname (const char *form, struct |
| 454 | |
| 455 | if (name == NULL) |
| 456 | name = info->sinfo->name; |
| 457 | + |
| 458 | if (!with_symbol_versions |
| 459 | && bfd_get_flavour (abfd) == bfd_target_elf_flavour) |
| 460 | { |
| 461 | @@ -451,6 +653,7 @@ print_symname (const char *form, struct |
| 462 | if (atver) |
| 463 | *atver = 0; |
| 464 | } |
| 465 | + |
| 466 | if (do_demangle && *name) |
| 467 | { |
| 468 | alloc = bfd_demangle (abfd, name, demangle_flags); |
| 469 | @@ -458,6 +661,11 @@ print_symname (const char *form, struct |
| 470 | name = alloc; |
| 471 | } |
| 472 | |
| 473 | + if (unicode_display != unicode_default) |
| 474 | + { |
| 475 | + name = convert_utf8 (name); |
| 476 | + } |
| 477 | + |
| 478 | if (info != NULL && info->elfinfo && with_symbol_versions) |
| 479 | { |
| 480 | const char *version_string; |
| 481 | @@ -1807,7 +2015,7 @@ main (int argc, char **argv) |
| 482 | fatal (_("fatal error: libbfd ABI mismatch")); |
| 483 | set_default_bfd_target (); |
| 484 | |
| 485 | - while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uvVvX:", |
| 486 | + while ((c = getopt_long (argc, argv, "aABCDef:gHhjJlnopPrSst:uU:vVvX:", |
| 487 | long_options, (int *) 0)) != EOF) |
| 488 | { |
| 489 | switch (c) |
| 490 | @@ -1900,6 +2108,24 @@ main (int argc, char **argv) |
| 491 | case 'u': |
| 492 | undefined_only = 1; |
| 493 | break; |
| 494 | + |
| 495 | + case 'U': |
| 496 | + if (streq (optarg, "default") || streq (optarg, "d")) |
| 497 | + unicode_display = unicode_default; |
| 498 | + else if (streq (optarg, "locale") || streq (optarg, "l")) |
| 499 | + unicode_display = unicode_locale; |
| 500 | + else if (streq (optarg, "escape") || streq (optarg, "e")) |
| 501 | + unicode_display = unicode_escape; |
| 502 | + else if (streq (optarg, "invalid") || streq (optarg, "i")) |
| 503 | + unicode_display = unicode_invalid; |
| 504 | + else if (streq (optarg, "hex") || streq (optarg, "x")) |
| 505 | + unicode_display = unicode_hex; |
| 506 | + else if (streq (optarg, "highlight") || streq (optarg, "h")) |
| 507 | + unicode_display = unicode_highlight; |
| 508 | + else |
| 509 | + fatal (_("invalid argument to -U/--unicode: %s"), optarg); |
| 510 | + break; |
| 511 | + |
| 512 | case 'V': |
| 513 | show_version = 1; |
| 514 | break; |
| 515 | diff --git a/binutils/objdump.c b/binutils/objdump.c |
| 516 | --- a/binutils/objdump.c 2021-12-19 19:00:27.046540270 -0800 |
| 517 | +++ b/binutils/objdump.c 2021-12-19 19:43:09.438736729 -0800 |
| 518 | @@ -204,6 +204,18 @@ static const struct objdump_private_desc |
| 519 | |
| 520 | /* The list of detected jumps inside a function. */ |
| 521 | static struct jump_info *detected_jumps = NULL; |
| 522 | + |
| 523 | +typedef enum unicode_display_type |
| 524 | +{ |
| 525 | + unicode_default = 0, |
| 526 | + unicode_locale, |
| 527 | + unicode_escape, |
| 528 | + unicode_hex, |
| 529 | + unicode_highlight, |
| 530 | + unicode_invalid |
| 531 | +} unicode_display_type; |
| 532 | + |
| 533 | +static unicode_display_type unicode_display = unicode_default; |
| 534 | |
| 535 | static void usage (FILE *, int) ATTRIBUTE_NORETURN; |
| 536 | static void |
| 537 | @@ -330,6 +342,9 @@ usage (FILE *stream, int status) |
| 538 | fprintf (stream, _("\ |
| 539 | -w, --wide Format output for more than 80 columns\n")); |
| 540 | fprintf (stream, _("\ |
| 541 | + -U[d|l|i|x|e|h] Controls the display of UTF-8 unicode characters\n\ |
| 542 | + --unicode=[default|locale|invalid|hex|escape|highlight]\n")); |
| 543 | + fprintf (stream, _("\ |
| 544 | -z, --disassemble-zeroes Do not skip blocks of zeroes when disassembling\n")); |
| 545 | fprintf (stream, _("\ |
| 546 | --start-address=ADDR Only process data whose address is >= ADDR\n")); |
| 547 | @@ -420,17 +435,23 @@ static struct option long_options[]= |
| 548 | { |
| 549 | {"adjust-vma", required_argument, NULL, OPTION_ADJUST_VMA}, |
| 550 | {"all-headers", no_argument, NULL, 'x'}, |
| 551 | - {"private-headers", no_argument, NULL, 'p'}, |
| 552 | - {"private", required_argument, NULL, 'P'}, |
| 553 | {"architecture", required_argument, NULL, 'm'}, |
| 554 | {"archive-headers", no_argument, NULL, 'a'}, |
| 555 | +#ifdef ENABLE_LIBCTF |
| 556 | + {"ctf", required_argument, NULL, OPTION_CTF}, |
| 557 | + {"ctf-parent", required_argument, NULL, OPTION_CTF_PARENT}, |
| 558 | +#endif |
| 559 | {"debugging", no_argument, NULL, 'g'}, |
| 560 | {"debugging-tags", no_argument, NULL, 'e'}, |
| 561 | {"demangle", optional_argument, NULL, 'C'}, |
| 562 | {"disassemble", optional_argument, NULL, 'd'}, |
| 563 | {"disassemble-all", no_argument, NULL, 'D'}, |
| 564 | - {"disassembler-options", required_argument, NULL, 'M'}, |
| 565 | {"disassemble-zeroes", no_argument, NULL, 'z'}, |
| 566 | + {"disassembler-options", required_argument, NULL, 'M'}, |
| 567 | + {"dwarf", optional_argument, NULL, OPTION_DWARF}, |
| 568 | + {"dwarf-check", no_argument, 0, OPTION_DWARF_CHECK}, |
| 569 | + {"dwarf-depth", required_argument, 0, OPTION_DWARF_DEPTH}, |
| 570 | + {"dwarf-start", required_argument, 0, OPTION_DWARF_START}, |
| 571 | {"dynamic-reloc", no_argument, NULL, 'R'}, |
| 572 | {"dynamic-syms", no_argument, NULL, 'T'}, |
| 573 | {"endian", required_argument, NULL, OPTION_ENDIAN}, |
| 574 | @@ -440,16 +461,23 @@ static struct option long_options[]= |
| 575 | {"full-contents", no_argument, NULL, 's'}, |
| 576 | {"headers", no_argument, NULL, 'h'}, |
| 577 | {"help", no_argument, NULL, 'H'}, |
| 578 | + {"include", required_argument, NULL, 'I'}, |
| 579 | {"info", no_argument, NULL, 'i'}, |
| 580 | + {"inlines", no_argument, 0, OPTION_INLINES}, |
| 581 | + {"insn-width", required_argument, NULL, OPTION_INSN_WIDTH}, |
| 582 | {"line-numbers", no_argument, NULL, 'l'}, |
| 583 | - {"no-show-raw-insn", no_argument, &show_raw_insn, -1}, |
| 584 | {"no-addresses", no_argument, &no_addresses, 1}, |
| 585 | - {"process-links", no_argument, &process_links, true}, |
| 586 | + {"no-recurse-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT}, |
| 587 | + {"no-recursion-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT}, |
| 588 | + {"no-show-raw-insn", no_argument, &show_raw_insn, -1}, |
| 589 | + {"prefix", required_argument, NULL, OPTION_PREFIX}, |
| 590 | {"prefix-addresses", no_argument, &prefix_addresses, 1}, |
| 591 | + {"prefix-strip", required_argument, NULL, OPTION_PREFIX_STRIP}, |
| 592 | + {"private", required_argument, NULL, 'P'}, |
| 593 | + {"private-headers", no_argument, NULL, 'p'}, |
| 594 | + {"process-links", no_argument, &process_links, true}, |
| 595 | {"recurse-limit", no_argument, NULL, OPTION_RECURSE_LIMIT}, |
| 596 | {"recursion-limit", no_argument, NULL, OPTION_RECURSE_LIMIT}, |
| 597 | - {"no-recurse-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT}, |
| 598 | - {"no-recursion-limit", no_argument, NULL, OPTION_NO_RECURSE_LIMIT}, |
| 599 | {"reloc", no_argument, NULL, 'r'}, |
| 600 | {"section", required_argument, NULL, 'j'}, |
| 601 | {"section-headers", no_argument, NULL, 'h'}, |
| 602 | @@ -457,28 +485,16 @@ static struct option long_options[]= |
| 603 | {"source", no_argument, NULL, 'S'}, |
| 604 | {"source-comment", optional_argument, NULL, OPTION_SOURCE_COMMENT}, |
| 605 | {"special-syms", no_argument, &dump_special_syms, 1}, |
| 606 | - {"include", required_argument, NULL, 'I'}, |
| 607 | - {"dwarf", optional_argument, NULL, OPTION_DWARF}, |
| 608 | -#ifdef ENABLE_LIBCTF |
| 609 | - {"ctf", required_argument, NULL, OPTION_CTF}, |
| 610 | - {"ctf-parent", required_argument, NULL, OPTION_CTF_PARENT}, |
| 611 | -#endif |
| 612 | {"stabs", no_argument, NULL, 'G'}, |
| 613 | {"start-address", required_argument, NULL, OPTION_START_ADDRESS}, |
| 614 | {"stop-address", required_argument, NULL, OPTION_STOP_ADDRESS}, |
| 615 | {"syms", no_argument, NULL, 't'}, |
| 616 | {"target", required_argument, NULL, 'b'}, |
| 617 | + {"unicode", required_argument, NULL, 'U'}, |
| 618 | {"version", no_argument, NULL, 'V'}, |
| 619 | - {"wide", no_argument, NULL, 'w'}, |
| 620 | - {"prefix", required_argument, NULL, OPTION_PREFIX}, |
| 621 | - {"prefix-strip", required_argument, NULL, OPTION_PREFIX_STRIP}, |
| 622 | - {"insn-width", required_argument, NULL, OPTION_INSN_WIDTH}, |
| 623 | - {"dwarf-depth", required_argument, 0, OPTION_DWARF_DEPTH}, |
| 624 | - {"dwarf-start", required_argument, 0, OPTION_DWARF_START}, |
| 625 | - {"dwarf-check", no_argument, 0, OPTION_DWARF_CHECK}, |
| 626 | - {"inlines", no_argument, 0, OPTION_INLINES}, |
| 627 | {"visualize-jumps", optional_argument, 0, OPTION_VISUALIZE_JUMPS}, |
| 628 | - {0, no_argument, 0, 0} |
| 629 | + {"wide", no_argument, NULL, 'w'}, |
| 630 | + {NULL, no_argument, NULL, 0} |
| 631 | }; |
| 632 | |
| 633 | static void |
| 634 | @@ -488,9 +504,121 @@ nonfatal (const char *msg) |
| 635 | exit_status = 1; |
| 636 | } |
| 637 | |
| 638 | +/* Convert a potential UTF-8 encoded sequence in IN into characters in OUT. |
| 639 | + The conversion format is controlled by the unicode_display variable. |
| 640 | + Returns the number of characters added to OUT. |
| 641 | + Returns the number of bytes consumed from IN in CONSUMED. |
| 642 | + Always consumes at least one byte and displays at least one character. */ |
| 643 | + |
| 644 | +static unsigned int |
| 645 | +display_utf8 (const unsigned char * in, char * out, unsigned int * consumed) |
| 646 | +{ |
| 647 | + char * orig_out = out; |
| 648 | + unsigned int nchars = 0; |
| 649 | + unsigned int j; |
| 650 | + |
| 651 | + if (unicode_display == unicode_default) |
| 652 | + goto invalid; |
| 653 | + |
| 654 | + if (in[0] < 0xc0) |
| 655 | + goto invalid; |
| 656 | + |
| 657 | + if ((in[1] & 0xc0) != 0x80) |
| 658 | + goto invalid; |
| 659 | + |
| 660 | + if ((in[0] & 0x20) == 0) |
| 661 | + { |
| 662 | + nchars = 2; |
| 663 | + goto valid; |
| 664 | + } |
| 665 | + |
| 666 | + if ((in[2] & 0xc0) != 0x80) |
| 667 | + goto invalid; |
| 668 | + |
| 669 | + if ((in[0] & 0x10) == 0) |
| 670 | + { |
| 671 | + nchars = 3; |
| 672 | + goto valid; |
| 673 | + } |
| 674 | + |
| 675 | + if ((in[3] & 0xc0) != 0x80) |
| 676 | + goto invalid; |
| 677 | + |
| 678 | + nchars = 4; |
| 679 | + |
| 680 | + valid: |
| 681 | + switch (unicode_display) |
| 682 | + { |
| 683 | + case unicode_locale: |
| 684 | + /* Copy the bytes into the output buffer as is. */ |
| 685 | + memcpy (out, in, nchars); |
| 686 | + out += nchars; |
| 687 | + break; |
| 688 | + |
| 689 | + case unicode_invalid: |
| 690 | + case unicode_hex: |
| 691 | + out += sprintf (out, "%c", unicode_display == unicode_hex ? '<' : '{'); |
| 692 | + out += sprintf (out, "0x"); |
| 693 | + for (j = 0; j < nchars; j++) |
| 694 | + out += sprintf (out, "%02x", in [j]); |
| 695 | + out += sprintf (out, "%c", unicode_display == unicode_hex ? '>' : '}'); |
| 696 | + break; |
| 697 | + |
| 698 | + case unicode_highlight: |
| 699 | + if (isatty (1)) |
| 700 | + out += sprintf (out, "\x1B[31;47m"); /* Red. */ |
| 701 | + /* Fall through. */ |
| 702 | + case unicode_escape: |
| 703 | + switch (nchars) |
| 704 | + { |
| 705 | + case 2: |
| 706 | + out += sprintf (out, "\\u%02x%02x", |
| 707 | + ((in[0] & 0x1c) >> 2), |
| 708 | + ((in[0] & 0x03) << 6) | (in[1] & 0x3f)); |
| 709 | + break; |
| 710 | + |
| 711 | + case 3: |
| 712 | + out += sprintf (out, "\\u%02x%02x", |
| 713 | + ((in[0] & 0x0f) << 4) | ((in[1] & 0x3c) >> 2), |
| 714 | + ((in[1] & 0x03) << 6) | ((in[2] & 0x3f))); |
| 715 | + break; |
| 716 | + |
| 717 | + case 4: |
| 718 | + out += sprintf (out, "\\u%02x%02x%02x", |
| 719 | + ((in[0] & 0x07) << 6) | ((in[1] & 0x3c) >> 2), |
| 720 | + ((in[1] & 0x03) << 6) | ((in[2] & 0x3c) >> 2), |
| 721 | + ((in[2] & 0x03) << 6) | ((in[3] & 0x3f))); |
| 722 | + break; |
| 723 | + default: |
| 724 | + /* URG. */ |
| 725 | + break; |
| 726 | + } |
| 727 | + |
| 728 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 729 | + out += sprintf (out, "\033[0m"); /* Default colour. */ |
| 730 | + break; |
| 731 | + |
| 732 | + default: |
| 733 | + /* URG */ |
| 734 | + break; |
| 735 | + } |
| 736 | + |
| 737 | + * consumed = nchars; |
| 738 | + return out - orig_out; |
| 739 | + |
| 740 | + invalid: |
| 741 | + /* Not a valid UTF-8 sequence. */ |
| 742 | + *out = *in; |
| 743 | + * consumed = 1; |
| 744 | + return 1; |
| 745 | +} |
| 746 | + |
| 747 | /* Returns a version of IN with any control characters |
| 748 | replaced by escape sequences. Uses a static buffer |
| 749 | - if necessary. */ |
| 750 | + if necessary. |
| 751 | + |
| 752 | + If unicode display is enabled, then also handles the |
| 753 | + conversion of unicode characters. */ |
| 754 | |
| 755 | static const char * |
| 756 | sanitize_string (const char * in) |
| 757 | @@ -508,40 +636,50 @@ sanitize_string (const char * in) |
| 758 | of cases it will not be needed. */ |
| 759 | do |
| 760 | { |
| 761 | - char c = *in++; |
| 762 | + unsigned char c = *in++; |
| 763 | |
| 764 | if (c == 0) |
| 765 | return original; |
| 766 | |
| 767 | if (ISCNTRL (c)) |
| 768 | break; |
| 769 | + |
| 770 | + if (unicode_display != unicode_default && c >= 0xc0) |
| 771 | + break; |
| 772 | } |
| 773 | while (1); |
| 774 | |
| 775 | /* Copy the input, translating as needed. */ |
| 776 | in = original; |
| 777 | - if (buffer_len < (strlen (in) * 2)) |
| 778 | + if (buffer_len < (strlen (in) * 9)) |
| 779 | { |
| 780 | free ((void *) buffer); |
| 781 | - buffer_len = strlen (in) * 2; |
| 782 | + buffer_len = strlen (in) * 9; |
| 783 | buffer = xmalloc (buffer_len + 1); |
| 784 | } |
| 785 | |
| 786 | out = buffer; |
| 787 | do |
| 788 | { |
| 789 | - char c = *in++; |
| 790 | + unsigned char c = *in++; |
| 791 | |
| 792 | if (c == 0) |
| 793 | break; |
| 794 | |
| 795 | - if (!ISCNTRL (c)) |
| 796 | - *out++ = c; |
| 797 | - else |
| 798 | + if (ISCNTRL (c)) |
| 799 | { |
| 800 | *out++ = '^'; |
| 801 | *out++ = c + 0x40; |
| 802 | } |
| 803 | + else if (unicode_display != unicode_default && c >= 0xc0) |
| 804 | + { |
| 805 | + unsigned int num_consumed; |
| 806 | + |
| 807 | + out += display_utf8 ((const unsigned char *)(in - 1), out, & num_consumed); |
| 808 | + in += num_consumed - 1; |
| 809 | + } |
| 810 | + else |
| 811 | + *out++ = c; |
| 812 | } |
| 813 | while (1); |
| 814 | |
| 815 | @@ -4529,6 +4667,24 @@ dump_symbols (bfd *abfd ATTRIBUTE_UNUSED |
| 816 | free (alloc); |
| 817 | } |
| 818 | } |
| 819 | + else if (unicode_display != unicode_default |
| 820 | + && name != NULL && *name != '\0') |
| 821 | + { |
| 822 | + const char * sanitized_name; |
| 823 | + |
| 824 | + /* If we want to sanitize the name, we do it here, and |
| 825 | + temporarily clobber it while calling bfd_print_symbol. |
| 826 | + FIXME: This is a gross hack. */ |
| 827 | + sanitized_name = sanitize_string (name); |
| 828 | + if (sanitized_name != name) |
| 829 | + (*current)->name = sanitized_name; |
| 830 | + else |
| 831 | + sanitized_name = NULL; |
| 832 | + bfd_print_symbol (cur_bfd, stdout, *current, |
| 833 | + bfd_print_symbol_all); |
| 834 | + if (sanitized_name != NULL) |
| 835 | + (*current)->name = name; |
| 836 | + } |
| 837 | else |
| 838 | bfd_print_symbol (cur_bfd, stdout, *current, |
| 839 | bfd_print_symbol_all); |
| 840 | @@ -5212,7 +5368,7 @@ main (int argc, char **argv) |
| 841 | set_default_bfd_target (); |
| 842 | |
| 843 | while ((c = getopt_long (argc, argv, |
| 844 | - "pP:ib:m:M:VvCdDlfFaHhrRtTxsSI:j:wE:zgeGW::", |
| 845 | + "CDE:FGHI:LM:P:RSTU:VW::ab:defghij:lm:prstvwxz", |
| 846 | long_options, (int *) 0)) |
| 847 | != EOF) |
| 848 | { |
| 849 | @@ -5495,6 +5651,23 @@ main (int argc, char **argv) |
| 850 | seenflag = true; |
| 851 | break; |
| 852 | |
| 853 | + case 'U': |
| 854 | + if (streq (optarg, "default") || streq (optarg, "d")) |
| 855 | + unicode_display = unicode_default; |
| 856 | + else if (streq (optarg, "locale") || streq (optarg, "l")) |
| 857 | + unicode_display = unicode_locale; |
| 858 | + else if (streq (optarg, "escape") || streq (optarg, "e")) |
| 859 | + unicode_display = unicode_escape; |
| 860 | + else if (streq (optarg, "invalid") || streq (optarg, "i")) |
| 861 | + unicode_display = unicode_invalid; |
| 862 | + else if (streq (optarg, "hex") || streq (optarg, "x")) |
| 863 | + unicode_display = unicode_hex; |
| 864 | + else if (streq (optarg, "highlight") || streq (optarg, "h")) |
| 865 | + unicode_display = unicode_highlight; |
| 866 | + else |
| 867 | + fatal (_("invalid argument to -U/--unicode: %s"), optarg); |
| 868 | + break; |
| 869 | + |
| 870 | case 'H': |
| 871 | usage (stdout, 0); |
| 872 | /* No need to set seenflag or to break - usage() does not return. */ |
| 873 | diff --git a/binutils/readelf.c b/binutils/readelf.c |
| 874 | --- a/binutils/readelf.c 2021-12-19 19:00:27.058540065 -0800 |
| 875 | +++ b/binutils/readelf.c 2021-12-19 19:27:56.538354462 -0800 |
| 876 | @@ -328,6 +328,19 @@ typedef enum print_mode |
| 877 | } |
| 878 | print_mode; |
| 879 | |
| 880 | +typedef enum unicode_display_type |
| 881 | +{ |
| 882 | + unicode_default = 0, |
| 883 | + unicode_locale, |
| 884 | + unicode_escape, |
| 885 | + unicode_hex, |
| 886 | + unicode_highlight, |
| 887 | + unicode_invalid |
| 888 | +} unicode_display_type; |
| 889 | +/* How unicode characters are displayed; set by the -U/--unicode option. */ |
| 890 | +static unicode_display_type unicode_display = unicode_default; |
| 891 | + |
| 892 | + |
| 893 | /* Versioned symbol info. */ |
| 894 | enum versioned_symbol_info |
| 895 | { |
| 896 | @@ -632,11 +645,18 @@ print_symbol (signed int width, const ch |
| 897 | if (c == 0) |
| 898 | break; |
| 899 | |
| 900 | - /* Do not print control characters directly as they can affect terminal |
| 901 | - settings. Such characters usually appear in the names generated |
| 902 | - by the assembler for local labels. */ |
| 903 | - if (ISCNTRL (c)) |
| 904 | + if (ISPRINT (c)) |
| 905 | + { |
| 906 | + putchar (c); |
| 907 | + width_remaining --; |
| 908 | + num_printed ++; |
| 909 | + } |
| 910 | + else if (ISCNTRL (c)) |
| 911 | { |
| 912 | + /* Do not print control characters directly as they can affect terminal |
| 913 | + settings. Such characters usually appear in the names generated |
| 914 | + by the assembler for local labels. */ |
| 915 | + |
| 916 | if (width_remaining < 2) |
| 917 | break; |
| 918 | |
| 919 | @@ -644,11 +664,137 @@ print_symbol (signed int width, const ch |
| 920 | width_remaining -= 2; |
| 921 | num_printed += 2; |
| 922 | } |
| 923 | - else if (ISPRINT (c)) |
| 924 | + else if (c == 0x7f) |
| 925 | { |
| 926 | - putchar (c); |
| 927 | - width_remaining --; |
| 928 | - num_printed ++; |
| 929 | + if (width_remaining < 5) |
| 930 | + break; |
| 931 | + printf ("<DEL>"); |
| 932 | + width_remaining -= 5; |
| 933 | + num_printed += 5; |
| 934 | + } |
| 935 | + else if (unicode_display != unicode_locale |
| 936 | + && unicode_display != unicode_default) |
| 937 | + { |
| 938 | + /* Display unicode characters as something else. */ |
| 939 | + unsigned char bytes[4]; |
| 940 | + bool is_utf8; |
| 941 | + unsigned int nbytes; |
| 942 | + |
| 943 | + bytes[0] = c; |
| 944 | + |
| 945 | + if (bytes[0] < 0xc0) |
| 946 | + { |
| 947 | + nbytes = 1; |
| 948 | + is_utf8 = false; |
| 949 | + } |
| 950 | + else |
| 951 | + { |
| 952 | + bytes[1] = *symbol++; |
| 953 | + |
| 954 | + if ((bytes[1] & 0xc0) != 0x80) |
| 955 | + { |
| 956 | + is_utf8 = false; |
| 957 | + /* Do not consume this character. It may only |
| 958 | + be the first byte in the sequence that was |
| 959 | + corrupt. */ |
| 960 | + --symbol; |
| 961 | + nbytes = 1; |
| 962 | + } |
| 963 | + else if ((bytes[0] & 0x20) == 0) |
| 964 | + { |
| 965 | + is_utf8 = true; |
| 966 | + nbytes = 2; |
| 967 | + } |
| 968 | + else |
| 969 | + { |
| 970 | + bytes[2] = *symbol++; |
| 971 | + |
| 972 | + if ((bytes[2] & 0xc0) != 0x80) |
| 973 | + { |
| 974 | + is_utf8 = false; |
| 975 | + symbol -= 2; |
| 976 | + nbytes = 1; |
| 977 | + } |
| 978 | + else if ((bytes[0] & 0x10) == 0) |
| 979 | + { |
| 980 | + is_utf8 = true; |
| 981 | + nbytes = 3; |
| 982 | + } |
| 983 | + else |
| 984 | + { |
| 985 | + bytes[3] = *symbol++; |
| 986 | + |
| 987 | + nbytes = 4; |
| 988 | + |
| 989 | + if ((bytes[3] & 0xc0) != 0x80) |
| 990 | + { |
| 991 | + is_utf8 = false; |
| 992 | + symbol -= 3; |
| 993 | + nbytes = 1; |
| 994 | + } |
| 995 | + else |
| 996 | + is_utf8 = true; |
| 997 | + } |
| 998 | + } |
| 999 | + } |
| 1000 | + |
| 1001 | + if (unicode_display == unicode_invalid) |
| 1002 | + is_utf8 = false; |
| 1003 | + |
| 1004 | + if (unicode_display == unicode_hex || ! is_utf8) |
| 1005 | + { |
| 1006 | + unsigned int i; |
| 1007 | + |
| 1008 | + if (width_remaining < (nbytes * 2) + 2) |
| 1009 | + break; |
| 1010 | + |
| 1011 | + putchar (is_utf8 ? '<' : '{'); |
| 1012 | + printf ("0x"); |
| 1013 | + for (i = 0; i < nbytes; i++) |
| 1014 | + printf ("%02x", bytes[i]); |
| 1015 | + putchar (is_utf8 ? '>' : '}'); |
| 1016 | + } |
| 1017 | + else |
| 1018 | + { |
| 1019 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 1020 | + printf ("\x1B[31;47m"); /* Red. */ |
| 1021 | + |
| 1022 | + switch (nbytes) |
| 1023 | + { |
| 1024 | + case 2: |
| 1025 | + if (width_remaining < 6) |
| 1026 | + break; |
| 1027 | + printf ("\\u%02x%02x", |
| 1028 | + (bytes[0] & 0x1c) >> 2, |
| 1029 | + ((bytes[0] & 0x03) << 6) | (bytes[1] & 0x3f)); |
| 1030 | + break; |
| 1031 | + case 3: |
| 1032 | + if (width_remaining < 6) |
| 1033 | + break; |
| 1034 | + printf ("\\u%02x%02x", |
| 1035 | + ((bytes[0] & 0x0f) << 4) | ((bytes[1] & 0x3c) >> 2), |
| 1036 | + ((bytes[1] & 0x03) << 6) | (bytes[2] & 0x3f)); |
| 1037 | + break; |
| 1038 | + case 4: |
| 1039 | + if (width_remaining < 8) |
| 1040 | + break; |
| 1041 | + printf ("\\u%02x%02x%02x", |
| 1042 | + ((bytes[0] & 0x07) << 6) | ((bytes[1] & 0x3c) >> 2), |
| 1043 | + ((bytes[1] & 0x03) << 6) | ((bytes[2] & 0x3c) >> 2), |
| 1044 | + ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f)); |
| 1045 | + |
| 1046 | + break; |
| 1047 | + default: |
| 1048 | + /* URG. */ |
| 1049 | + break; |
| 1050 | + } |
| 1051 | + |
| 1052 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 1053 | + printf ("\033[0m"); /* Default colour. */ |
| 1054 | + } |
| 1055 | + |
| 1056 | + if (bytes[nbytes - 1] == 0) |
| 1057 | + break; |
| 1058 | } |
| 1059 | else |
| 1060 | { |
| 1061 | @@ -4668,6 +4814,7 @@ static struct option options[] = |
| 1062 | {"syms", no_argument, 0, 's'}, |
| 1063 | {"silent-truncation",no_argument, 0, 'T'}, |
| 1064 | {"section-details", no_argument, 0, 't'}, |
| 1065 | + {"unicode", required_argument, NULL, 'U'}, |
| 1066 | {"unwind", no_argument, 0, 'u'}, |
| 1067 | {"version-info", no_argument, 0, 'V'}, |
| 1068 | {"version", no_argument, 0, 'v'}, |
| 1069 | @@ -4744,6 +4891,12 @@ usage (FILE * stream) |
| 1070 | fprintf (stream, _("\ |
| 1071 | --no-recurse-limit Disable a demangling recursion limit\n")); |
| 1072 | fprintf (stream, _("\ |
| 1073 | + -U[dlexhi] --unicode=[default|locale|escape|hex|highlight|invalid]\n\ |
| 1074 | + Display unicode characters as determined by the current locale\n\ |
| 1075 | + (default), escape sequences, \"<hex sequences>\", highlighted\n\ |
| 1076 | + escape sequences, or treat them as invalid and display as\n\ |
| 1077 | + \"{hex sequences}\"\n")); |
| 1078 | + fprintf (stream, _("\ |
| 1079 | -n --notes Display the core notes (if present)\n")); |
| 1080 | fprintf (stream, _("\ |
| 1081 | -r --relocs Display the relocations (if present)\n")); |
| 1082 | @@ -4928,7 +5081,7 @@ parse_args (struct dump_data *dumpdata, |
| 1083 | usage (stderr); |
| 1084 | |
| 1085 | while ((c = getopt_long |
| 1086 | - (argc, argv, "ACDHILNPR:STVWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF) |
| 1087 | + (argc, argv, "ACDHILNPR:STU:VWacdeghi:lnp:rstuvw::x:z", options, NULL)) != EOF) |
| 1088 | { |
| 1089 | switch (c) |
| 1090 | { |
| 1091 | @@ -5130,6 +5283,25 @@ parse_args (struct dump_data *dumpdata, |
| 1092 | /* Ignored for backward compatibility. */ |
| 1093 | break; |
| 1094 | |
| 1095 | + case 'U': |
| 1096 | + if (optarg == NULL) |
| 1097 | + error (_("Missing arg to -U/--unicode")); /* Can this happen ? */ |
| 1098 | + else if (streq (optarg, "default") || streq (optarg, "d")) |
| 1099 | + unicode_display = unicode_default; |
| 1100 | + else if (streq (optarg, "locale") || streq (optarg, "l")) |
| 1101 | + unicode_display = unicode_locale; |
| 1102 | + else if (streq (optarg, "escape") || streq (optarg, "e")) |
| 1103 | + unicode_display = unicode_escape; |
| 1104 | + else if (streq (optarg, "invalid") || streq (optarg, "i")) |
| 1105 | + unicode_display = unicode_invalid; |
| 1106 | + else if (streq (optarg, "hex") || streq (optarg, "x")) |
| 1107 | + unicode_display = unicode_hex; |
| 1108 | + else if (streq (optarg, "highlight") || streq (optarg, "h")) |
| 1109 | + unicode_display = unicode_highlight; |
| 1110 | + else |
| 1111 | + error (_("invalid argument to -U/--unicode: %s"), optarg); |
| 1112 | + break; |
| 1113 | + |
| 1114 | case OPTION_SYM_BASE: |
| 1115 | sym_base = 0; |
| 1116 | if (optarg != NULL) |
| 1117 | diff --git a/binutils/strings.c b/binutils/strings.c |
| 1118 | --- a/binutils/strings.c 2021-12-19 19:00:27.058540065 -0800 |
| 1119 | +++ b/binutils/strings.c 2021-12-19 19:48:26.205313218 -0800 |
| 1120 | @@ -55,6 +55,19 @@ |
| 1121 | -T {bfdname} |
| 1122 | Specify a non-default object file format. |
| 1123 | |
| 1124 | + --unicode={default|locale|invalid|hex|escape|highlight} |
| 1125 | + -U {d|l|i|x|e|h} |
| 1126 | + Determine how to handle UTF-8 unicode characters. The default |
| 1127 | + is no special treatment. All other versions of this option |
| 1128 | + only apply if the encoding is valid and enabling the option |
| 1129 | + implies --encoding=S. |
| 1130 | + The 'locale' option displays the characters according to the |
| 1131 | + current locale. The 'invalid' option treats them as |
| 1132 | + non-string characters. The 'hex' option displays them as hex |
| 1133 | + byte sequences. The 'escape' option displays them as escape |
| 1134 | + sequences and the 'highlight' option displays them as |
| 1135 | + coloured escape sequences. |
| 1136 | + |
| 1137 | --output-separator=sep_string |
| 1138 | -s sep_string String used to separate parsed strings in output. |
| 1139 | Default is newline. |
| 1140 | @@ -76,6 +89,22 @@ |
| 1141 | #include "safe-ctype.h" |
| 1142 | #include "bucomm.h" |
| 1143 | |
| 1144 | +#ifndef streq |
| 1145 | +#define streq(a,b) (strcmp ((a),(b)) == 0) |
| 1146 | +#endif |
| 1147 | + |
| 1148 | +typedef enum unicode_display_type |
| 1149 | +{ |
| 1150 | + unicode_default = 0, |
| 1151 | + unicode_locale, |
| 1152 | + unicode_escape, |
| 1153 | + unicode_hex, |
| 1154 | + unicode_highlight, |
| 1155 | + unicode_invalid |
| 1156 | +} unicode_display_type; |
| 1157 | +/* How UTF-8 characters are displayed; set by the -U/--unicode option. */ |
| 1158 | +static unicode_display_type unicode_display = unicode_default; |
| 1159 | + |
| 1160 | #define STRING_ISGRAPHIC(c) \ |
| 1161 | ( (c) >= 0 \ |
| 1162 | && (c) <= 255 \ |
| 1163 | @@ -94,7 +123,7 @@ extern int errno; |
| 1164 | static int address_radix; |
| 1165 | |
| 1166 | /* Minimum length of sequence of graphic chars to trigger output. */ |
| 1167 | -static int string_min; |
| 1168 | +static unsigned int string_min; |
| 1169 | |
| 1170 | /* Whether or not we include all whitespace as a graphic char. */ |
| 1171 | static bool include_all_whitespace; |
| 1172 | @@ -121,21 +150,22 @@ static char *output_separator; |
| 1173 | static struct option long_options[] = |
| 1174 | { |
| 1175 | {"all", no_argument, NULL, 'a'}, |
| 1176 | + {"bytes", required_argument, NULL, 'n'}, |
| 1177 | {"data", no_argument, NULL, 'd'}, |
| 1178 | + {"encoding", required_argument, NULL, 'e'}, |
| 1179 | + {"help", no_argument, NULL, 'h'}, |
| 1180 | + {"include-all-whitespace", no_argument, NULL, 'w'}, |
| 1181 | + {"output-separator", required_argument, NULL, 's'}, |
| 1182 | {"print-file-name", no_argument, NULL, 'f'}, |
| 1183 | - {"bytes", required_argument, NULL, 'n'}, |
| 1184 | {"radix", required_argument, NULL, 't'}, |
| 1185 | - {"include-all-whitespace", no_argument, NULL, 'w'}, |
| 1186 | - {"encoding", required_argument, NULL, 'e'}, |
| 1187 | {"target", required_argument, NULL, 'T'}, |
| 1188 | - {"output-separator", required_argument, NULL, 's'}, |
| 1189 | - {"help", no_argument, NULL, 'h'}, |
| 1190 | + {"unicode", required_argument, NULL, 'U'}, |
| 1191 | {"version", no_argument, NULL, 'v'}, |
| 1192 | {NULL, 0, NULL, 0} |
| 1193 | }; |
| 1194 | |
| 1195 | static bool strings_file (char *); |
| 1196 | -static void print_strings (const char *, FILE *, file_ptr, int, int, char *); |
| 1197 | +static void print_strings (const char *, FILE *, file_ptr, int, char *); |
| 1198 | static void usage (FILE *, int) ATTRIBUTE_NORETURN; |
| 1199 | |
| 1200 | int main (int, char **); |
| 1201 | @@ -171,7 +201,7 @@ main (int argc, char **argv) |
| 1202 | encoding = 's'; |
| 1203 | output_separator = NULL; |
| 1204 | |
| 1205 | - while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:Vv0123456789", |
| 1206 | + while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789", |
| 1207 | long_options, (int *) 0)) != EOF) |
| 1208 | { |
| 1209 | switch (optc) |
| 1210 | @@ -244,6 +274,23 @@ main (int argc, char **argv) |
| 1211 | output_separator = optarg; |
| 1212 | break; |
| 1213 | |
| 1214 | + case 'U': |
| 1215 | + if (streq (optarg, "default") || streq (optarg, "d")) |
| 1216 | + unicode_display = unicode_default; |
| 1217 | + else if (streq (optarg, "locale") || streq (optarg, "l")) |
| 1218 | + unicode_display = unicode_locale; |
| 1219 | + else if (streq (optarg, "escape") || streq (optarg, "e")) |
| 1220 | + unicode_display = unicode_escape; |
| 1221 | + else if (streq (optarg, "invalid") || streq (optarg, "i")) |
| 1222 | + unicode_display = unicode_invalid; |
| 1223 | + else if (streq (optarg, "hex") || streq (optarg, "x")) |
| 1224 | + unicode_display = unicode_hex; |
| 1225 | + else if (streq (optarg, "highlight") || streq (optarg, "h")) |
| 1226 | + unicode_display = unicode_highlight; |
| 1227 | + else |
| 1228 | + fatal (_("invalid argument to -U/--unicode: %s"), optarg); |
| 1229 | + break; |
| 1230 | + |
| 1231 | case 'V': |
| 1232 | case 'v': |
| 1233 | print_version ("strings"); |
| 1234 | @@ -258,6 +305,9 @@ main (int argc, char **argv) |
| 1235 | } |
| 1236 | } |
| 1237 | |
| 1238 | + if (unicode_display != unicode_default) |
| 1239 | + encoding = 'S'; |
| 1240 | + |
| 1241 | if (numeric_opt != 0) |
| 1242 | { |
| 1243 | string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0); |
| 1244 | @@ -293,14 +343,14 @@ main (int argc, char **argv) |
| 1245 | { |
| 1246 | datasection_only = false; |
| 1247 | SET_BINARY (fileno (stdin)); |
| 1248 | - print_strings ("{standard input}", stdin, 0, 0, 0, (char *) NULL); |
| 1249 | + print_strings ("{standard input}", stdin, 0, 0, (char *) NULL); |
| 1250 | files_given = true; |
| 1251 | } |
| 1252 | else |
| 1253 | { |
| 1254 | for (; optind < argc; ++optind) |
| 1255 | { |
| 1256 | - if (strcmp (argv[optind], "-") == 0) |
| 1257 | + if (streq (argv[optind], "-")) |
| 1258 | datasection_only = false; |
| 1259 | else |
| 1260 | { |
| 1261 | @@ -342,7 +392,7 @@ strings_a_section (bfd *abfd, asection * |
| 1262 | } |
| 1263 | |
| 1264 | *got_a_section = true; |
| 1265 | - print_strings (filename, NULL, sect->filepos, 0, sectsize, (char *) mem); |
| 1266 | + print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem); |
| 1267 | free (mem); |
| 1268 | } |
| 1269 | |
| 1270 | @@ -427,7 +477,7 @@ strings_file (char *file) |
| 1271 | return false; |
| 1272 | } |
| 1273 | |
| 1274 | - print_strings (file, stream, (file_ptr) 0, 0, 0, (char *) 0); |
| 1275 | + print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL); |
| 1276 | |
| 1277 | if (fclose (stream) == EOF) |
| 1278 | { |
| 1279 | @@ -551,6 +601,626 @@ unget_part_char (long c, file_ptr *addre |
| 1280 | } |
| 1281 | } |
| 1282 | } |
| 1283 | + |
| 1284 | +static void |
| 1285 | +print_filename_and_address (const char * filename, file_ptr address) |
| 1286 | +{ |
| 1287 | + if (print_filenames) |
| 1288 | + printf ("%s: ", filename); |
| 1289 | + /* That was the optional filename prefix; the address prefix follows only if enabled. */ |
| 1290 | + if (! print_addresses) |
| 1291 | + return; |
| 1292 | + /* Use a conversion wide enough for file_ptr; MSVCRT spells long long as I64. */ |
| 1293 | + switch (address_radix) |
| 1294 | + { |
| 1295 | + case 8: |
| 1296 | + if (sizeof (address) > sizeof (long)) |
| 1297 | + { |
| 1298 | +#ifndef __MSVCRT__ |
| 1299 | + printf ("%7llo ", (unsigned long long) address); |
| 1300 | +#else |
| 1301 | + printf ("%7I64o ", (unsigned long long) address); |
| 1302 | +#endif |
| 1303 | + } |
| 1304 | + else |
| 1305 | + printf ("%7lo ", (unsigned long) address); |
| 1306 | + break; |
| 1307 | + |
| 1308 | + case 10: |
| 1309 | + if (sizeof (address) > sizeof (long)) |
| 1310 | + { |
| 1311 | +#ifndef __MSVCRT__ |
| 1312 | + printf ("%7llu ", (unsigned long long) address); |
| 1313 | +#else |
| 1314 | + printf ("%7I64d ", (unsigned long long) address); |
| 1315 | +#endif |
| 1316 | + } |
| 1317 | + else |
| 1318 | + printf ("%7ld ", (long) address); |
| 1319 | + break; |
| 1320 | + |
| 1321 | + case 16: |
| 1322 | + if (sizeof (address) > sizeof (long)) |
| 1323 | + { |
| 1324 | +#ifndef __MSVCRT__ |
| 1325 | + printf ("%7llx ", (unsigned long long) address); |
| 1326 | +#else |
| 1327 | + printf ("%7I64x ", (unsigned long long) address); |
| 1328 | +#endif |
| 1329 | + } |
| 1330 | + else |
| 1331 | + printf ("%7lx ", (unsigned long) address); |
| 1332 | + break; |
| 1333 | + } |
| 1334 | +} |
| 1335 | + |
| 1336 | +/* Return the length in bytes (2, 3 or 4) of the multi-byte UTF-8 sequence starting at BUFFER, |
| 1337 | + or 0 if BUFFER[0] is below 0xc0 or a needed continuation byte is malformed or beyond BUFLEN. */ |
| 1338 | + |
| 1339 | +static unsigned int |
| 1340 | +is_valid_utf8 (const unsigned char * buffer, unsigned long buflen) |
| 1341 | +{ |
| 1342 | + if (buffer[0] < 0xc0) |
| 1343 | + return 0; |
| 1344 | + |
| 1345 | + if (buflen < 2) |
| 1346 | + return 0; |
| 1347 | + |
| 1348 | + if ((buffer[1] & 0xc0) != 0x80) |
| 1349 | + return 0; |
| 1350 | + |
| 1351 | + if ((buffer[0] & 0x20) == 0) |
| 1352 | + return 2; |
| 1353 | + |
| 1354 | + if (buflen < 3) |
| 1355 | + return 0; |
| 1356 | + |
| 1357 | + if ((buffer[2] & 0xc0) != 0x80) |
| 1358 | + return 0; |
| 1359 | + |
| 1360 | + if ((buffer[0] & 0x10) == 0) |
| 1361 | + return 3; |
| 1362 | + |
| 1363 | + if (buflen < 4) |
| 1364 | + return 0; |
| 1365 | + |
| 1366 | + if ((buffer[3] & 0xc0) != 0x80) |
| 1367 | + return 0; |
| 1368 | + |
| 1369 | + return 4; |
| 1370 | +} |
| 1371 | + |
| 1372 | +/* Display the UTF-8 encoded character at BUFFER according to the setting |
| 1373 | + of unicode_display, which must not be unicode_default or unicode_invalid. |
| 1374 | + The sequence is known to be valid. Returns the number of bytes consumed. */ |
| 1375 | + |
| 1376 | +static unsigned int |
| 1377 | +display_utf8_char (const unsigned char * buffer) |
| 1378 | +{ |
| 1379 | + unsigned int j; |
| 1380 | + unsigned int utf8_len; |
| 1381 | + /* The 0x20 and 0x10 bits of the lead byte give the sequence length. */ |
| 1382 | + switch (buffer[0] & 0x30) |
| 1383 | + { |
| 1384 | + case 0x00: |
| 1385 | + case 0x10: |
| 1386 | + utf8_len = 2; |
| 1387 | + break; |
| 1388 | + case 0x20: |
| 1389 | + utf8_len = 3; |
| 1390 | + break; |
| 1391 | + default: |
| 1392 | + utf8_len = 4; |
| 1393 | + } |
| 1394 | + |
| 1395 | + switch (unicode_display) |
| 1396 | + { |
| 1397 | + default: |
| 1398 | + fprintf (stderr, "ICE: unexpected unicode display type\n"); |
| 1399 | + break; |
| 1400 | + |
| 1401 | + case unicode_escape: |
| 1402 | + case unicode_highlight: |
| 1403 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 1404 | + printf ("\x1B[31;47m"); /* Red. */ |
| 1405 | + |
| 1406 | + switch (utf8_len) |
| 1407 | + { |
| 1408 | + case 2: |
| 1409 | + printf ("\\u%02x%02x", |
| 1410 | + ((buffer[0] & 0x1c) >> 2), |
| 1411 | + ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f)); |
| 1412 | + break; |
| 1413 | + |
| 1414 | + case 3: |
| 1415 | + printf ("\\u%02x%02x", |
| 1416 | + ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2), |
| 1417 | + ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f))); |
| 1418 | + break; |
| 1419 | + /* 21 payload bits (3 + 6 + 6 + 6), printed as 5 + 8 + 8 bits. */ |
| 1420 | + case 4: |
| 1421 | + printf ("\\u%02x%02x%02x", |
| 1422 | + ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4), |
| 1423 | + ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2), |
| 1424 | + ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f))); |
| 1425 | + break; |
| 1426 | + default: |
| 1427 | + /* Not reached: utf8_len is always 2, 3 or 4. */ |
| 1428 | + break; |
| 1429 | + } |
| 1430 | + |
| 1431 | + if (unicode_display == unicode_highlight && isatty (1)) |
| 1432 | + printf ("\033[0m"); /* Default colour. */ |
| 1433 | + break; |
| 1434 | + |
| 1435 | + case unicode_hex: |
| 1436 | + putchar ('<'); |
| 1437 | + printf ("0x"); |
| 1438 | + for (j = 0; j < utf8_len; j++) |
| 1439 | + printf ("%02x", buffer [j]); |
| 1440 | + putchar ('>'); |
| 1441 | + break; |
| 1442 | + /* Emit every byte of the sequence unchanged, not just the lead byte. */ |
| 1443 | + case unicode_locale: |
| 1444 | + printf ("%.*s", (int) utf8_len, (const char *) buffer); |
| 1445 | + break; |
| 1446 | + } |
| 1447 | + |
| 1448 | + return utf8_len; |
| 1449 | +} |
| 1450 | + |
| 1451 | +/* Display the strings found in the BUFLEN bytes at BUFFER, rendering any |
| 1452 | + UTF-8 encoded characters according to the unicode_display setting. |
| 1453 | + Only runs of at least string_min characters are displayed, each followed |
| 1454 | + by output_separator if that is set, otherwise by a newline. |
| 1455 | + The strings are reported as if BUFFER started at ADDRESS and is contained |
| 1456 | + in FILENAME. */ |
| 1457 | + |
| 1458 | +static void |
| 1459 | +print_unicode_buffer (const char * filename, |
| 1460 | + file_ptr address, |
| 1461 | + const unsigned char * buffer, |
| 1462 | + unsigned long buflen) |
| 1463 | +{ |
| 1464 | + /* Paranoia checks... */ |
| 1465 | + if (filename == NULL |
| 1466 | + || buffer == NULL |
| 1467 | + || unicode_display == unicode_default |
| 1468 | + || encoding != 'S' |
| 1469 | + || encoding_bytes != 1) |
| 1470 | + { |
| 1471 | + fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n"); |
| 1472 | + return; |
| 1473 | + } |
| 1474 | + |
| 1475 | + if (buflen == 0) |
| 1476 | + return; |
| 1477 | + |
| 1478 | + /* We must only display strings that are at least string_min *characters* |
| 1479 | + long. So we scan the buffer in two stages. First we locate the start |
| 1480 | + of a potential string. Then we walk along it until we have found |
| 1481 | + string_min characters. Then we go back to the start point and start |
| 1482 | + displaying characters according to the unicode_display setting. */ |
| 1483 | + |
| 1484 | + unsigned long start_point = 0; |
| 1485 | + unsigned long i = 0; |
| 1486 | + unsigned int char_len = 1; |
| 1487 | + unsigned int num_found = 0; |
| 1488 | + |
| 1489 | + for (i = 0; i < buflen; i += char_len) |
| 1490 | + { |
| 1491 | + int c = buffer[i]; |
| 1492 | + |
| 1493 | + char_len = 1; |
| 1494 | + |
| 1495 | + /* Find the first potential character of a string. */ |
| 1496 | + if (! STRING_ISGRAPHIC (c)) |
| 1497 | + { |
| 1498 | + num_found = 0; |
| 1499 | + continue; |
| 1500 | + } |
| 1501 | + |
| 1502 | + if (c > 126) |
| 1503 | + { |
| 1504 | + if (c < 0xc0) |
| 1505 | + { |
| 1506 | + num_found = 0; |
| 1507 | + continue; |
| 1508 | + } |
| 1509 | + |
| 1510 | + if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0) |
| 1511 | + { |
| 1512 | + char_len = 1; |
| 1513 | + num_found = 0; |
| 1514 | + continue; |
| 1515 | + } |
| 1516 | + |
| 1517 | + if (unicode_display == unicode_invalid) |
| 1518 | + { |
| 1519 | + /* We have found a valid UTF-8 character, but we treat it as non-graphic. */ |
| 1520 | + num_found = 0; |
| 1521 | + continue; |
| 1522 | + } |
| 1523 | + } |
| 1524 | + |
| 1525 | + if (num_found == 0) |
| 1526 | + /* We have found a potential starting point for a string. */ |
| 1527 | + start_point = i; |
| 1528 | + |
| 1529 | + ++ num_found; |
| 1530 | + |
| 1531 | + if (num_found >= string_min) |
| 1532 | + break; |
| 1533 | + } |
| 1534 | + |
| 1535 | + if (num_found < string_min) |
| 1536 | + return; |
| 1537 | + |
| 1538 | + print_filename_and_address (filename, address + start_point); |
| 1539 | + |
| 1540 | + /* We have found string_min characters. Display them and any |
| 1541 | + more that follow. */ |
| 1542 | + for (i = start_point; i < buflen; i += char_len) |
| 1543 | + { |
| 1544 | + int c = buffer[i]; |
| 1545 | + |
| 1546 | + char_len = 1; |
| 1547 | + |
| 1548 | + if (! STRING_ISGRAPHIC (c)) |
| 1549 | + break; |
| 1550 | + else if (c < 127) |
| 1551 | + putchar (c); |
| 1552 | + else if (! is_valid_utf8 (buffer + i, buflen - i)) |
| 1553 | + break; |
| 1554 | + else if (unicode_display == unicode_invalid) |
| 1555 | + break; |
| 1556 | + else |
| 1557 | + char_len = display_utf8_char (buffer + i); |
| 1558 | + } |
| 1559 | + |
| 1560 | + if (output_separator) |
| 1561 | + fputs (output_separator, stdout); |
| 1562 | + else |
| 1563 | + putchar ('\n'); |
| 1564 | + |
| 1565 | + /* Scan the rest of the buffer. FIXME: Using tail recursion here is lazy programming... */ |
| 1566 | + print_unicode_buffer (filename, address + i, buffer + i, buflen - i); |
| 1567 | +} |
| 1568 | + |
| 1569 | +static int |
| 1570 | +get_unicode_byte (FILE * stream, |
| 1571 | + unsigned char * putback, |
| 1572 | + unsigned int * num_putback, |
| 1573 | + unsigned int * num_read) |
| 1574 | +{ |
| 1575 | + if (* num_putback > 0) |
| 1576 | + { |
| 1577 | + * num_putback = * num_putback - 1; |
| 1578 | + return putback [* num_putback]; |
| 1579 | + } |
| 1580 | + /* No put-back bytes are pending: count a fresh read (even if it yields EOF). */ |
| 1581 | + * num_read = * num_read + 1; |
| 1582 | + |
| 1583 | +#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED |
| 1584 | + return getc_unlocked (stream); |
| 1585 | +#else |
| 1586 | + return getc (stream); |
| 1587 | +#endif |
| 1588 | +} |
| 1589 | + |
| 1590 | +/* Helper function for print_unicode_stream. */ |
| 1591 | + |
| 1592 | +static void |
| 1593 | +print_unicode_stream_body (const char * filename, |
| 1594 | + file_ptr address, |
| 1595 | + FILE * stream, |
| 1596 | + unsigned char * putback_buf, |
| 1597 | + unsigned int num_putback, |
| 1598 | + unsigned char * print_buf) |
| 1599 | +{ |
| 1600 | + /* It would be nice if we could just read the stream into a buffer |
| 1601 | + and then process it with print_unicode_buffer. But the input |
| 1602 | + might be huge or it might be time-locked (eg stdin). So instead |
| 1603 | + we go one byte at a time... */ |
| 1604 | + |
| 1605 | + file_ptr start_point = 0; |
| 1606 | + unsigned int num_read = 0; |
| 1607 | + unsigned int num_chars = 0; |
| 1608 | + unsigned int num_print = 0; |
| 1609 | + int c = 0; |
| 1610 | + |
| 1611 | + /* Find a series of string_min characters. Put them into print_buf. */ |
| 1612 | + do |
| 1613 | + { |
| 1614 | + if (num_chars >= string_min) |
| 1615 | + break; |
| 1616 | + |
| 1617 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1618 | + if (c == EOF) |
| 1619 | + break; |
| 1620 | + |
| 1621 | + if (! STRING_ISGRAPHIC (c)) |
| 1622 | + { |
| 1623 | + num_chars = num_print = 0; |
| 1624 | + continue; |
| 1625 | + } |
| 1626 | + |
| 1627 | + if (num_chars == 0) |
| 1628 | + start_point = num_read - 1; |
| 1629 | + |
| 1630 | + if (c < 127) |
| 1631 | + { |
| 1632 | + print_buf[num_print] = c; |
| 1633 | + num_chars ++; |
| 1634 | + num_print ++; |
| 1635 | + continue; |
| 1636 | + } |
| 1637 | + |
| 1638 | + if (c < 0xc0) |
| 1639 | + { |
| 1640 | + num_chars = num_print = 0; |
| 1641 | + continue; |
| 1642 | + } |
| 1643 | + |
| 1644 | + /* We *might* have a UTF-8 sequence. Time to start peeking. */ |
| 1645 | + char utf8[4]; |
| 1646 | + |
| 1647 | + utf8[0] = c; |
| 1648 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1649 | + if (c == EOF) |
| 1650 | + break; |
| 1651 | + utf8[1] = c; |
| 1652 | + |
| 1653 | + if ((utf8[1] & 0xc0) != 0x80) |
| 1654 | + { |
| 1655 | + /* Invalid UTF-8. */ |
| 1656 | + putback_buf[num_putback++] = utf8[1]; |
| 1657 | + num_chars = num_print = 0; |
| 1658 | + continue; |
| 1659 | + } |
| 1660 | + else if ((utf8[0] & 0x20) == 0) |
| 1661 | + { |
| 1662 | + /* A valid 2-byte UTF-8 encoding. */ |
| 1663 | + if (unicode_display == unicode_invalid) |
| 1664 | + { |
| 1665 | + putback_buf[num_putback++] = utf8[1]; |
| 1666 | + num_chars = num_print = 0; |
| 1667 | + } |
| 1668 | + else |
| 1669 | + { |
| 1670 | + print_buf[num_print ++] = utf8[0]; |
| 1671 | + print_buf[num_print ++] = utf8[1]; |
| 1672 | + num_chars ++; |
| 1673 | + } |
| 1674 | + continue; |
| 1675 | + } |
| 1676 | + |
| 1677 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1678 | + if (c == EOF) |
| 1679 | + break; |
| 1680 | + utf8[2] = c; |
| 1681 | + |
| 1682 | + if ((utf8[2] & 0xc0) != 0x80) |
| 1683 | + { |
| 1684 | + /* Invalid UTF-8. */ |
| 1685 | + putback_buf[num_putback++] = utf8[2]; |
| 1686 | + putback_buf[num_putback++] = utf8[1]; |
| 1687 | + num_chars = num_print = 0; |
| 1688 | + continue; |
| 1689 | + } |
| 1690 | + else if ((utf8[0] & 0x10) == 0) |
| 1691 | + { |
| 1692 | + /* A valid 3-byte UTF-8 encoding. */ |
| 1693 | + if (unicode_display == unicode_invalid) |
| 1694 | + { |
| 1695 | + putback_buf[num_putback++] = utf8[2]; |
| 1696 | + putback_buf[num_putback++] = utf8[1]; |
| 1697 | + num_chars = num_print = 0; |
| 1698 | + } |
| 1699 | + else |
| 1700 | + { |
| 1701 | + print_buf[num_print ++] = utf8[0]; |
| 1702 | + print_buf[num_print ++] = utf8[1]; |
| 1703 | + print_buf[num_print ++] = utf8[2]; |
| 1704 | + num_chars ++; |
| 1705 | + } |
| 1706 | + continue; |
| 1707 | + } |
| 1708 | + |
| 1709 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1710 | + if (c == EOF) |
| 1711 | + break; |
| 1712 | + utf8[3] = c; |
| 1713 | + |
| 1714 | + if ((utf8[3] & 0xc0) != 0x80) |
| 1715 | + { |
| 1716 | + /* Invalid UTF-8. */ |
| 1717 | + putback_buf[num_putback++] = utf8[3]; |
| 1718 | + putback_buf[num_putback++] = utf8[2]; |
| 1719 | + putback_buf[num_putback++] = utf8[1]; |
| 1720 | + num_chars = num_print = 0; |
| 1721 | + } |
| 1722 | + /* We have a valid 4-byte UTF-8 encoding. */ |
| 1723 | + else if (unicode_display == unicode_invalid) |
| 1724 | + { |
| 1725 | + putback_buf[num_putback++] = utf8[3]; |
| 1726 | + putback_buf[num_putback++] = utf8[1]; |
| 1727 | + putback_buf[num_putback++] = utf8[2]; |
| 1728 | + num_chars = num_print = 0; |
| 1729 | + } |
| 1730 | + else |
| 1731 | + { |
| 1732 | + print_buf[num_print ++] = utf8[0]; |
| 1733 | + print_buf[num_print ++] = utf8[1]; |
| 1734 | + print_buf[num_print ++] = utf8[2]; |
| 1735 | + print_buf[num_print ++] = utf8[3]; |
| 1736 | + num_chars ++; |
| 1737 | + } |
| 1738 | + } |
| 1739 | + while (1); |
| 1740 | + |
| 1741 | + if (num_chars >= string_min) |
| 1742 | + { |
| 1743 | + /* We know that we have string_min valid characters in print_buf, |
| 1744 | + and there may be more to come in the stream. Start displaying |
| 1745 | + them. */ |
| 1746 | + |
| 1747 | + print_filename_and_address (filename, address + start_point); |
| 1748 | + |
| 1749 | + unsigned int i; |
| 1750 | + for (i = 0; i < num_print;) |
| 1751 | + { |
| 1752 | + if (print_buf[i] < 127) |
| 1753 | + putchar (print_buf[i++]); |
| 1754 | + else |
| 1755 | + i += display_utf8_char (print_buf + i); |
| 1756 | + } |
| 1757 | + |
| 1758 | + /* OK so now we have to start reading unchecked bytes. */ |
| 1759 | + |
| 1760 | + /* Print each further character directly, stopping at the first byte that ends the string. */ |
| 1761 | + do |
| 1762 | + { |
| 1763 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1764 | + if (c == EOF) |
| 1765 | + break; |
| 1766 | + |
| 1767 | + if (! STRING_ISGRAPHIC (c)) |
| 1768 | + break; |
| 1769 | + |
| 1770 | + if (c < 127) |
| 1771 | + { |
| 1772 | + putchar (c); |
| 1773 | + continue; |
| 1774 | + } |
| 1775 | + |
| 1776 | + if (c < 0xc0) |
| 1777 | + break; |
| 1778 | + |
| 1779 | + /* We *might* have a UTF-8 sequence. Time to start peeking. */ |
| 1780 | + unsigned char utf8[4]; |
| 1781 | + |
| 1782 | + utf8[0] = c; |
| 1783 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1784 | + if (c == EOF) |
| 1785 | + break; |
| 1786 | + utf8[1] = c; |
| 1787 | + |
| 1788 | + if ((utf8[1] & 0xc0) != 0x80) |
| 1789 | + { |
| 1790 | + /* Invalid UTF-8. */ |
| 1791 | + putback_buf[num_putback++] = utf8[1]; |
| 1792 | + break; |
| 1793 | + } |
| 1794 | + else if ((utf8[0] & 0x20) == 0) |
| 1795 | + { |
| 1796 | + /* Valid 2-byte UTF-8. */ |
| 1797 | + if (unicode_display == unicode_invalid) |
| 1798 | + { |
| 1799 | + putback_buf[num_putback++] = utf8[1]; |
| 1800 | + break; |
| 1801 | + } |
| 1802 | + else |
| 1803 | + { |
| 1804 | + (void) display_utf8_char (utf8); |
| 1805 | + continue; |
| 1806 | + } |
| 1807 | + } |
| 1808 | + |
| 1809 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1810 | + if (c == EOF) |
| 1811 | + break; |
| 1812 | + utf8[2] = c; |
| 1813 | + |
| 1814 | + if ((utf8[2] & 0xc0) != 0x80) |
| 1815 | + { |
| 1816 | + /* Invalid UTF-8. */ |
| 1817 | + putback_buf[num_putback++] = utf8[2]; |
| 1818 | + putback_buf[num_putback++] = utf8[1]; |
| 1819 | + break; |
| 1820 | + } |
| 1821 | + else if ((utf8[0] & 0x10) == 0) |
| 1822 | + { |
| 1823 | + /* Valid 3-byte UTF-8. */ |
| 1824 | + if (unicode_display == unicode_invalid) |
| 1825 | + { |
| 1826 | + putback_buf[num_putback++] = utf8[2]; |
| 1827 | + putback_buf[num_putback++] = utf8[1]; |
| 1828 | + break; |
| 1829 | + } |
| 1830 | + else |
| 1831 | + { |
| 1832 | + (void) display_utf8_char (utf8); |
| 1833 | + continue; |
| 1834 | + } |
| 1835 | + } |
| 1836 | + |
| 1837 | + c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); |
| 1838 | + if (c == EOF) |
| 1839 | + break; |
| 1840 | + utf8[3] = c; |
| 1841 | + |
| 1842 | + if ((utf8[3] & 0xc0) != 0x80) |
| 1843 | + { |
| 1844 | + /* Invalid UTF-8. */ |
| 1845 | + putback_buf[num_putback++] = utf8[3]; |
| 1846 | + putback_buf[num_putback++] = utf8[2]; |
| 1847 | + putback_buf[num_putback++] = utf8[1]; |
| 1848 | + break; |
| 1849 | + } |
| 1850 | + else if (unicode_display == unicode_invalid) |
| 1851 | + { |
| 1852 | + putback_buf[num_putback++] = utf8[3]; |
| 1853 | + putback_buf[num_putback++] = utf8[2]; |
| 1854 | + putback_buf[num_putback++] = utf8[1]; |
| 1855 | + break; |
| 1856 | + } |
| 1857 | + else |
| 1858 | + /* A valid 4-byte UTF-8 encoding. */ |
| 1859 | + (void) display_utf8_char (utf8); |
| 1860 | + } |
| 1861 | + while (1); |
| 1862 | + |
| 1863 | + if (output_separator) |
| 1864 | + fputs (output_separator, stdout); |
| 1865 | + else |
| 1866 | + putchar ('\n'); |
| 1867 | + } |
| 1868 | + |
| 1869 | + if (c != EOF) |
| 1870 | + /* FIXME: Using tail recursion here is lazy, but it works. */ |
| 1871 | + print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf); |
| 1872 | +} |
| 1873 | + |
| 1874 | +/* Display strings read in from STREAM. Treat any UTF-8 encoded characters |
| 1875 | + encountered according to the setting of the unicode_display variable. |
| 1876 | + The stream is positioned at ADDRESS and is attached to FILENAME. */ |
| 1877 | + |
| 1878 | +static void |
| 1879 | +print_unicode_stream (const char * filename, |
| 1880 | + file_ptr address, |
| 1881 | + FILE * stream) |
| 1882 | +{ |
| 1883 | + /* Paranoia checks: reject NULL arguments, unicode handling left at its default, or any encoding other than single-byte 'S'. */ |
| 1884 | + if (filename == NULL |
| 1885 | + || stream == NULL |
| 1886 | + || unicode_display == unicode_default |
| 1887 | + || encoding != 'S' |
| 1888 | + || encoding_bytes != 1) |
| 1889 | + { |
| 1890 | + fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n"); |
| 1891 | + return; |
| 1892 | + } |
| 1893 | + |
| 1894 | + /* Allocate space for string_min 4-byte utf-8 characters, plus one spare byte. */ |
| 1895 | + unsigned char * print_buf = xmalloc ((4 * string_min) + 1); |
| 1896 | + /* We should never have to put back more than 4 bytes; the buffer holds 5. */ |
| 1897 | + unsigned char putback_buf[5]; |
| 1898 | + unsigned int num_putback = 0; |
| 1899 | + |
| 1900 | + print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf); |
| 1901 | + free (print_buf); |
| 1902 | +} |
| 1903 | |
| 1904 | /* Find the strings in file FILENAME, read from STREAM. |
| 1905 | Assume that STREAM is positioned so that the next byte read |
| 1906 | @@ -566,20 +1236,29 @@ unget_part_char (long c, file_ptr *addre |
| 1907 | |
| 1908 | static void |
| 1909 | print_strings (const char *filename, FILE *stream, file_ptr address, |
| 1910 | - int stop_point, int magiccount, char *magic) |
| 1911 | + int magiccount, char *magic) |
| 1912 | { |
| 1913 | + if (unicode_display != unicode_default) |
| 1914 | + { |
| 1915 | + if (magic != NULL) |
| 1916 | + print_unicode_buffer (filename, address, |
| 1917 | + (const unsigned char *) magic, magiccount); |
| 1918 | + |
| 1919 | + if (stream != NULL) |
| 1920 | + print_unicode_stream (filename, address, stream); |
| 1921 | + return; |
| 1922 | + } |
| 1923 | + |
| 1924 | char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1)); |
| 1925 | |
| 1926 | while (1) |
| 1927 | { |
| 1928 | file_ptr start; |
| 1929 | - int i; |
| 1930 | + unsigned int i; |
| 1931 | long c; |
| 1932 | |
| 1933 | /* See if the next `string_min' chars are all graphic chars. */ |
| 1934 | tryline: |
| 1935 | - if (stop_point && address >= stop_point) |
| 1936 | - break; |
| 1937 | start = address; |
| 1938 | for (i = 0; i < string_min; i++) |
| 1939 | { |
| 1940 | @@ -601,51 +1280,7 @@ print_strings (const char *filename, FIL |
| 1941 | |
| 1942 | /* We found a run of `string_min' graphic characters. Print up |
| 1943 | to the next non-graphic character. */ |
| 1944 | - |
| 1945 | - if (print_filenames) |
| 1946 | - printf ("%s: ", filename); |
| 1947 | - if (print_addresses) |
| 1948 | - switch (address_radix) |
| 1949 | - { |
| 1950 | - case 8: |
| 1951 | - if (sizeof (start) > sizeof (long)) |
| 1952 | - { |
| 1953 | -#ifndef __MSVCRT__ |
| 1954 | - printf ("%7llo ", (unsigned long long) start); |
| 1955 | -#else |
| 1956 | - printf ("%7I64o ", (unsigned long long) start); |
| 1957 | -#endif |
| 1958 | - } |
| 1959 | - else |
| 1960 | - printf ("%7lo ", (unsigned long) start); |
| 1961 | - break; |
| 1962 | - |
| 1963 | - case 10: |
| 1964 | - if (sizeof (start) > sizeof (long)) |
| 1965 | - { |
| 1966 | -#ifndef __MSVCRT__ |
| 1967 | - printf ("%7llu ", (unsigned long long) start); |
| 1968 | -#else |
| 1969 | - printf ("%7I64d ", (unsigned long long) start); |
| 1970 | -#endif |
| 1971 | - } |
| 1972 | - else |
| 1973 | - printf ("%7ld ", (long) start); |
| 1974 | - break; |
| 1975 | - |
| 1976 | - case 16: |
| 1977 | - if (sizeof (start) > sizeof (long)) |
| 1978 | - { |
| 1979 | -#ifndef __MSVCRT__ |
| 1980 | - printf ("%7llx ", (unsigned long long) start); |
| 1981 | -#else |
| 1982 | - printf ("%7I64x ", (unsigned long long) start); |
| 1983 | -#endif |
| 1984 | - } |
| 1985 | - else |
| 1986 | - printf ("%7lx ", (unsigned long) start); |
| 1987 | - break; |
| 1988 | - } |
| 1989 | + print_filename_and_address (filename, start); |
| 1990 | |
| 1991 | buf[i] = '\0'; |
| 1992 | fputs (buf, stdout); |
| 1993 | @@ -697,6 +1332,8 @@ usage (FILE *stream, int status) |
| 1994 | -T --target=<BFDNAME> Specify the binary file format\n\ |
| 1995 | -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\ |
| 1996 | s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\ |
| 1997 | + --unicode={default|show|invalid|hex|escape|highlight}\n\ |
| 1998 | + -u {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\ |
| 1999 | -s --output-separator=<string> String used to separate strings in output.\n\ |
| 2000 | @<file> Read options from <file>\n\ |
| 2001 | -h --help Display this information\n\ |