Patrick Williams | f1e5d69 | 2016-03-30 15:21:19 -0500 | [diff] [blame] | 1 | Upstream-Status: Backport |
| 2 | |
| 3 | Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com> |
| 4 | |
| 5 | From https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=6c84109cfa26f35c3dfed3acb97d347361bd5849 |
| 6 | Author: Carlos O'Donell <carlos@systemhalted.org> |
| 7 | Date: Thu Oct 8 16:34:53 2015 -0400 |
| 8 | |
| 9 | strcoll: Remove incorrect STRDIFF-based optimization (Bug 18589). |
| 10 | |
| 11 | The optimization introduced in commit |
| 12 | f13c2a8dff2329c6692a80176262ceaaf8a6f74e, causes regressions in |
| 13 | sorting for languages that have digraphs that change sort order, like |
| 14 | cs_CZ which sorts ch between h and i. |
| 15 | |
| 16 | My analysis shows that the fast-forwarding optimization in STRCOLL can |
| 17 | stop in the middle of a digraph while advancing through it, which |
| 18 | results in the digraph subsequently being skipped and in incorrect |
| 19 | sorting. The optimization is incorrect as implemented, so I'm |
| 20 | removing it for 2.23, and I will also commit this fix for 2.22 where |
| 21 | it was originally introduced. |
| 22 | |
| 23 | This patch reverts the optimization and introduces a new bug-strcoll2.c |
| 24 | regression test, which checks that cs_CZ.UTF-8 and da_DK.ISO-8859-1 |
| 25 | each sort one digraph correctly. The optimization can't be |
| 26 | applied without regressing this test. |
| 27 | |
| 28 | Checked on x86_64, bug-strcoll2.c fails without this patch and passes |
| 29 | after. This will also get a fix on 2.22 which has the same bug. |
| 30 | |
| 31 | (cherry picked from commit 87701a58e291bd7ac3b407d10a829dac52c9c16e) |
| 32 | --- |
| 33 | locale/C-collate.c | 4 +- |
| 34 | locale/categories.def | 1 - |
| 35 | locale/langinfo.h | 1 - |
| 36 | locale/localeinfo.h | 7 ---- |
| 37 | locale/programs/ld-collate.c | 9 ----- |
| 38 | string/bug-strcoll2.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ |
| 39 | string/strcoll_l.c | 39 +----------------- |
| 40 | wcsmbs/wcscoll_l.c | 1 - |
| 41 | 8 files changed, 98 insertions(+), 59 deletions(-) |
| 42 | create mode 100644 string/bug-strcoll2.c |
| 43 | |
| 44 | diff --git a/locale/C-collate.c b/locale/C-collate.c |
| 45 | index d7f3c55..06dfdfa 100644 |
| 46 | --- a/locale/C-collate.c |
| 47 | +++ b/locale/C-collate.c |
| 48 | @@ -144,8 +144,6 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden = |
| 49 | /* _NL_COLLATE_COLLSEQWC */ |
| 50 | { .string = (const char *) collseqwc }, |
| 51 | /* _NL_COLLATE_CODESET */ |
| 52 | - { .string = _nl_C_codeset }, |
| 53 | - /* _NL_COLLATE_ENCODING_TYPE */ |
| 54 | - { .word = __cet_8bit } |
| 55 | + { .string = _nl_C_codeset } |
| 56 | } |
| 57 | }; |
| 58 | diff --git a/locale/categories.def b/locale/categories.def |
| 59 | index 045489d..a8dda53 100644 |
| 60 | --- a/locale/categories.def |
| 61 | +++ b/locale/categories.def |
| 62 | @@ -58,7 +58,6 @@ DEFINE_CATEGORY |
| 63 | DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB, "collate-collseqmb", std, wstring) |
| 64 | DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC, "collate-collseqwc", std, wstring) |
| 65 | DEFINE_ELEMENT (_NL_COLLATE_CODESET, "collate-codeset", std, string) |
| 66 | - DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE, "collate-encoding-type", std, word) |
| 67 | ), NO_POSTLOAD) |
| 68 | |
| 69 | |
| 70 | diff --git a/locale/langinfo.h b/locale/langinfo.h |
| 71 | index ffc5c7f..a565d9d 100644 |
| 72 | --- a/locale/langinfo.h |
| 73 | +++ b/locale/langinfo.h |
| 74 | @@ -255,7 +255,6 @@ enum |
| 75 | _NL_COLLATE_COLLSEQMB, |
| 76 | _NL_COLLATE_COLLSEQWC, |
| 77 | _NL_COLLATE_CODESET, |
| 78 | - _NL_COLLATE_ENCODING_TYPE, |
| 79 | _NL_NUM_LC_COLLATE, |
| 80 | |
| 81 | /* LC_CTYPE category: character classification. |
| 82 | diff --git a/locale/localeinfo.h b/locale/localeinfo.h |
| 83 | index a7516c0..c076d8e 100644 |
| 84 | --- a/locale/localeinfo.h |
| 85 | +++ b/locale/localeinfo.h |
| 86 | @@ -110,13 +110,6 @@ enum coll_sort_rule |
| 87 | sort_mask |
| 88 | }; |
| 89 | |
| 90 | -/* Collation encoding type. */ |
| 91 | -enum collation_encoding_type |
| 92 | -{ |
| 93 | - __cet_other, |
| 94 | - __cet_8bit, |
| 95 | - __cet_utf8 |
| 96 | -}; |
| 97 | |
| 98 | /* We can map the types of the entries into a few categories. */ |
| 99 | enum value_type |
| 100 | diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c |
| 101 | index 16e9039..3c88c6d 100644 |
| 102 | --- a/locale/programs/ld-collate.c |
| 103 | +++ b/locale/programs/ld-collate.c |
| 104 | @@ -32,7 +32,6 @@ |
| 105 | #include "linereader.h" |
| 106 | #include "locfile.h" |
| 107 | #include "elem-hash.h" |
| 108 | -#include "../localeinfo.h" |
| 109 | |
| 110 | /* Uncomment the following line in the production version. */ |
| 111 | /* #define NDEBUG 1 */ |
| 112 | @@ -2130,8 +2129,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, |
| 113 | /* The words have to be handled specially. */ |
| 114 | if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB)) |
| 115 | add_locale_uint32 (&file, 0); |
| 116 | - else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)) |
| 117 | - add_locale_uint32 (&file, __cet_other); |
| 118 | else |
| 119 | add_locale_empty (&file); |
| 120 | } |
| 121 | @@ -2495,12 +2492,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, |
| 122 | add_locale_raw_data (&file, collate->mbseqorder, 256); |
| 123 | add_locale_collseq_table (&file, &collate->wcseqorder); |
| 124 | add_locale_string (&file, charmap->code_set_name); |
| 125 | - if (strcmp (charmap->code_set_name, "UTF-8") == 0) |
| 126 | - add_locale_uint32 (&file, __cet_utf8); |
| 127 | - else if (charmap->mb_cur_max == 1) |
| 128 | - add_locale_uint32 (&file, __cet_8bit); |
| 129 | - else |
| 130 | - add_locale_uint32 (&file, __cet_other); |
| 131 | write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file); |
| 132 | |
| 133 | obstack_free (&weightpool, NULL); |
| 134 | diff --git a/string/bug-strcoll2.c b/string/bug-strcoll2.c |
| 135 | new file mode 100644 |
| 136 | index 0000000..950b090 |
| 137 | --- /dev/null |
| 138 | +++ b/string/bug-strcoll2.c |
| 139 | @@ -0,0 +1,95 @@ |
| 140 | +/* Bug 18589: sort-test.sh fails at random. |
| 141 | + * Copyright (C) 1998-2015 Free Software Foundation, Inc. |
| 142 | + * This file is part of the GNU C Library. |
| 143 | + * Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. |
| 144 | + * |
| 145 | + * The GNU C Library is free software; you can redistribute it and/or |
| 146 | + * modify it under the terms of the GNU Lesser General Public |
| 147 | + * License as published by the Free Software Foundation; either |
| 148 | + * version 2.1 of the License, or (at your option) any later version. |
| 149 | + * |
| 150 | + * The GNU C Library is distributed in the hope that it will be useful, |
| 151 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 152 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 153 | + * Lesser General Public License for more details. |
| 154 | + * |
| 155 | + * You should have received a copy of the GNU Lesser General Public |
| 156 | + * License along with the GNU C Library; if not, see |
| 157 | + * <http://www.gnu.org/licenses/>. */ |
| 158 | + |
| 159 | +#include <stdio.h> |
| 160 | +#include <string.h> |
| 161 | +#include <locale.h> |
| 162 | + |
| 163 | +/* An incorrect strcoll optimization resulted in incorrect |
| 164 | + * results from strcoll for cs_CZ and da_DK. */ |
| 165 | + |
| 166 | +int |
| 167 | +test_cs_CZ (void) |
| 168 | +{ |
| 169 | + const char t1[] = "config"; |
| 170 | + const char t2[] = "choose"; |
| 171 | + if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL) |
| 172 | + { |
| 173 | + perror ("setlocale"); |
| 174 | + return 1; |
| 175 | + } |
| 176 | + /* In Czech the digraph ch sorts between h and i, i.e. after every word |
| 177 | + * starting with a plain c, therefore we expect config to sort before choose. */ |
| 178 | + int a = strcoll (t1, t2); |
| 179 | + int b = strcoll (t2, t1); |
| 180 | + printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a); |
| 181 | + printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b); |
| 182 | + if (a < 0 && b > 0) |
| 183 | + { |
| 184 | + puts ("PASS: config < choose"); |
| 185 | + return 0; |
| 186 | + } |
| 187 | + else |
| 188 | + { |
| 189 | + puts ("FAIL: Wrong sorting in cs_CZ.UTF-8."); |
| 190 | + return 1; |
| 191 | + } |
| 192 | +} |
| 193 | + |
| 194 | +int |
| 195 | +test_da_DK (void) |
| 196 | +{ |
| 197 | + const char t1[] = "AS"; |
| 198 | + const char t2[] = "AA"; |
| 199 | + if (setlocale (LC_ALL, "da_DK.ISO-8859-1") == NULL) |
| 200 | + { |
| 201 | + perror ("setlocale"); |
| 202 | + return 1; |
| 203 | + } |
| 204 | + /* AA should be treated as the last letter of the Danish alphabet, |
| 205 | + * hence sorting after AS. */ |
| 206 | + int a = strcoll (t1, t2); |
| 207 | + int b = strcoll (t2, t1); |
| 208 | + printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a); |
| 209 | + printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b); |
| 210 | + if (a < 0 && b > 0) |
| 211 | + { |
| 212 | + puts ("PASS: AS < AA"); |
| 213 | + return 0; |
| 214 | + } |
| 215 | + else |
| 216 | + { |
| 217 | + puts ("FAIL: Wrong sorting in da_DK.ISO-8859-1"); |
| 218 | + return 1; |
| 219 | + } |
| 220 | +} |
| 221 | + |
| 222 | +static int |
| 223 | +do_test (void) |
| 224 | +{ |
| 225 | + int err = 0; |
| 226 | + err |= test_cs_CZ (); |
| 227 | + err |= test_da_DK (); |
| 228 | + return err; |
| 229 | +} |
| 230 | + |
| 231 | +#define TEST_FUNCTION do_test () |
| 232 | +#include "../test-skeleton.c" |
| 233 | + |
| 234 | + |
| 235 | diff --git a/string/strcoll_l.c b/string/strcoll_l.c |
| 236 | index b36b18c..a18b65e 100644 |
| 237 | --- a/string/strcoll_l.c |
| 238 | +++ b/string/strcoll_l.c |
| 239 | @@ -30,7 +30,6 @@ |
| 240 | # define STRING_TYPE char |
| 241 | # define USTRING_TYPE unsigned char |
| 242 | # define STRCOLL __strcoll_l |
| 243 | -# define STRDIFF __strdiff |
| 244 | # define STRCMP strcmp |
| 245 | # define WEIGHT_H "../locale/weight.h" |
| 246 | # define SUFFIX MB |
| 247 | @@ -43,19 +42,6 @@ |
| 248 | #include "../locale/localeinfo.h" |
| 249 | #include WEIGHT_H |
| 250 | |
| 251 | -#define MASK_UTF8_7BIT (1 << 7) |
| 252 | -#define MASK_UTF8_START (3 << 6) |
| 253 | - |
| 254 | -size_t |
| 255 | -STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t) |
| 256 | -{ |
| 257 | - size_t n; |
| 258 | - |
| 259 | - for (n = 0; *s != '\0' && *s++ == *t++; ++n) |
| 260 | - continue; |
| 261 | - |
| 262 | - return n; |
| 263 | -} |
| 264 | |
| 265 | /* Track status while looking for sequences in a string. */ |
| 266 | typedef struct |
| 267 | @@ -274,29 +260,9 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) |
| 268 | const USTRING_TYPE *extra; |
| 269 | const int32_t *indirect; |
| 270 | |
| 271 | - /* In case there is no locale specific sort order (C / POSIX). */ |
| 272 | if (nrules == 0) |
| 273 | return STRCMP (s1, s2); |
| 274 | |
| 275 | - /* Fast forward to the position of the first difference. Needs to be |
| 276 | - encoding aware as the byte-by-byte comparison can stop in the middle |
| 277 | - of a char sequence for multibyte encodings like UTF-8. */ |
| 278 | - uint_fast32_t encoding = |
| 279 | - current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; |
| 280 | - if (encoding != __cet_other) |
| 281 | - { |
| 282 | - size_t diff = STRDIFF (s1, s2); |
| 283 | - if (diff > 0) |
| 284 | - { |
| 285 | - if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0) |
| 286 | - do |
| 287 | - diff--; |
| 288 | - while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START); |
| 289 | - s1 += diff; |
| 290 | - s2 += diff; |
| 291 | - } |
| 292 | - } |
| 293 | - |
| 294 | /* Catch empty strings. */ |
| 295 | if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) |
| 296 | return (*s1 != '\0') - (*s2 != '\0'); |
| 297 | @@ -363,9 +329,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) |
| 298 | byte-level comparison to ensure that we don't waste time |
| 299 | going through multiple passes for totally equal strings |
| 300 | before proceeding to subsequent passes. */ |
| 301 | - if (pass == 0 && encoding == __cet_other && |
| 302 | - STRCMP (s1, s2) == 0) |
| 303 | - return result; |
| 304 | + if (pass == 0 && STRCMP (s1, s2) == 0) |
| 305 | + return result; |
| 306 | else |
| 307 | break; |
| 308 | } |
| 309 | diff --git a/wcsmbs/wcscoll_l.c b/wcsmbs/wcscoll_l.c |
| 310 | index 6d9384a..87f240d 100644 |
| 311 | --- a/wcsmbs/wcscoll_l.c |
| 312 | +++ b/wcsmbs/wcscoll_l.c |
| 313 | @@ -23,7 +23,6 @@ |
| 314 | #define STRING_TYPE wchar_t |
| 315 | #define USTRING_TYPE wint_t |
| 316 | #define STRCOLL __wcscoll_l |
| 317 | -#define STRDIFF __wcsdiff |
| 318 | #define STRCMP __wcscmp |
| 319 | #define WEIGHT_H "../locale/weightwc.h" |
| 320 | #define SUFFIX WC |
| 321 | -- |
| 322 | 1.8.4.2 |
| 323 | |