| Upstream-Status: Backport |
| |
| Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com> |
| |
| From https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=6c84109cfa26f35c3dfed3acb97d347361bd5849 |
| Author: Carlos O'Donell <carlos@systemhalted.org> |
| Date: Thu Oct 8 16:34:53 2015 -0400 |
| |
| strcoll: Remove incorrect STRDIFF-based optimization (Bug 18589). |
| |
| The optimization introduced in commit |
| f13c2a8dff2329c6692a80176262ceaaf8a6f74e causes regressions in |
| sorting for languages that have digraphs that change sort order, like |
| cs_CZ, which sorts ch between h and i. |
| |
| My analysis shows the fast-forwarding optimization in STRCOLL advances |
| through a digraph while possibly stopping in the middle, which results |
| in a subsequent skipping of the digraph and incorrect sorting. The |
| optimization is incorrect as implemented, and because of that I'm |
| removing it for 2.23. I will also commit this fix for 2.22, where |
| it was originally introduced. |
| |
| This patch reverts the optimization and introduces a new bug-strcoll2.c |
| regression test that tests both cs_CZ.UTF-8 and da_DK.ISO-8859-1 and |
| ensures they each sort one digraph correctly. The optimization can't be |
| applied without regressing this test. |
| |
| Checked on x86_64: bug-strcoll2.c fails without this patch and passes |
| with it. This will also get a fix on 2.22, which has the same bug. |
| |
| (cherry picked from commit 87701a58e291bd7ac3b407d10a829dac52c9c16e) |
| --- |
| locale/C-collate.c | 4 +- |
| locale/categories.def | 1 - |
| locale/langinfo.h | 1 - |
| locale/localeinfo.h | 7 ---- |
| locale/programs/ld-collate.c | 9 ----- |
| string/bug-strcoll2.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ |
| string/strcoll_l.c | 39 +----------------- |
| wcsmbs/wcscoll_l.c | 1 - |
| 8 files changed, 98 insertions(+), 59 deletions(-) |
| create mode 100644 string/bug-strcoll2.c |
| |
| diff --git a/locale/C-collate.c b/locale/C-collate.c |
| index d7f3c55..06dfdfa 100644 |
| --- a/locale/C-collate.c |
| +++ b/locale/C-collate.c |
| @@ -144,8 +144,6 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden = |
| /* _NL_COLLATE_COLLSEQWC */ |
| { .string = (const char *) collseqwc }, |
| /* _NL_COLLATE_CODESET */ |
| - { .string = _nl_C_codeset }, |
| - /* _NL_COLLATE_ENCODING_TYPE */ |
| - { .word = __cet_8bit } |
| + { .string = _nl_C_codeset } |
| } |
| }; |
| diff --git a/locale/categories.def b/locale/categories.def |
| index 045489d..a8dda53 100644 |
| --- a/locale/categories.def |
| +++ b/locale/categories.def |
| @@ -58,7 +58,6 @@ DEFINE_CATEGORY |
| DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB, "collate-collseqmb", std, wstring) |
| DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC, "collate-collseqwc", std, wstring) |
| DEFINE_ELEMENT (_NL_COLLATE_CODESET, "collate-codeset", std, string) |
| - DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE, "collate-encoding-type", std, word) |
| ), NO_POSTLOAD) |
| |
| |
| diff --git a/locale/langinfo.h b/locale/langinfo.h |
| index ffc5c7f..a565d9d 100644 |
| --- a/locale/langinfo.h |
| +++ b/locale/langinfo.h |
| @@ -255,7 +255,6 @@ enum |
| _NL_COLLATE_COLLSEQMB, |
| _NL_COLLATE_COLLSEQWC, |
| _NL_COLLATE_CODESET, |
| - _NL_COLLATE_ENCODING_TYPE, |
| _NL_NUM_LC_COLLATE, |
| |
| /* LC_CTYPE category: character classification. |
| diff --git a/locale/localeinfo.h b/locale/localeinfo.h |
| index a7516c0..c076d8e 100644 |
| --- a/locale/localeinfo.h |
| +++ b/locale/localeinfo.h |
| @@ -110,13 +110,6 @@ enum coll_sort_rule |
| sort_mask |
| }; |
| |
| -/* Collation encoding type. */ |
| -enum collation_encoding_type |
| -{ |
| - __cet_other, |
| - __cet_8bit, |
| - __cet_utf8 |
| -}; |
| |
| /* We can map the types of the entries into a few categories. */ |
| enum value_type |
| diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c |
| index 16e9039..3c88c6d 100644 |
| --- a/locale/programs/ld-collate.c |
| +++ b/locale/programs/ld-collate.c |
| @@ -32,7 +32,6 @@ |
| #include "linereader.h" |
| #include "locfile.h" |
| #include "elem-hash.h" |
| -#include "../localeinfo.h" |
| |
| /* Uncomment the following line in the production version. */ |
| /* #define NDEBUG 1 */ |
| @@ -2130,8 +2129,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, |
| /* The words have to be handled specially. */ |
| if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB)) |
| add_locale_uint32 (&file, 0); |
| - else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)) |
| - add_locale_uint32 (&file, __cet_other); |
| else |
| add_locale_empty (&file); |
| } |
| @@ -2495,12 +2492,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, |
| add_locale_raw_data (&file, collate->mbseqorder, 256); |
| add_locale_collseq_table (&file, &collate->wcseqorder); |
| add_locale_string (&file, charmap->code_set_name); |
| - if (strcmp (charmap->code_set_name, "UTF-8") == 0) |
| - add_locale_uint32 (&file, __cet_utf8); |
| - else if (charmap->mb_cur_max == 1) |
| - add_locale_uint32 (&file, __cet_8bit); |
| - else |
| - add_locale_uint32 (&file, __cet_other); |
| write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file); |
| |
| obstack_free (&weightpool, NULL); |
| diff --git a/string/bug-strcoll2.c b/string/bug-strcoll2.c |
| new file mode 100644 |
| index 0000000..950b090 |
| --- /dev/null |
| +++ b/string/bug-strcoll2.c |
| @@ -0,0 +1,95 @@ |
| +/* Bug 18589: sort-test.sh fails at random. |
| + * Copyright (C) 1998-2015 Free Software Foundation, Inc. |
| + * This file is part of the GNU C Library. |
| + * Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. |
| + * |
| + * The GNU C Library is free software; you can redistribute it and/or |
| + * modify it under the terms of the GNU Lesser General Public |
| + * License as published by the Free Software Foundation; either |
| + * version 2.1 of the License, or (at your option) any later version. |
| + * |
| + * The GNU C Library is distributed in the hope that it will be useful, |
| + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + * Lesser General Public License for more details. |
| + * |
| + * You should have received a copy of the GNU Lesser General Public |
| + * License along with the GNU C Library; if not, see |
| + * <http://www.gnu.org/licenses/>. */ |
| + |
| +#include <stdio.h> |
| +#include <string.h> |
| +#include <locale.h> |
| + |
| +/* An incorrect strcoll optimization resulted in incorrect |
| + * results from strcoll for cs_CZ and da_DK. */ |
| + |
| +int |
| +test_cs_CZ (void) |
| +{ |
| + const char t1[] = "config"; |
| + const char t2[] = "choose"; |
| + if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL) |
| + { |
| + perror ("setlocale"); |
| + return 1; |
| + } |
| + /* In Czech the digraph ch sorts after c, therefore we expect |
| + * config to sort before choose. */ |
| + int a = strcoll (t1, t2); |
| + int b = strcoll (t2, t1); |
| + printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a); |
| + printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b); |
| + if (a < 0 && b > 0) |
| + { |
| + puts ("PASS: config < choose"); |
| + return 0; |
| + } |
| + else |
| + { |
| + puts ("FAIL: Wrong sorting in cz_CZ.UTF-8."); |
| + return 1; |
| + } |
| +} |
| + |
| +int |
| +test_da_DK (void) |
| +{ |
| + const char t1[] = "AS"; |
| + const char t2[] = "AA"; |
| + if (setlocale (LC_ALL, "da_DK.ISO-8859-1") == NULL) |
| + { |
| + perror ("setlocale"); |
| + return 1; |
| + } |
| + /* AA should be treated as the last letter of the Danish alphabet, |
| + * hence sorting after AS. */ |
| + int a = strcoll (t1, t2); |
| + int b = strcoll (t2, t1); |
| + printf ("strcoll (\"%s\", \"%s\") = %d\n", t1, t2, a); |
| + printf ("strcoll (\"%s\", \"%s\") = %d\n", t2, t1, b); |
| + if (a < 0 && b > 0) |
| + { |
| + puts ("PASS: AS < AA"); |
| + return 0; |
| + } |
| + else |
| + { |
| + puts ("FAIL: Wrong sorting in da_DK.ISO-8859-1"); |
| + return 1; |
| + } |
| +} |
| + |
| +static int |
| +do_test (void) |
| +{ |
| + int err = 0; |
| + err |= test_cs_CZ (); |
| + err |= test_da_DK (); |
| + return err; |
| +} |
| + |
| +#define TEST_FUNCTION do_test () |
| +#include "../test-skeleton.c" |
| + |
| + |
| diff --git a/string/strcoll_l.c b/string/strcoll_l.c |
| index b36b18c..a18b65e 100644 |
| --- a/string/strcoll_l.c |
| +++ b/string/strcoll_l.c |
| @@ -30,7 +30,6 @@ |
| # define STRING_TYPE char |
| # define USTRING_TYPE unsigned char |
| # define STRCOLL __strcoll_l |
| -# define STRDIFF __strdiff |
| # define STRCMP strcmp |
| # define WEIGHT_H "../locale/weight.h" |
| # define SUFFIX MB |
| @@ -43,19 +42,6 @@ |
| #include "../locale/localeinfo.h" |
| #include WEIGHT_H |
| |
| -#define MASK_UTF8_7BIT (1 << 7) |
| -#define MASK_UTF8_START (3 << 6) |
| - |
| -size_t |
| -STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t) |
| -{ |
| - size_t n; |
| - |
| - for (n = 0; *s != '\0' && *s++ == *t++; ++n) |
| - continue; |
| - |
| - return n; |
| -} |
| |
| /* Track status while looking for sequences in a string. */ |
| typedef struct |
| @@ -274,29 +260,9 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) |
| const USTRING_TYPE *extra; |
| const int32_t *indirect; |
| |
| - /* In case there is no locale specific sort order (C / POSIX). */ |
| if (nrules == 0) |
| return STRCMP (s1, s2); |
| |
| - /* Fast forward to the position of the first difference. Needs to be |
| - encoding aware as the byte-by-byte comparison can stop in the middle |
| - of a char sequence for multibyte encodings like UTF-8. */ |
| - uint_fast32_t encoding = |
| - current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; |
| - if (encoding != __cet_other) |
| - { |
| - size_t diff = STRDIFF (s1, s2); |
| - if (diff > 0) |
| - { |
| - if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0) |
| - do |
| - diff--; |
| - while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START); |
| - s1 += diff; |
| - s2 += diff; |
| - } |
| - } |
| - |
| /* Catch empty strings. */ |
| if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) |
| return (*s1 != '\0') - (*s2 != '\0'); |
| @@ -363,9 +329,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) |
| byte-level comparison to ensure that we don't waste time |
| going through multiple passes for totally equal strings |
| before proceeding to subsequent passes. */ |
| - if (pass == 0 && encoding == __cet_other && |
| - STRCMP (s1, s2) == 0) |
| - return result; |
| + if (pass == 0 && STRCMP (s1, s2) == 0) |
| + return result; |
| else |
| break; |
| } |
| diff --git a/wcsmbs/wcscoll_l.c b/wcsmbs/wcscoll_l.c |
| index 6d9384a..87f240d 100644 |
| --- a/wcsmbs/wcscoll_l.c |
| +++ b/wcsmbs/wcscoll_l.c |
| @@ -23,7 +23,6 @@ |
| #define STRING_TYPE wchar_t |
| #define USTRING_TYPE wint_t |
| #define STRCOLL __wcscoll_l |
| -#define STRDIFF __wcsdiff |
| #define STRCMP __wcscmp |
| #define WEIGHT_H "../locale/weightwc.h" |
| #define SUFFIX WC |
| -- |
| 1.8.4.2 |
| |