Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | From c884dd12ec062569335702848fc5f29f436c28fa Mon Sep 17 00:00:00 2001 |
| 2 | From: Li xin <lixin.fnst@cn.fujitsu.com> |
| 3 | Date: Mon, 25 May 2015 10:15:57 +0900 |
| 4 | Subject: [PATCH] grep egrep fgrep: Fix LSB NG cases. |
| 5 | |
| 6 | The LSB core test requires that grep, egrep and fgrep |
| 7 | perform pattern matching without regard to case when |
| 8 | the -i option is specified. |
| 9 | |
| 10 | Upstream-Status: Backport |
| 11 | |
| 12 | Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com> |
| 13 | --- |
| 14 | lib/posix/regex.h | 4 + |
| 15 | src/dfa.c | 22 +- |
| 16 | src/grep.c | 96 ++++--- |
| 17 | src/search.c | 833 +++++++++++++++++++++++++++++++++++++++++++++--------- |
| 18 | 4 files changed, 768 insertions(+), 187 deletions(-) |
| 19 | |
| 20 | diff --git a/lib/posix/regex.h b/lib/posix/regex.h |
| 21 | index 63c2fef..7bb2b0e 100644 |
| 22 | --- a/lib/posix/regex.h |
| 23 | +++ b/lib/posix/regex.h |
| 24 | @@ -109,6 +109,10 @@ typedef unsigned long int reg_syntax_t; |
| 25 | If not set, \{, \}, {, and } are literals. */ |
| 26 | #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) |
| 27 | |
| 28 | +/* If this bit is set, then ignore case when matching. |
| 29 | + If not set, then case is significant. */ |
| 30 | +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) |
| 31 | + |
| 32 | /* If this bit is set, +, ? and | aren't recognized as operators. |
| 33 | If not set, they are. */ |
| 34 | #define RE_LIMITED_OPS (RE_INTERVALS << 1) |
| 35 | diff --git a/src/dfa.c b/src/dfa.c |
| 36 | index 590bfa7..27c876a 100644 |
| 37 | --- a/src/dfa.c |
| 38 | +++ b/src/dfa.c |
| 39 | @@ -414,7 +414,7 @@ update_mb_len_index (unsigned char const *p, int len) |
| 40 | |
| 41 | /* This function fetch a wide character, and update cur_mb_len, |
| 42 | used only if the current locale is a multibyte environment. */ |
| 43 | -static wchar_t |
| 44 | +static wint_t |
| 45 | fetch_wc (char const *eoferr) |
| 46 | { |
| 47 | wchar_t wc; |
| 48 | @@ -423,7 +423,7 @@ fetch_wc (char const *eoferr) |
| 49 | if (eoferr != 0) |
| 50 | dfaerror (eoferr); |
| 51 | else |
| 52 | - return -1; |
| 53 | + return WEOF; |
| 54 | } |
| 55 | |
| 56 | cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); |
| 57 | @@ -459,7 +459,7 @@ fetch_wc (char const *eoferr) |
| 58 | static void |
| 59 | parse_bracket_exp_mb () |
| 60 | { |
| 61 | - wchar_t wc, wc1, wc2; |
| 62 | + wint_t wc, wc1, wc2; |
| 63 | |
| 64 | /* Work area to build a mb_char_classes. */ |
| 65 | struct mb_char_classes *work_mbc; |
| 66 | @@ -496,7 +496,7 @@ parse_bracket_exp_mb () |
| 67 | work_mbc->invert = 0; |
| 68 | do |
| 69 | { |
| 70 | - wc1 = -1; /* mark wc1 is not initialized". */ |
| 71 | + wc1 = WEOF; /* mark wc1 is not initialized". */ |
| 72 | |
| 73 | /* Note that if we're looking at some other [:...:] construct, |
| 74 | we just treat it as a bunch of ordinary characters. We can do |
| 75 | @@ -586,7 +586,7 @@ parse_bracket_exp_mb () |
| 76 | work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; |
| 77 | } |
| 78 | } |
| 79 | - wc = -1; |
| 80 | + wc1 = wc = WEOF; |
| 81 | } |
| 82 | else |
| 83 | /* We treat '[' as a normal character here. */ |
| 84 | @@ -600,7 +600,7 @@ parse_bracket_exp_mb () |
| 85 | wc = fetch_wc(("Unbalanced [")); |
| 86 | } |
| 87 | |
| 88 | - if (wc1 == -1) |
| 89 | + if (wc1 == WEOF) |
| 90 | wc1 = fetch_wc(_("Unbalanced [")); |
| 91 | |
| 92 | if (wc1 == L'-') |
| 93 | @@ -630,17 +630,17 @@ parse_bracket_exp_mb () |
| 94 | } |
| 95 | REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, |
| 96 | range_sts_al, work_mbc->nranges + 1); |
| 97 | - work_mbc->range_sts[work_mbc->nranges] = wc; |
| 98 | + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; |
| 99 | REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, |
| 100 | range_ends_al, work_mbc->nranges + 1); |
| 101 | - work_mbc->range_ends[work_mbc->nranges++] = wc2; |
| 102 | + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; |
| 103 | } |
| 104 | - else if (wc != -1) |
| 105 | + else if (wc != WEOF) |
| 106 | /* build normal characters. */ |
| 107 | { |
| 108 | REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, |
| 109 | work_mbc->nchars + 1); |
| 110 | - work_mbc->chars[work_mbc->nchars++] = wc; |
| 111 | + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; |
| 112 | } |
| 113 | } |
| 114 | while ((wc = wc1) != L']'); |
| 115 | @@ -2552,6 +2552,8 @@ match_mb_charset (struct dfa *d, int s, position pos, int index) |
| 116 | } |
| 117 | |
| 118 | /* match with a character? */ |
| 119 | + if (case_fold) |
| 120 | + wc = towlower (wc); |
| 121 | for (i = 0; i<work_mbc->nchars; i++) |
| 122 | { |
| 123 | if (wc == work_mbc->chars[i]) |
| 124 | diff --git a/src/grep.c b/src/grep.c |
| 125 | index 2fb2fac..3fd4b47 100644 |
| 126 | --- a/src/grep.c |
| 127 | +++ b/src/grep.c |
| 128 | @@ -30,6 +30,12 @@ |
| 129 | # include <sys/time.h> |
| 130 | # include <sys/resource.h> |
| 131 | #endif |
| 132 | +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC |
| 133 | +/* We can handle multibyte string. */ |
| 134 | +# define MBS_SUPPORT |
| 135 | +# include <wchar.h> |
| 136 | +# include <wctype.h> |
| 137 | +#endif |
| 138 | #include <stdio.h> |
| 139 | #include "system.h" |
| 140 | #include "getopt.h" |
| 141 | @@ -255,19 +261,6 @@ reset (int fd, char const *file, struct stats *stats) |
| 142 | bufbeg[-1] = eolbyte; |
| 143 | bufdesc = fd; |
| 144 | |
| 145 | - if (fstat (fd, &stats->stat) != 0) |
| 146 | - { |
| 147 | - error (0, errno, "fstat"); |
| 148 | - return 0; |
| 149 | - } |
| 150 | - if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) |
| 151 | - return 0; |
| 152 | -#ifndef DJGPP |
| 153 | - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode))) |
| 154 | -#else |
| 155 | - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) |
| 156 | -#endif |
| 157 | - return 0; |
| 158 | if (S_ISREG (stats->stat.st_mode)) |
| 159 | { |
| 160 | if (file) |
| 161 | @@ -558,33 +551,6 @@ prline (char const *beg, char const *lim, int sep) |
| 162 | { |
| 163 | size_t match_size; |
| 164 | size_t match_offset; |
| 165 | - if(match_icase) |
| 166 | - { |
| 167 | - /* Yuck, this is tricky */ |
| 168 | - char *buf = (char*) xmalloc (lim - beg); |
| 169 | - char *ibeg = buf; |
| 170 | - char *ilim = ibeg + (lim - beg); |
| 171 | - int i; |
| 172 | - for (i = 0; i < lim - beg; i++) |
| 173 | - ibeg[i] = tolower (beg[i]); |
| 174 | - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) |
| 175 | - != (size_t) -1) |
| 176 | - { |
| 177 | - char const *b = beg + match_offset; |
| 178 | - if (b == lim) |
| 179 | - break; |
| 180 | - fwrite (beg, sizeof (char), match_offset, stdout); |
| 181 | - printf ("\33[%sm", grep_color); |
| 182 | - fwrite (b, sizeof (char), match_size, stdout); |
| 183 | - fputs ("\33[00m", stdout); |
| 184 | - beg = b + match_size; |
| 185 | - ibeg = ibeg + match_offset + match_size; |
| 186 | - } |
| 187 | - fwrite (beg, 1, lim - beg, stdout); |
| 188 | - free (buf); |
| 189 | - lastout = lim; |
| 190 | - return; |
| 191 | - } |
| 192 | while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) |
| 193 | != (size_t) -1) |
| 194 | { |
| 195 | @@ -601,6 +567,7 @@ prline (char const *beg, char const *lim, int sep) |
| 196 | fputs ("\33[00m", stdout); |
| 197 | beg = b + match_size; |
| 198 | } |
| 199 | + fputs ("\33[K", stdout); |
| 200 | } |
| 201 | fwrite (beg, 1, lim - beg, stdout); |
| 202 | if (ferror (stdout)) |
| 203 | @@ -623,7 +590,7 @@ prpending (char const *lim) |
| 204 | size_t match_size; |
| 205 | --pending; |
| 206 | if (outleft |
| 207 | - || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1) |
| 208 | + || (((*execute) (lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1) |
| 209 | == !out_invert)) |
| 210 | prline (lastout, nl + 1, '-'); |
| 211 | else |
| 212 | @@ -895,6 +862,19 @@ grepfile (char const *file, struct stats *stats) |
| 213 | } |
| 214 | else |
| 215 | { |
| 216 | + if (stat (file, &stats->stat) != 0) |
| 217 | + { |
| 218 | + suppressible_error (file, errno); |
| 219 | + return 1; |
| 220 | + } |
| 221 | + if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) |
| 222 | + return 1; |
| 223 | +#ifndef DJGPP |
| 224 | + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode) || S_ISFIFO(stats->stat.st_mode))) |
| 225 | +#else |
| 226 | + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) |
| 227 | +#endif |
| 228 | + return 1; |
| 229 | while ((desc = open (file, O_RDONLY)) < 0 && errno == EINTR) |
| 230 | continue; |
| 231 | |
| 232 | @@ -1681,9 +1661,6 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) |
| 233 | out_invert ^= 1; |
| 234 | match_lines = match_words = 0; |
| 235 | } |
| 236 | - else |
| 237 | - /* Strip trailing newline. */ |
| 238 | - --keycc; |
| 239 | } |
| 240 | else |
| 241 | if (optind < argc) |
| 242 | @@ -1697,6 +1674,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) |
| 243 | if (!install_matcher (matcher) && !install_matcher ("default")) |
| 244 | abort (); |
| 245 | |
| 246 | +#ifdef MBS_SUPPORT |
| 247 | + if (MB_CUR_MAX != 1 && match_icase) |
| 248 | + { |
| 249 | + wchar_t wc; |
| 250 | + mbstate_t cur_state, prev_state; |
| 251 | + int i, len = strlen(keys); |
| 252 | + |
| 253 | + memset(&cur_state, 0, sizeof(mbstate_t)); |
| 254 | + for (i = 0; i <= len ;) |
| 255 | + { |
| 256 | + size_t mbclen; |
| 257 | + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); |
| 258 | + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) |
| 259 | + { |
| 260 | + /* An invalid sequence, or a truncated multibyte character. |
| 261 | + We treat it as a singlebyte character. */ |
| 262 | + mbclen = 1; |
| 263 | + } |
| 264 | + else |
| 265 | + { |
| 266 | + if (iswupper((wint_t)wc)) |
| 267 | + { |
| 268 | + wc = towlower((wint_t)wc); |
| 269 | + wcrtomb(keys + i, wc, &cur_state); |
| 270 | + } |
| 271 | + } |
| 272 | + i += mbclen; |
| 273 | + } |
| 274 | + } |
| 275 | +#endif /* MBS_SUPPORT */ |
| 276 | + |
| 277 | (*compile)(keys, keycc); |
| 278 | |
| 279 | if ((argc - optind > 1 && !no_filenames) || with_filenames) |
| 280 | diff --git a/src/search.c b/src/search.c |
| 281 | index 7bd233f..3c6a485 100644 |
| 282 | --- a/src/search.c |
| 283 | +++ b/src/search.c |
| 284 | @@ -18,9 +18,13 @@ |
| 285 | |
| 286 | /* Written August 1992 by Mike Haertel. */ |
| 287 | |
| 288 | +#ifndef _GNU_SOURCE |
| 289 | +# define _GNU_SOURCE 1 |
| 290 | +#endif |
| 291 | #ifdef HAVE_CONFIG_H |
| 292 | # include <config.h> |
| 293 | #endif |
| 294 | +#include <assert.h> |
| 295 | #include <sys/types.h> |
| 296 | #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC |
| 297 | /* We can handle multibyte string. */ |
| 298 | @@ -31,7 +35,7 @@ |
| 299 | |
| 300 | #include "system.h" |
| 301 | #include "grep.h" |
| 302 | -#include "regex.h" |
| 303 | +#include <regex.h> |
| 304 | #include "dfa.h" |
| 305 | #include "kwset.h" |
| 306 | #include "error.h" |
| 307 | @@ -39,6 +43,9 @@ |
| 308 | #ifdef HAVE_LIBPCRE |
| 309 | # include <pcre.h> |
| 310 | #endif |
| 311 | +#ifdef HAVE_LANGINFO_CODESET |
| 312 | +# include <langinfo.h> |
| 313 | +#endif |
| 314 | |
| 315 | #define NCHAR (UCHAR_MAX + 1) |
| 316 | |
| 317 | @@ -70,9 +77,10 @@ static kwset_t kwset; |
| 318 | call the regexp matcher at all. */ |
| 319 | static int kwset_exact_matches; |
| 320 | |
| 321 | -#if defined(MBS_SUPPORT) |
| 322 | -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); |
| 323 | -#endif |
| 324 | +/* UTF-8 encoding allows some optimizations that we can't otherwise |
| 325 | + assume in a multibyte encoding. */ |
| 326 | +static int using_utf8; |
| 327 | + |
| 328 | static void kwsinit PARAMS ((void)); |
| 329 | static void kwsmusts PARAMS ((void)); |
| 330 | static void Gcompile PARAMS ((char const *, size_t)); |
| 331 | @@ -84,6 +92,15 @@ static void Pcompile PARAMS ((char const *, size_t )); |
| 332 | static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); |
| 333 | |
| 334 | void |
| 335 | +check_utf8 (void) |
| 336 | +{ |
| 337 | +#ifdef HAVE_LANGINFO_CODESET |
| 338 | + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) |
| 339 | + using_utf8 = 1; |
| 340 | +#endif |
| 341 | +} |
| 342 | + |
| 343 | +void |
| 344 | dfaerror (char const *mesg) |
| 345 | { |
| 346 | error (2, 0, mesg); |
| 347 | @@ -141,38 +158,6 @@ kwsmusts (void) |
| 348 | } |
| 349 | } |
| 350 | |
| 351 | -#ifdef MBS_SUPPORT |
| 352 | -/* This function allocate the array which correspond to "buf". |
| 353 | - Then this check multibyte string and mark on the positions which |
| 354 | - are not singlebyte character nor the first byte of a multibyte |
| 355 | - character. Caller must free the array. */ |
| 356 | -static char* |
| 357 | -check_multibyte_string(char const *buf, size_t size) |
| 358 | -{ |
| 359 | - char *mb_properties = malloc(size); |
| 360 | - mbstate_t cur_state; |
| 361 | - int i; |
| 362 | - memset(&cur_state, 0, sizeof(mbstate_t)); |
| 363 | - memset(mb_properties, 0, sizeof(char)*size); |
| 364 | - for (i = 0; i < size ;) |
| 365 | - { |
| 366 | - size_t mbclen; |
| 367 | - mbclen = mbrlen(buf + i, size - i, &cur_state); |
| 368 | - |
| 369 | - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) |
| 370 | - { |
| 371 | - /* An invalid sequence, or a truncated multibyte character. |
| 372 | - We treat it as a singlebyte character. */ |
| 373 | - mbclen = 1; |
| 374 | - } |
| 375 | - mb_properties[i] = mbclen; |
| 376 | - i += mbclen; |
| 377 | - } |
| 378 | - |
| 379 | - return mb_properties; |
| 380 | -} |
| 381 | -#endif |
| 382 | - |
| 383 | static void |
| 384 | Gcompile (char const *pattern, size_t size) |
| 385 | { |
| 386 | @@ -181,7 +166,8 @@ Gcompile (char const *pattern, size_t size) |
| 387 | size_t total = size; |
| 388 | char const *motif = pattern; |
| 389 | |
| 390 | - re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); |
| 391 | + check_utf8 (); |
| 392 | + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); |
| 393 | dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); |
| 394 | |
| 395 | /* For GNU regex compiler we have to pass the patterns separately to detect |
| 396 | @@ -218,6 +204,10 @@ Gcompile (char const *pattern, size_t size) |
| 397 | motif = sep; |
| 398 | } while (sep && total != 0); |
| 399 | |
| 400 | + /* Strip trailing newline. */ |
| 401 | + if (size && pattern[size - 1] == '\n') |
| 402 | + size--; |
| 403 | + |
| 404 | /* In the match_words and match_lines cases, we use a different pattern |
| 405 | for the DFA matcher that will quickly throw out cases that won't work. |
| 406 | Then if DFA succeeds we do some hairy stuff using the regex matcher |
| 407 | @@ -233,7 +223,7 @@ Gcompile (char const *pattern, size_t size) |
| 408 | static char const line_end[] = "\\)$"; |
| 409 | static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; |
| 410 | static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; |
| 411 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); |
| 412 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); |
| 413 | size_t i; |
| 414 | strcpy (n, match_lines ? line_beg : word_beg); |
| 415 | i = strlen (n); |
| 416 | @@ -257,14 +247,15 @@ Ecompile (char const *pattern, size_t size) |
| 417 | size_t total = size; |
| 418 | char const *motif = pattern; |
| 419 | |
| 420 | + check_utf8 (); |
| 421 | if (strcmp (matcher, "awk") == 0) |
| 422 | { |
| 423 | - re_set_syntax (RE_SYNTAX_AWK); |
| 424 | + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); |
| 425 | dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); |
| 426 | } |
| 427 | else |
| 428 | { |
| 429 | - re_set_syntax (RE_SYNTAX_POSIX_EGREP); |
| 430 | + re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); |
| 431 | dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); |
| 432 | } |
| 433 | |
| 434 | @@ -301,6 +292,10 @@ Ecompile (char const *pattern, size_t size) |
| 435 | motif = sep; |
| 436 | } while (sep && total != 0); |
| 437 | |
| 438 | + /* Strip trailing newline. */ |
| 439 | + if (size && pattern[size - 1] == '\n') |
| 440 | + size--; |
| 441 | + |
| 442 | /* In the match_words and match_lines cases, we use a different pattern |
| 443 | for the DFA matcher that will quickly throw out cases that won't work. |
| 444 | Then if DFA succeeds we do some hairy stuff using the regex matcher |
| 445 | @@ -316,7 +311,7 @@ Ecompile (char const *pattern, size_t size) |
| 446 | static char const line_end[] = ")$"; |
| 447 | static char const word_beg[] = "(^|[^[:alnum:]_])("; |
| 448 | static char const word_end[] = ")([^[:alnum:]_]|$)"; |
| 449 | - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); |
| 450 | + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); |
| 451 | size_t i; |
| 452 | strcpy (n, match_lines ? line_beg : word_beg); |
| 453 | i = strlen(n); |
| 454 | @@ -339,15 +334,34 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 455 | char eol = eolbyte; |
| 456 | int backref, start, len; |
| 457 | struct kwsmatch kwsm; |
| 458 | - size_t i; |
| 459 | + size_t i, ret_val; |
| 460 | + static int use_dfa; |
| 461 | + static int use_dfa_checked = 0; |
| 462 | #ifdef MBS_SUPPORT |
| 463 | - char *mb_properties = NULL; |
| 464 | + int mb_cur_max = MB_CUR_MAX; |
| 465 | + mbstate_t mbs; |
| 466 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 467 | #endif /* MBS_SUPPORT */ |
| 468 | |
| 469 | + if (!use_dfa_checked) |
| 470 | + { |
| 471 | + char *grep_use_dfa = getenv ("GREP_USE_DFA"); |
| 472 | + if (!grep_use_dfa) |
| 473 | + { |
| 474 | #ifdef MBS_SUPPORT |
| 475 | - if (MB_CUR_MAX > 1 && kwset) |
| 476 | - mb_properties = check_multibyte_string(buf, size); |
| 477 | + /* Turn off DFA when processing multibyte input. */ |
| 478 | + use_dfa = (MB_CUR_MAX == 1); |
| 479 | +#else |
| 480 | + use_dfa = 1; |
| 481 | #endif /* MBS_SUPPORT */ |
| 482 | + } |
| 483 | + else |
| 484 | + { |
| 485 | + use_dfa = atoi (grep_use_dfa); |
| 486 | + } |
| 487 | + |
| 488 | + use_dfa_checked = 1; |
| 489 | + } |
| 490 | |
| 491 | buflim = buf + size; |
| 492 | |
| 493 | @@ -358,47 +372,120 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 494 | if (kwset) |
| 495 | { |
| 496 | /* Find a possible match using the KWset matcher. */ |
| 497 | - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
| 498 | +#ifdef MBS_SUPPORT |
| 499 | + size_t bytes_left = 0; |
| 500 | +#endif /* MBS_SUPPORT */ |
| 501 | + size_t offset; |
| 502 | +#ifdef MBS_SUPPORT |
| 503 | + /* kwsexec doesn't work with match_icase and multibyte input. */ |
| 504 | + if (match_icase && mb_cur_max > 1) |
| 505 | + /* Avoid kwset */ |
| 506 | + offset = 0; |
| 507 | + else |
| 508 | +#endif /* MBS_SUPPORT */ |
| 509 | + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
| 510 | if (offset == (size_t) -1) |
| 511 | - { |
| 512 | + goto failure; |
| 513 | #ifdef MBS_SUPPORT |
| 514 | - if (MB_CUR_MAX > 1) |
| 515 | - free(mb_properties); |
| 516 | -#endif |
| 517 | - return (size_t)-1; |
| 518 | + if (mb_cur_max > 1 && !using_utf8) |
| 519 | + { |
| 520 | + bytes_left = offset; |
| 521 | + while (bytes_left) |
| 522 | + { |
| 523 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| 524 | + if (mlen == (size_t) -1 || mlen == 0) |
| 525 | + { |
| 526 | + /* Incomplete character: treat as single-byte. */ |
| 527 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 528 | + beg++; |
| 529 | + bytes_left--; |
| 530 | + continue; |
| 531 | + } |
| 532 | + |
| 533 | + if (mlen == (size_t) -2) |
| 534 | + /* Offset points inside multibyte character: |
| 535 | + * no good. */ |
| 536 | + break; |
| 537 | + |
| 538 | + beg += mlen; |
| 539 | + bytes_left -= mlen; |
| 540 | + } |
| 541 | } |
| 542 | + else |
| 543 | +#endif /* MBS_SUPPORT */ |
| 544 | beg += offset; |
| 545 | /* Narrow down to the line containing the candidate, and |
| 546 | run it through DFA. */ |
| 547 | end = memchr(beg, eol, buflim - beg); |
| 548 | end++; |
| 549 | #ifdef MBS_SUPPORT |
| 550 | - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) |
| 551 | + if (mb_cur_max > 1 && bytes_left) |
| 552 | continue; |
| 553 | -#endif |
| 554 | +#endif /* MBS_SUPPORT */ |
| 555 | while (beg > buf && beg[-1] != eol) |
| 556 | --beg; |
| 557 | - if (kwsm.index < kwset_exact_matches) |
| 558 | - goto success; |
| 559 | - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) |
| 560 | + if ( |
| 561 | +#ifdef MBS_SUPPORT |
| 562 | + !(match_icase && mb_cur_max > 1) && |
| 563 | +#endif /* MBS_SUPPORT */ |
| 564 | + (kwsm.index < kwset_exact_matches)) |
| 565 | + goto success_in_beg_and_end; |
| 566 | + if (use_dfa && |
| 567 | + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) |
| 568 | continue; |
| 569 | } |
| 570 | else |
| 571 | { |
| 572 | /* No good fixed strings; start with DFA. */ |
| 573 | - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); |
| 574 | +#ifdef MBS_SUPPORT |
| 575 | + size_t bytes_left = 0; |
| 576 | +#endif /* MBS_SUPPORT */ |
| 577 | + size_t offset = 0; |
| 578 | + if (use_dfa) |
| 579 | + offset = dfaexec (&dfa, beg, buflim - beg, &backref); |
| 580 | if (offset == (size_t) -1) |
| 581 | break; |
| 582 | /* Narrow down to the line we've found. */ |
| 583 | +#ifdef MBS_SUPPORT |
| 584 | + if (mb_cur_max > 1 && !using_utf8) |
| 585 | + { |
| 586 | + bytes_left = offset; |
| 587 | + while (bytes_left) |
| 588 | + { |
| 589 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| 590 | + if (mlen == (size_t) -1 || mlen == 0) |
| 591 | + { |
| 592 | + /* Incomplete character: treat as single-byte. */ |
| 593 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 594 | + beg++; |
| 595 | + bytes_left--; |
| 596 | + continue; |
| 597 | + } |
| 598 | + |
| 599 | + if (mlen == (size_t) -2) |
| 600 | + /* Offset points inside multibyte character: |
| 601 | + * no good. */ |
| 602 | + break; |
| 603 | + |
| 604 | + beg += mlen; |
| 605 | + bytes_left -= mlen; |
| 606 | + } |
| 607 | + } |
| 608 | + else |
| 609 | +#endif /* MBS_SUPPORT */ |
| 610 | beg += offset; |
| 611 | end = memchr (beg, eol, buflim - beg); |
| 612 | end++; |
| 613 | +#ifdef MBS_SUPPORT |
| 614 | + if (mb_cur_max > 1 && bytes_left) |
| 615 | + continue; |
| 616 | +#endif /* MBS_SUPPORT */ |
| 617 | while (beg > buf && beg[-1] != eol) |
| 618 | --beg; |
| 619 | } |
| 620 | /* Successful, no backreferences encountered! */ |
| 621 | - if (!backref) |
| 622 | - goto success; |
| 623 | + if (use_dfa && !backref) |
| 624 | + goto success_in_beg_and_end; |
| 625 | } |
| 626 | else |
| 627 | end = beg + size; |
| 628 | @@ -413,14 +500,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 629 | end - beg - 1, &(patterns[i].regs)))) |
| 630 | { |
| 631 | len = patterns[i].regs.end[0] - start; |
| 632 | - if (exact) |
| 633 | - { |
| 634 | - *match_size = len; |
| 635 | - return start; |
| 636 | - } |
| 637 | + if (exact && !match_words) |
| 638 | + goto success_in_start_and_len; |
| 639 | if ((!match_lines && !match_words) |
| 640 | || (match_lines && len == end - beg - 1)) |
| 641 | - goto success; |
| 642 | + goto success_in_beg_and_end; |
| 643 | /* If -w, check if the match aligns with word boundaries. |
| 644 | We do this iteratively because: |
| 645 | (a) the line may contain more than one occurence of the |
| 646 | @@ -431,10 +515,114 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 647 | if (match_words) |
| 648 | while (start >= 0) |
| 649 | { |
| 650 | - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) |
| 651 | - && (len == end - beg - 1 |
| 652 | - || !WCHAR ((unsigned char) beg[start + len]))) |
| 653 | - goto success; |
| 654 | + int lword_match = 0; |
| 655 | + if (start == 0) |
| 656 | + lword_match = 1; |
| 657 | + else |
| 658 | + { |
| 659 | + assert (start > 0); |
| 660 | +#ifdef MBS_SUPPORT |
| 661 | + if (mb_cur_max > 1) |
| 662 | + { |
| 663 | + const char *s; |
| 664 | + size_t mr; |
| 665 | + wchar_t pwc; |
| 666 | + |
| 667 | + /* Locate the start of the multibyte character |
| 668 | + before the match position (== beg + start). */ |
| 669 | + if (using_utf8) |
| 670 | + { |
| 671 | + /* UTF-8 is a special case: scan backwards |
| 672 | + until we find a 7-bit character or a |
| 673 | + lead byte. */ |
| 674 | + s = beg + start - 1; |
| 675 | + while (s > buf |
| 676 | + && (unsigned char) *s >= 0x80 |
| 677 | + && (unsigned char) *s <= 0xbf) |
| 678 | + --s; |
| 679 | + } |
| 680 | + else |
| 681 | + { |
| 682 | + /* Scan forwards to find the start of the |
| 683 | + last complete character before the |
| 684 | + match position. */ |
| 685 | + size_t bytes_left = start - 1; |
| 686 | + s = beg; |
| 687 | + while (bytes_left > 0) |
| 688 | + { |
| 689 | + mr = mbrlen (s, bytes_left, &mbs); |
| 690 | + if (mr == (size_t) -1 || mr == 0) |
| 691 | + { |
| 692 | + memset (&mbs, '\0', sizeof (mbs)); |
| 693 | + s++; |
| 694 | + bytes_left--; |
| 695 | + continue; |
| 696 | + } |
| 697 | + if (mr == (size_t) -2) |
| 698 | + { |
| 699 | + memset (&mbs, '\0', sizeof (mbs)); |
| 700 | + break; |
| 701 | + } |
| 702 | + s += mr; |
| 703 | + bytes_left -= mr; |
| 704 | + } |
| 705 | + } |
| 706 | + mr = mbrtowc (&pwc, s, beg + start - s, &mbs); |
| 707 | + if (mr == (size_t) -2 || mr == (size_t) -1 || |
| 708 | + mr == 0) |
| 709 | + { |
| 710 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 711 | + lword_match = 1; |
| 712 | + } |
| 713 | + else if (!(iswalnum (pwc) || pwc == L'_') |
| 714 | + && mr == beg + start - s) |
| 715 | + lword_match = 1; |
| 716 | + } |
| 717 | + else |
| 718 | +#endif /* MBS_SUPPORT */ |
| 719 | + if (!WCHAR ((unsigned char) beg[start - 1])) |
| 720 | + lword_match = 1; |
| 721 | + } |
| 722 | + |
| 723 | + if (lword_match) |
| 724 | + { |
| 725 | + int rword_match = 0; |
| 726 | + if (start + len == end - beg - 1) |
| 727 | + rword_match = 1; |
| 728 | + else |
| 729 | + { |
| 730 | +#ifdef MBS_SUPPORT |
| 731 | + if (mb_cur_max > 1) |
| 732 | + { |
| 733 | + wchar_t nwc; |
| 734 | + int mr; |
| 735 | + |
| 736 | + mr = mbtowc (&nwc, beg + start + len, |
| 737 | + end - beg - start - len - 1); |
| 738 | + if (mr <= 0) |
| 739 | + { |
| 740 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 741 | + rword_match = 1; |
| 742 | + } |
| 743 | + else if (!iswalnum (nwc) && nwc != L'_') |
| 744 | + rword_match = 1; |
| 745 | + } |
| 746 | + else |
| 747 | +#endif /* MBS_SUPPORT */ |
| 748 | + if (!WCHAR ((unsigned char) beg[start + len])) |
| 749 | + rword_match = 1; |
| 750 | + } |
| 751 | + |
| 752 | + if (rword_match) |
| 753 | + { |
| 754 | + if (!exact) |
| 755 | + /* Returns the whole line. */ |
| 756 | + goto success_in_beg_and_end; |
| 757 | + else |
| 758 | + /* Returns just this word match. */ |
| 759 | + goto success_in_start_and_len; |
| 760 | + } |
| 761 | + } |
| 762 | if (len > 0) |
| 763 | { |
| 764 | /* Try a shorter length anchored at the same place. */ |
| 765 | @@ -461,26 +649,154 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 766 | } |
| 767 | } /* for Regex patterns. */ |
| 768 | } /* for (beg = end ..) */ |
| 769 | -#ifdef MBS_SUPPORT |
| 770 | - if (MB_CUR_MAX > 1 && mb_properties) |
| 771 | - free (mb_properties); |
| 772 | -#endif /* MBS_SUPPORT */ |
| 773 | + |
| 774 | + failure: |
| 775 | return (size_t) -1; |
| 776 | |
| 777 | - success: |
| 778 | -#ifdef MBS_SUPPORT |
| 779 | - if (MB_CUR_MAX > 1 && mb_properties) |
| 780 | - free (mb_properties); |
| 781 | -#endif /* MBS_SUPPORT */ |
| 782 | - *match_size = end - beg; |
| 783 | - return beg - buf; |
| 784 | + success_in_beg_and_end: |
| 785 | + len = end - beg; |
| 786 | + start = beg - buf; |
| 787 | + /* FALLTHROUGH */ |
| 788 | + |
| 789 | + success_in_start_and_len: |
| 790 | + *match_size = len; |
| 791 | + return start; |
| 792 | } |
| 793 | |
| 794 | +#ifdef MBS_SUPPORT |
| 795 | +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ |
| 796 | +static struct |
| 797 | +{ |
| 798 | + wchar_t **patterns; |
| 799 | + size_t count, maxlen; |
| 800 | + unsigned char *match; |
| 801 | +} Fimb; |
| 802 | +#endif |
| 803 | + |
| 804 | static void |
| 805 | Fcompile (char const *pattern, size_t size) |
| 806 | { |
| 807 | + int mb_cur_max = MB_CUR_MAX; |
| 808 | char const *beg, *lim, *err; |
| 809 | |
| 810 | + check_utf8 (); |
| 811 | +#ifdef MBS_SUPPORT |
| 812 | + /* Support -F -i for UTF-8 input. */ |
| 813 | + if (match_icase && mb_cur_max > 1) |
| 814 | + { |
| 815 | + mbstate_t mbs; |
| 816 | + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); |
| 817 | + const char *patternend = pattern; |
| 818 | + size_t wcsize; |
| 819 | + kwset_t fimb_kwset = NULL; |
| 820 | + char *starts = NULL; |
| 821 | + wchar_t *wcbeg, *wclim; |
| 822 | + size_t allocated = 0; |
| 823 | + |
| 824 | + memset (&mbs, '\0', sizeof (mbs)); |
| 825 | +# ifdef __GNU_LIBRARY__ |
| 826 | + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); |
| 827 | + if (patternend != pattern + size) |
| 828 | + wcsize = (size_t) -1; |
| 829 | +# else |
| 830 | + { |
| 831 | + char *patterncopy = xmalloc (size + 1); |
| 832 | + |
| 833 | + memcpy (patterncopy, pattern, size); |
| 834 | + patterncopy[size] = '\0'; |
| 835 | + patternend = patterncopy; |
| 836 | + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); |
| 837 | + if (patternend != patterncopy + size) |
| 838 | + wcsize = (size_t) -1; |
| 839 | + free (patterncopy); |
| 840 | + } |
| 841 | +# endif |
| 842 | + if (wcsize + 2 <= 2) |
| 843 | + { |
| 844 | +fimb_fail: |
| 845 | + free (wcpattern); |
| 846 | + free (starts); |
| 847 | + if (fimb_kwset) |
| 848 | + kwsfree (fimb_kwset); |
| 849 | + free (Fimb.patterns); |
| 850 | + Fimb.patterns = NULL; |
| 851 | + } |
| 852 | + else |
| 853 | + { |
| 854 | + if (!(fimb_kwset = kwsalloc (NULL))) |
| 855 | + error (2, 0, _("memory exhausted")); |
| 856 | + |
| 857 | + starts = xmalloc (mb_cur_max * 3); |
| 858 | + wcbeg = wcpattern; |
| 859 | + do |
| 860 | + { |
| 861 | + int i; |
| 862 | + size_t wclen; |
| 863 | + |
| 864 | + if (Fimb.count >= allocated) |
| 865 | + { |
| 866 | + if (allocated == 0) |
| 867 | + allocated = 128; |
| 868 | + else |
| 869 | + allocated *= 2; |
| 870 | + Fimb.patterns = xrealloc (Fimb.patterns, |
| 871 | + sizeof (wchar_t *) * allocated); |
| 872 | + } |
| 873 | + Fimb.patterns[Fimb.count++] = wcbeg; |
| 874 | + for (wclim = wcbeg; |
| 875 | + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) |
| 876 | + *wclim = towlower (*wclim); |
| 877 | + *wclim = L'\0'; |
| 878 | + wclen = wclim - wcbeg; |
| 879 | + if (wclen > Fimb.maxlen) |
| 880 | + Fimb.maxlen = wclen; |
| 881 | + if (wclen > 3) |
| 882 | + wclen = 3; |
| 883 | + if (wclen == 0) |
| 884 | + { |
| 885 | + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) |
| 886 | + error (2, 0, err); |
| 887 | + } |
| 888 | + else |
| 889 | + for (i = 0; i < (1 << wclen); i++) |
| 890 | + { |
| 891 | + char *p = starts; |
| 892 | + int j, k; |
| 893 | + |
| 894 | + for (j = 0; j < wclen; ++j) |
| 895 | + { |
| 896 | + wchar_t wc = wcbeg[j]; |
| 897 | + if (i & (1 << j)) |
| 898 | + { |
| 899 | + wc = towupper (wc); |
| 900 | + if (wc == wcbeg[j]) |
| 901 | + continue; |
| 902 | + } |
| 903 | + k = wctomb (p, wc); |
| 904 | + if (k <= 0) |
| 905 | + goto fimb_fail; |
| 906 | + p += k; |
| 907 | + } |
| 908 | + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) |
| 909 | + error (2, 0, err); |
| 910 | + } |
| 911 | + if (wclim < wcpattern + wcsize) |
| 912 | + ++wclim; |
| 913 | + wcbeg = wclim; |
| 914 | + } |
| 915 | + while (wcbeg < wcpattern + wcsize); |
| 916 | + f_i_multibyte = 1; |
| 917 | + kwset = fimb_kwset; |
| 918 | + free (starts); |
| 919 | + Fimb.match = xmalloc (Fimb.count); |
| 920 | + if ((err = kwsprep (kwset)) != 0) |
| 921 | + error (2, 0, err); |
| 922 | + return; |
| 923 | + } |
| 924 | + } |
| 925 | +#endif /* MBS_SUPPORT */ |
| 926 | + |
| 927 | + |
| 928 | kwsinit (); |
| 929 | beg = pattern; |
| 930 | do |
| 931 | @@ -499,6 +815,76 @@ Fcompile (char const *pattern, size_t size) |
| 932 | error (2, 0, err); |
| 933 | } |
| 934 | |
| 935 | +#ifdef MBS_SUPPORT |
| 936 | +static int /* Returns 0 if a stored pattern matches at the start of BUF, else -1. */ |
| 937 | +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) |
| 938 | +{ /* On a match *PLEN receives the matched length in bytes; otherwise it is left untouched. */ |
| 939 | + size_t len, letter, i; /* len: bytes of BUF consumed; letter: wide-char index into each pattern */ |
| 940 | + int ret = -1; /* stays -1 until some pattern has been matched in full */ |
| 941 | + mbstate_t mbs; |
| 942 | + wchar_t wc; |
| 943 | + int patterns_left; /* nonzero while at least one candidate pattern can still be extended */ |
| 944 | + /* Only meaningful once the case-insensitive multibyte pattern table has been set up. */ |
| 945 | + assert (match_icase && f_i_multibyte == 1); |
| 946 | + assert (MB_CUR_MAX > 1); |
| 947 | + /* Start from a clean conversion state with every pattern marked as a live candidate. */ |
| 948 | + memset (&mbs, '\0', sizeof (mbs)); |
| 949 | + memset (Fimb.match, '\1', Fimb.count); |
| 950 | + letter = len = 0; |
| 951 | + patterns_left = 1; |
| 952 | + while (patterns_left && len <= size) /* len == size is allowed so a pattern ending exactly at the buffer end is seen */ |
| 953 | + { |
| 954 | + size_t c; /* bytes occupied by the character decoded in this iteration */ |
| 955 | + /* Decode the next character of BUF and fold it to lower case before comparing. */ |
| 956 | + patterns_left = 0; |
| 957 | + if (len < size) |
| 958 | + { |
| 959 | + c = mbrtowc (&wc, buf + len, size - len, &mbs); |
| 960 | + if (c + 2 <= 2) /* true exactly for (size_t) -2, (size_t) -1 and 0 */ |
| 961 | + return ret; /* cannot decode further: report whatever match was already recorded */ |
| 962 | + |
| 963 | + wc = towlower (wc); |
| 964 | + } |
| 965 | + else |
| 966 | + { |
| 967 | + c = 1; /* end of buffer: advancing len past size ends the loop after this pass */ |
| 968 | + wc = L'\0'; /* cannot equal any unfinished pattern character tested below */ |
| 969 | + } |
| 970 | + /* Advance every live pattern by one character, retiring those that finish or mismatch. */ |
| 971 | + for (i = 0; i < Fimb.count; i++) |
| 972 | + { |
| 973 | + if (Fimb.match[i]) |
| 974 | + { |
| 975 | + if (Fimb.patterns[i][letter] == L'\0') |
| 976 | + { |
| 977 | + /* Pattern i is exhausted: the len bytes consumed so far match it. */ |
| 978 | + *plen = len; |
| 979 | + if (!exact && !match_words) |
| 980 | + return 0; |
| 981 | + else |
| 982 | + { |
| 983 | + /* For -w or exact, record the match and keep scanning for a longer one. */ |
| 984 | + ret = 0; |
| 985 | + Fimb.match[i] = '\0'; /* retire this pattern so it is not reported again */ |
| 986 | + continue; |
| 987 | + } |
| 988 | + } |
| 989 | + |
| 990 | + if (Fimb.patterns[i][letter] == wc) |
| 991 | + patterns_left = 1; |
| 992 | + else |
| 993 | + Fimb.match[i] = '\0'; /* mismatch: drop pattern i from the candidate set */ |
| 994 | + } |
| 995 | + } |
| 996 | + |
| 997 | + len += c; |
| 998 | + letter++; |
| 999 | + } |
| 1000 | + |
| 1001 | + return ret; |
| 1002 | +} |
| 1003 | +#endif /* MBS_SUPPORT */ |
| 1004 | + |
| 1005 | static size_t |
| 1006 | Fexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 1007 | { |
| 1008 | @@ -506,88 +892,268 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 1009 | register size_t len; |
| 1010 | char eol = eolbyte; |
| 1011 | struct kwsmatch kwsmatch; |
| 1012 | + size_t ret_val; |
| 1013 | #ifdef MBS_SUPPORT |
| 1014 | - char *mb_properties; |
| 1015 | - if (MB_CUR_MAX > 1) |
| 1016 | - mb_properties = check_multibyte_string (buf, size); |
| 1017 | + int mb_cur_max = MB_CUR_MAX; |
| 1018 | + mbstate_t mbs; |
| 1019 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1020 | + const char *last_char = NULL; |
| 1021 | #endif /* MBS_SUPPORT */ |
| 1022 | |
| 1023 | - for (beg = buf; beg <= buf + size; ++beg) |
| 1024 | + for (beg = buf; beg < buf + size; ++beg) |
| 1025 | { |
| 1026 | - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); |
| 1027 | + size_t offset; |
| 1028 | + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); |
| 1029 | + |
| 1030 | if (offset == (size_t) -1) |
| 1031 | - { |
| 1032 | + goto failure; |
| 1033 | #ifdef MBS_SUPPORT |
| 1034 | - if (MB_CUR_MAX > 1) |
| 1035 | - free(mb_properties); |
| 1036 | -#endif /* MBS_SUPPORT */ |
| 1037 | - return offset; |
| 1038 | + if (mb_cur_max > 1 && !using_utf8) |
| 1039 | + { |
| 1040 | + size_t bytes_left = offset; |
| 1041 | + while (bytes_left) |
| 1042 | + { |
| 1043 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| 1044 | + |
| 1045 | + last_char = beg; |
| 1046 | + if (mlen == (size_t) -1 || mlen == 0) |
| 1047 | + { |
| 1048 | + /* Invalid sequence or NUL byte: treat as single-byte. */ |
| 1049 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1050 | + beg++; |
| 1051 | + bytes_left--; |
| 1052 | + continue; |
| 1053 | + } |
| 1054 | + |
| 1055 | + if (mlen == (size_t) -2) |
| 1056 | + /* Offset points inside multibyte character: no good. */ |
| 1057 | + break; |
| 1058 | + |
| 1059 | + beg += mlen; |
| 1060 | + bytes_left -= mlen; |
| 1061 | + } |
| 1062 | + |
| 1063 | + if (bytes_left) |
| 1064 | + continue; |
| 1065 | } |
| 1066 | -#ifdef MBS_SUPPORT |
| 1067 | - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) |
| 1068 | - continue; /* It is a part of multibyte character. */ |
| 1069 | + else |
| 1070 | #endif /* MBS_SUPPORT */ |
| 1071 | beg += offset; |
| 1072 | - len = kwsmatch.size[0]; |
| 1073 | - if (exact) |
| 1074 | - { |
| 1075 | - *match_size = len; |
| 1076 | #ifdef MBS_SUPPORT |
| 1077 | - if (MB_CUR_MAX > 1) |
| 1078 | - free (mb_properties); |
| 1079 | + /* For f_i_multibyte, the string at beg now matches first 3 chars of |
| 1080 | + one of the search strings (less if there are shorter search strings). |
| 1081 | + See if this is a real match. */ |
| 1082 | + if (f_i_multibyte |
| 1083 | + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) |
| 1084 | + goto next_char; |
| 1085 | #endif /* MBS_SUPPORT */ |
| 1086 | - return beg - buf; |
| 1087 | - } |
| 1088 | + len = kwsmatch.size[0]; |
| 1089 | + if (exact && !match_words) |
| 1090 | + goto success_in_beg_and_len; |
| 1091 | if (match_lines) |
| 1092 | { |
| 1093 | if (beg > buf && beg[-1] != eol) |
| 1094 | - continue; |
| 1095 | + goto next_char; |
| 1096 | if (beg + len < buf + size && beg[len] != eol) |
| 1097 | - continue; |
| 1098 | + goto next_char; |
| 1099 | goto success; |
| 1100 | } |
| 1101 | else if (match_words) |
| 1102 | - for (try = beg; len; ) |
| 1103 | - { |
| 1104 | - if (try > buf && WCHAR((unsigned char) try[-1])) |
| 1105 | - break; |
| 1106 | - if (try + len < buf + size && WCHAR((unsigned char) try[len])) |
| 1107 | - { |
| 1108 | - offset = kwsexec (kwset, beg, --len, &kwsmatch); |
| 1109 | - if (offset == (size_t) -1) |
| 1110 | - { |
| 1111 | + { |
| 1112 | + while (len) |
| 1113 | + { |
| 1114 | + int word_match = 0; |
| 1115 | + if (beg > buf) |
| 1116 | + { |
| 1117 | #ifdef MBS_SUPPORT |
| 1118 | - if (MB_CUR_MAX > 1) |
| 1119 | - free (mb_properties); |
| 1120 | + if (mb_cur_max > 1) |
| 1121 | + { |
| 1122 | + const char *s; |
| 1123 | + int mr; |
| 1124 | + wchar_t pwc; |
| 1125 | + |
| 1126 | + if (using_utf8) |
| 1127 | + { |
| 1128 | + s = beg - 1; |
| 1129 | + while (s > buf |
| 1130 | + && (unsigned char) *s >= 0x80 |
| 1131 | + && (unsigned char) *s <= 0xbf) |
| 1132 | + --s; |
| 1133 | + } |
| 1134 | + else |
| 1135 | + s = last_char; |
| 1136 | + mr = mbtowc (&pwc, s, beg - s); |
| 1137 | + if (mr <= 0) |
| 1138 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1139 | + else if ((iswalnum (pwc) || pwc == L'_') |
| 1140 | + && mr == (int) (beg - s)) |
| 1141 | + goto next_char; |
| 1142 | + } |
| 1143 | + else |
| 1144 | #endif /* MBS_SUPPORT */ |
| 1145 | - return offset; |
| 1146 | - } |
| 1147 | - try = beg + offset; |
| 1148 | - len = kwsmatch.size[0]; |
| 1149 | - } |
| 1150 | - else |
| 1151 | - goto success; |
| 1152 | - } |
| 1153 | + if (WCHAR ((unsigned char) beg[-1])) |
| 1154 | + goto next_char; |
| 1155 | + } |
| 1156 | +#ifdef MBS_SUPPORT |
| 1157 | + if (mb_cur_max > 1) |
| 1158 | + { |
| 1159 | + wchar_t nwc; |
| 1160 | + int mr; |
| 1161 | + |
| 1162 | + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); |
| 1163 | + if (mr <= 0) |
| 1164 | + { |
| 1165 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1166 | + word_match = 1; |
| 1167 | + } |
| 1168 | + else if (!iswalnum (nwc) && nwc != L'_') |
| 1169 | + word_match = 1; |
| 1170 | + } |
| 1171 | + else |
| 1172 | +#endif /* MBS_SUPPORT */ |
| 1173 | + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) |
| 1174 | + word_match = 1; |
| 1175 | + if (word_match) |
| 1176 | + { |
| 1177 | + if (!exact) |
| 1178 | + /* Returns the whole line now we know there's a word match. */ |
| 1179 | + goto success; |
| 1180 | + else |
| 1181 | + /* Returns just this word match. */ |
| 1182 | + goto success_in_beg_and_len; |
| 1183 | + } |
| 1184 | + if (len > 0) |
| 1185 | + { |
| 1186 | + /* Try a shorter length anchored at the same place. */ |
| 1187 | + --len; |
| 1188 | + offset = kwsexec (kwset, beg, len, &kwsmatch); |
| 1189 | + |
| 1190 | + if (offset == -1) |
| 1191 | + goto next_char; /* Try a different anchor. */ |
| 1192 | +#ifdef MBS_SUPPORT |
| 1193 | + if (mb_cur_max > 1 && !using_utf8) |
| 1194 | + { |
| 1195 | + size_t bytes_left = offset; |
| 1196 | + while (bytes_left) |
| 1197 | + { |
| 1198 | + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| 1199 | + |
| 1200 | + last_char = beg; |
| 1201 | + if (mlen == (size_t) -1 || mlen == 0) |
| 1202 | + { |
| 1203 | + /* Invalid sequence or NUL byte: treat as single-byte. */ |
| 1204 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1205 | + beg++; |
| 1206 | + bytes_left--; |
| 1207 | + continue; |
| 1208 | + } |
| 1209 | + |
| 1210 | + if (mlen == (size_t) -2) |
| 1211 | + { |
| 1212 | + /* Offset points inside multibyte character: |
| 1213 | + * no good. */ |
| 1214 | + break; |
| 1215 | + } |
| 1216 | + |
| 1217 | + beg += mlen; |
| 1218 | + bytes_left -= mlen; |
| 1219 | + } |
| 1220 | + |
| 1221 | + if (bytes_left) |
| 1222 | + { |
| 1223 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1224 | + goto next_char; /* Try a different anchor. */ |
| 1225 | + } |
| 1226 | + } |
| 1227 | + else |
| 1228 | +#endif /* MBS_SUPPORT */ |
| 1229 | + beg += offset; |
| 1230 | +#ifdef MBS_SUPPORT |
| 1231 | + /* The string at beg now matches first 3 chars of one of |
| 1232 | + the search strings (less if there are shorter search |
| 1233 | + strings). See if this is a real match. */ |
| 1234 | + if (f_i_multibyte |
| 1235 | + && Fimbexec (beg, len - offset, &kwsmatch.size[0], |
| 1236 | + exact)) |
| 1237 | + goto next_char; |
| 1238 | +#endif /* MBS_SUPPORT */ |
| 1239 | + len = kwsmatch.size[0]; |
| 1240 | + } |
| 1241 | + } |
| 1242 | + } |
| 1243 | else |
| 1244 | goto success; |
| 1245 | - } |
| 1246 | - |
| 1247 | +next_char:; |
| 1248 | #ifdef MBS_SUPPORT |
| 1249 | - if (MB_CUR_MAX > 1) |
| 1250 | - free (mb_properties); |
| 1251 | + /* Advance to next character. For MB_CUR_MAX == 1 case this is handled |
| 1252 | + by ++beg above. */ |
| 1253 | + if (mb_cur_max > 1) |
| 1254 | + { |
| 1255 | + if (using_utf8) |
| 1256 | + { |
| 1257 | + unsigned char c = *beg; |
| 1258 | + if (c >= 0xc2) |
| 1259 | + { |
| 1260 | + if (c < 0xe0) |
| 1261 | + ++beg; |
| 1262 | + else if (c < 0xf0) |
| 1263 | + beg += 2; |
| 1264 | + else if (c < 0xf8) |
| 1265 | + beg += 3; |
| 1266 | + else if (c < 0xfc) |
| 1267 | + beg += 4; |
| 1268 | + else if (c < 0xfe) |
| 1269 | + beg += 5; |
| 1270 | + } |
| 1271 | + } |
| 1272 | + else |
| 1273 | + { |
| 1274 | + size_t l = mbrlen (beg, buf + size - beg, &mbs); |
| 1275 | + |
| 1276 | + last_char = beg; |
| 1277 | + if (l + 2 >= 2) |
| 1278 | + beg += l - 1; |
| 1279 | + else |
| 1280 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1281 | + } |
| 1282 | + } |
| 1283 | #endif /* MBS_SUPPORT */ |
| 1284 | + } |
| 1285 | + |
| 1286 | + failure: |
| 1287 | return -1; |
| 1288 | |
| 1289 | success: |
| 1290 | +#ifdef MBS_SUPPORT |
| 1291 | + if (mb_cur_max > 1 && !using_utf8) |
| 1292 | + { |
| 1293 | + end = beg + len; |
| 1294 | + while (end < buf + size) |
| 1295 | + { |
| 1296 | + size_t mlen = mbrlen (end, buf + size - end, &mbs); |
| 1297 | + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) |
| 1298 | + { |
| 1299 | + memset (&mbs, '\0', sizeof (mbstate_t)); |
| 1300 | + mlen = 1; |
| 1301 | + } |
| 1302 | + if (mlen == 1 && *end == eol) |
| 1303 | + break; |
| 1304 | + |
| 1305 | + end += mlen; |
| 1306 | + } |
| 1307 | + } |
| 1308 | + else |
| 1309 | +#endif /* MBS_SUPPORT */ |
| 1310 | end = memchr (beg + len, eol, (buf + size) - (beg + len)); |
| 1311 | + |
| 1312 | end++; |
| 1313 | while (buf < beg && beg[-1] != eol) |
| 1314 | --beg; |
| 1315 | - *match_size = end - beg; |
| 1316 | -#ifdef MBS_SUPPORT |
| 1317 | - if (MB_CUR_MAX > 1) |
| 1318 | - free (mb_properties); |
| 1319 | -#endif /* MBS_SUPPORT */ |
| 1320 | + len = end - beg; |
| 1321 | + /* FALLTHROUGH */ |
| 1322 | + |
| 1323 | + success_in_beg_and_len: |
| 1324 | + *match_size = len; |
| 1325 | return beg - buf; |
| 1326 | } |
| 1327 | |
| 1328 | @@ -701,8 +1267,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| 1329 | char eol = eolbyte; |
| 1330 | if (!exact) |
| 1331 | { |
| 1332 | - end = memchr (end, eol, buflim - end); |
| 1333 | - end++; |
| 1334 | + while (end < buflim) |
| 1335 | + if (*end++ == eol) |
| 1336 | + break; |
| 1337 | while (buf < beg && beg[-1] != eol) |
| 1338 | --beg; |
| 1339 | } |
| 1340 | -- |
| 1341 | 1.8.4.2 |
| 1342 | |