| From c884dd12ec062569335702848fc5f29f436c28fa Mon Sep 17 00:00:00 2001 |
| From: Li xin <lixin.fnst@cn.fujitsu.com> |
| Date: Mon, 25 May 2015 10:15:57 +0900 |
| Subject: [PATCH] grep egrep fgrep: Fix LSB NG cases. |
| |
| The LSB core test requires that grep, egrep and fgrep |
| perform pattern matching without regard to case when |
| the -i option is specified. |
| |
| Upstream-Status: Backport |
| |
| Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com> |
| --- |
| lib/posix/regex.h | 4 + |
| src/dfa.c | 22 +- |
| src/grep.c | 96 ++++--- |
| src/search.c | 833 +++++++++++++++++++++++++++++++++++++++++++++--------- |
| 4 files changed, 768 insertions(+), 187 deletions(-) |
| |
| diff --git a/lib/posix/regex.h b/lib/posix/regex.h |
| index 63c2fef..7bb2b0e 100644 |
| --- a/lib/posix/regex.h |
| +++ b/lib/posix/regex.h |
| @@ -109,6 +109,10 @@ typedef unsigned long int reg_syntax_t; |
| If not set, \{, \}, {, and } are literals. */ |
| #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) |
| |
| +/* If this bit is set, then ignore case when matching. |
| + If not set, then case is significant. */ |
| +#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1) |
| + |
| /* If this bit is set, +, ? and | aren't recognized as operators. |
| If not set, they are. */ |
| #define RE_LIMITED_OPS (RE_INTERVALS << 1) |
| diff --git a/src/dfa.c b/src/dfa.c |
| index 590bfa7..27c876a 100644 |
| --- a/src/dfa.c |
| +++ b/src/dfa.c |
| @@ -414,7 +414,7 @@ update_mb_len_index (unsigned char const *p, int len) |
| |
| /* This function fetch a wide character, and update cur_mb_len, |
| used only if the current locale is a multibyte environment. */ |
| -static wchar_t |
| +static wint_t |
| fetch_wc (char const *eoferr) |
| { |
| wchar_t wc; |
| @@ -423,7 +423,7 @@ fetch_wc (char const *eoferr) |
| if (eoferr != 0) |
| dfaerror (eoferr); |
| else |
| - return -1; |
| + return WEOF; |
| } |
| |
| cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); |
| @@ -459,7 +459,7 @@ fetch_wc (char const *eoferr) |
| static void |
| parse_bracket_exp_mb () |
| { |
| - wchar_t wc, wc1, wc2; |
| + wint_t wc, wc1, wc2; |
| |
| /* Work area to build a mb_char_classes. */ |
| struct mb_char_classes *work_mbc; |
| @@ -496,7 +496,7 @@ parse_bracket_exp_mb () |
| work_mbc->invert = 0; |
| do |
| { |
| - wc1 = -1; /* mark wc1 is not initialized". */ |
| + wc1 = WEOF; /* mark wc1 is not initialized". */ |
| |
| /* Note that if we're looking at some other [:...:] construct, |
| we just treat it as a bunch of ordinary characters. We can do |
| @@ -586,7 +586,7 @@ parse_bracket_exp_mb () |
| work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; |
| } |
| } |
| - wc = -1; |
| + wc1 = wc = WEOF; |
| } |
| else |
| /* We treat '[' as a normal character here. */ |
| @@ -600,7 +600,7 @@ parse_bracket_exp_mb () |
| wc = fetch_wc(("Unbalanced [")); |
| } |
| |
| - if (wc1 == -1) |
| + if (wc1 == WEOF) |
| wc1 = fetch_wc(_("Unbalanced [")); |
| |
| if (wc1 == L'-') |
| @@ -630,17 +630,17 @@ parse_bracket_exp_mb () |
| } |
| REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, |
| range_sts_al, work_mbc->nranges + 1); |
| - work_mbc->range_sts[work_mbc->nranges] = wc; |
| + work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc; |
| REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, |
| range_ends_al, work_mbc->nranges + 1); |
| - work_mbc->range_ends[work_mbc->nranges++] = wc2; |
| + work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2; |
| } |
| - else if (wc != -1) |
| + else if (wc != WEOF) |
| /* build normal characters. */ |
| { |
| REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, |
| work_mbc->nchars + 1); |
| - work_mbc->chars[work_mbc->nchars++] = wc; |
| + work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc; |
| } |
| } |
| while ((wc = wc1) != L']'); |
| @@ -2552,6 +2552,8 @@ match_mb_charset (struct dfa *d, int s, position pos, int index) |
| } |
| |
| /* match with a character? */ |
| + if (case_fold) |
| + wc = towlower (wc); |
| for (i = 0; i<work_mbc->nchars; i++) |
| { |
| if (wc == work_mbc->chars[i]) |
| diff --git a/src/grep.c b/src/grep.c |
| index 2fb2fac..3fd4b47 100644 |
| --- a/src/grep.c |
| +++ b/src/grep.c |
| @@ -30,6 +30,12 @@ |
| # include <sys/time.h> |
| # include <sys/resource.h> |
| #endif |
| +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC |
| +/* We can handle multibyte string. */ |
| +# define MBS_SUPPORT |
| +# include <wchar.h> |
| +# include <wctype.h> |
| +#endif |
| #include <stdio.h> |
| #include "system.h" |
| #include "getopt.h" |
| @@ -255,19 +261,6 @@ reset (int fd, char const *file, struct stats *stats) |
| bufbeg[-1] = eolbyte; |
| bufdesc = fd; |
| |
| - if (fstat (fd, &stats->stat) != 0) |
| - { |
| - error (0, errno, "fstat"); |
| - return 0; |
| - } |
| - if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) |
| - return 0; |
| -#ifndef DJGPP |
| - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode))) |
| -#else |
| - if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) |
| -#endif |
| - return 0; |
| if (S_ISREG (stats->stat.st_mode)) |
| { |
| if (file) |
| @@ -558,33 +551,6 @@ prline (char const *beg, char const *lim, int sep) |
| { |
| size_t match_size; |
| size_t match_offset; |
| - if(match_icase) |
| - { |
| - /* Yuck, this is tricky */ |
| - char *buf = (char*) xmalloc (lim - beg); |
| - char *ibeg = buf; |
| - char *ilim = ibeg + (lim - beg); |
| - int i; |
| - for (i = 0; i < lim - beg; i++) |
| - ibeg[i] = tolower (beg[i]); |
| - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) |
| - != (size_t) -1) |
| - { |
| - char const *b = beg + match_offset; |
| - if (b == lim) |
| - break; |
| - fwrite (beg, sizeof (char), match_offset, stdout); |
| - printf ("\33[%sm", grep_color); |
| - fwrite (b, sizeof (char), match_size, stdout); |
| - fputs ("\33[00m", stdout); |
| - beg = b + match_size; |
| - ibeg = ibeg + match_offset + match_size; |
| - } |
| - fwrite (beg, 1, lim - beg, stdout); |
| - free (buf); |
| - lastout = lim; |
| - return; |
| - } |
| while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) |
| != (size_t) -1) |
| { |
| @@ -601,6 +567,7 @@ prline (char const *beg, char const *lim, int sep) |
| fputs ("\33[00m", stdout); |
| beg = b + match_size; |
| } |
| + fputs ("\33[K", stdout); |
| } |
| fwrite (beg, 1, lim - beg, stdout); |
| if (ferror (stdout)) |
| @@ -623,7 +590,7 @@ prpending (char const *lim) |
| size_t match_size; |
| --pending; |
| if (outleft |
| - || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1) |
| + || (((*execute) (lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1) |
| == !out_invert)) |
| prline (lastout, nl + 1, '-'); |
| else |
| @@ -895,6 +862,19 @@ grepfile (char const *file, struct stats *stats) |
| } |
| else |
| { |
| + if (stat (file, &stats->stat) != 0) |
| + { |
| + suppressible_error (file, errno); |
| + return 1; |
| + } |
| + if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) |
| + return 1; |
| +#ifndef DJGPP |
| + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode) || S_ISFIFO(stats->stat.st_mode))) |
| +#else |
| + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) |
| +#endif |
| + return 1; |
| while ((desc = open (file, O_RDONLY)) < 0 && errno == EINTR) |
| continue; |
| |
| @@ -1681,9 +1661,6 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) |
| out_invert ^= 1; |
| match_lines = match_words = 0; |
| } |
| - else |
| - /* Strip trailing newline. */ |
| - --keycc; |
| } |
| else |
| if (optind < argc) |
| @@ -1697,6 +1674,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) |
| if (!install_matcher (matcher) && !install_matcher ("default")) |
| abort (); |
| |
| +#ifdef MBS_SUPPORT |
| + if (MB_CUR_MAX != 1 && match_icase) |
| + { |
| + wchar_t wc; |
| + mbstate_t cur_state, prev_state; |
| + int i, len = strlen(keys); |
| + |
| + memset(&cur_state, 0, sizeof(mbstate_t)); |
| + for (i = 0; i <= len ;) |
| + { |
| + size_t mbclen; |
| + mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state); |
| + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) |
| + { |
| + /* An invalid sequence, or a truncated multibyte character. |
| + We treat it as a singlebyte character. */ |
| + mbclen = 1; |
| + } |
| + else |
| + { |
| + if (iswupper((wint_t)wc)) |
| + { |
| + wc = towlower((wint_t)wc); |
| + wcrtomb(keys + i, wc, &cur_state); |
| + } |
| + } |
| + i += mbclen; |
| + } |
| + } |
| +#endif /* MBS_SUPPORT */ |
| + |
| (*compile)(keys, keycc); |
| |
| if ((argc - optind > 1 && !no_filenames) || with_filenames) |
| diff --git a/src/search.c b/src/search.c |
| index 7bd233f..3c6a485 100644 |
| --- a/src/search.c |
| +++ b/src/search.c |
| @@ -18,9 +18,13 @@ |
| |
| /* Written August 1992 by Mike Haertel. */ |
| |
| +#ifndef _GNU_SOURCE |
| +# define _GNU_SOURCE 1 |
| +#endif |
| #ifdef HAVE_CONFIG_H |
| # include <config.h> |
| #endif |
| +#include <assert.h> |
| #include <sys/types.h> |
| #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC |
| /* We can handle multibyte string. */ |
| @@ -31,7 +35,7 @@ |
| |
| #include "system.h" |
| #include "grep.h" |
| -#include "regex.h" |
| +#include <regex.h> |
| #include "dfa.h" |
| #include "kwset.h" |
| #include "error.h" |
| @@ -39,6 +43,9 @@ |
| #ifdef HAVE_LIBPCRE |
| # include <pcre.h> |
| #endif |
| +#ifdef HAVE_LANGINFO_CODESET |
| +# include <langinfo.h> |
| +#endif |
| |
| #define NCHAR (UCHAR_MAX + 1) |
| |
| @@ -70,9 +77,10 @@ static kwset_t kwset; |
| call the regexp matcher at all. */ |
| static int kwset_exact_matches; |
| |
| -#if defined(MBS_SUPPORT) |
| -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); |
| -#endif |
| +/* UTF-8 encoding allows some optimizations that we can't otherwise |
| + assume in a multibyte encoding. */ |
| +static int using_utf8; |
| + |
| static void kwsinit PARAMS ((void)); |
| static void kwsmusts PARAMS ((void)); |
| static void Gcompile PARAMS ((char const *, size_t)); |
| @@ -84,6 +92,15 @@ static void Pcompile PARAMS ((char const *, size_t )); |
| static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); |
| |
| void |
| +check_utf8 (void) |
| +{ |
| +#ifdef HAVE_LANGINFO_CODESET |
| + if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) |
| + using_utf8 = 1; |
| +#endif |
| +} |
| + |
| +void |
| dfaerror (char const *mesg) |
| { |
| error (2, 0, mesg); |
| @@ -141,38 +158,6 @@ kwsmusts (void) |
| } |
| } |
| |
| -#ifdef MBS_SUPPORT |
| -/* This function allocate the array which correspond to "buf". |
| - Then this check multibyte string and mark on the positions which |
| - are not singlebyte character nor the first byte of a multibyte |
| - character. Caller must free the array. */ |
| -static char* |
| -check_multibyte_string(char const *buf, size_t size) |
| -{ |
| - char *mb_properties = malloc(size); |
| - mbstate_t cur_state; |
| - int i; |
| - memset(&cur_state, 0, sizeof(mbstate_t)); |
| - memset(mb_properties, 0, sizeof(char)*size); |
| - for (i = 0; i < size ;) |
| - { |
| - size_t mbclen; |
| - mbclen = mbrlen(buf + i, size - i, &cur_state); |
| - |
| - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) |
| - { |
| - /* An invalid sequence, or a truncated multibyte character. |
| - We treat it as a singlebyte character. */ |
| - mbclen = 1; |
| - } |
| - mb_properties[i] = mbclen; |
| - i += mbclen; |
| - } |
| - |
| - return mb_properties; |
| -} |
| -#endif |
| - |
| static void |
| Gcompile (char const *pattern, size_t size) |
| { |
| @@ -181,7 +166,8 @@ Gcompile (char const *pattern, size_t size) |
| size_t total = size; |
| char const *motif = pattern; |
| |
| - re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); |
| + check_utf8 (); |
| + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0)); |
| dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); |
| |
| /* For GNU regex compiler we have to pass the patterns separately to detect |
| @@ -218,6 +204,10 @@ Gcompile (char const *pattern, size_t size) |
| motif = sep; |
| } while (sep && total != 0); |
| |
| + /* Strip trailing newline. */ |
| + if (size && pattern[size - 1] == '\n') |
| + size--; |
| + |
| /* In the match_words and match_lines cases, we use a different pattern |
| for the DFA matcher that will quickly throw out cases that won't work. |
| Then if DFA succeeds we do some hairy stuff using the regex matcher |
| @@ -233,7 +223,7 @@ Gcompile (char const *pattern, size_t size) |
| static char const line_end[] = "\\)$"; |
| static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; |
| static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; |
| - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); |
| + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); |
| size_t i; |
| strcpy (n, match_lines ? line_beg : word_beg); |
| i = strlen (n); |
| @@ -257,14 +247,15 @@ Ecompile (char const *pattern, size_t size) |
| size_t total = size; |
| char const *motif = pattern; |
| |
| + check_utf8 (); |
| if (strcmp (matcher, "awk") == 0) |
| { |
| - re_set_syntax (RE_SYNTAX_AWK); |
| + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); |
| dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); |
| } |
| else |
| { |
| - re_set_syntax (RE_SYNTAX_POSIX_EGREP); |
| + re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0)); |
| dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); |
| } |
| |
| @@ -301,6 +292,10 @@ Ecompile (char const *pattern, size_t size) |
| motif = sep; |
| } while (sep && total != 0); |
| |
| + /* Strip trailing newline. */ |
| + if (size && pattern[size - 1] == '\n') |
| + size--; |
| + |
| /* In the match_words and match_lines cases, we use a different pattern |
| for the DFA matcher that will quickly throw out cases that won't work. |
| Then if DFA succeeds we do some hairy stuff using the regex matcher |
| @@ -316,7 +311,7 @@ Ecompile (char const *pattern, size_t size) |
| static char const line_end[] = ")$"; |
| static char const word_beg[] = "(^|[^[:alnum:]_])("; |
| static char const word_end[] = ")([^[:alnum:]_]|$)"; |
| - char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); |
| + char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end); |
| size_t i; |
| strcpy (n, match_lines ? line_beg : word_beg); |
| i = strlen(n); |
| @@ -339,15 +334,34 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| char eol = eolbyte; |
| int backref, start, len; |
| struct kwsmatch kwsm; |
| - size_t i; |
| + size_t i, ret_val; |
| + static int use_dfa; |
| + static int use_dfa_checked = 0; |
| #ifdef MBS_SUPPORT |
| - char *mb_properties = NULL; |
| + int mb_cur_max = MB_CUR_MAX; |
| + mbstate_t mbs; |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| #endif /* MBS_SUPPORT */ |
| |
| + if (!use_dfa_checked) |
| + { |
| + char *grep_use_dfa = getenv ("GREP_USE_DFA"); |
| + if (!grep_use_dfa) |
| + { |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1 && kwset) |
| - mb_properties = check_multibyte_string(buf, size); |
| + /* Turn off DFA when processing multibyte input. */ |
| + use_dfa = (MB_CUR_MAX == 1); |
| +#else |
| + use_dfa = 1; |
| #endif /* MBS_SUPPORT */ |
| + } |
| + else |
| + { |
| + use_dfa = atoi (grep_use_dfa); |
| + } |
| + |
| + use_dfa_checked = 1; |
| + } |
| |
| buflim = buf + size; |
| |
| @@ -358,47 +372,120 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| if (kwset) |
| { |
| /* Find a possible match using the KWset matcher. */ |
| - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
| +#ifdef MBS_SUPPORT |
| + size_t bytes_left = 0; |
| +#endif /* MBS_SUPPORT */ |
| + size_t offset; |
| +#ifdef MBS_SUPPORT |
| + /* kwsexec doesn't work with match_icase and multibyte input. */ |
| + if (match_icase && mb_cur_max > 1) |
| + /* Avoid kwset */ |
| + offset = 0; |
| + else |
| +#endif /* MBS_SUPPORT */ |
| + offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
| if (offset == (size_t) -1) |
| - { |
| + goto failure; |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free(mb_properties); |
| -#endif |
| - return (size_t)-1; |
| + if (mb_cur_max > 1 && !using_utf8) |
| + { |
| + bytes_left = offset; |
| + while (bytes_left) |
| + { |
| + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| + if (mlen == (size_t) -1 || mlen == 0) |
| + { |
| + /* Incomplete character: treat as single-byte. */ |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + beg++; |
| + bytes_left--; |
| + continue; |
| + } |
| + |
| + if (mlen == (size_t) -2) |
| + /* Offset points inside multibyte character: |
| + * no good. */ |
| + break; |
| + |
| + beg += mlen; |
| + bytes_left -= mlen; |
| + } |
| } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| beg += offset; |
| /* Narrow down to the line containing the candidate, and |
| run it through DFA. */ |
| end = memchr(beg, eol, buflim - beg); |
| end++; |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) |
| + if (mb_cur_max > 1 && bytes_left) |
| continue; |
| -#endif |
| +#endif /* MBS_SUPPORT */ |
| while (beg > buf && beg[-1] != eol) |
| --beg; |
| - if (kwsm.index < kwset_exact_matches) |
| - goto success; |
| - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) |
| + if ( |
| +#ifdef MBS_SUPPORT |
| + !(match_icase && mb_cur_max > 1) && |
| +#endif /* MBS_SUPPORT */ |
| + (kwsm.index < kwset_exact_matches)) |
| + goto success_in_beg_and_end; |
| + if (use_dfa && |
| + dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) |
| continue; |
| } |
| else |
| { |
| /* No good fixed strings; start with DFA. */ |
| - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); |
| +#ifdef MBS_SUPPORT |
| + size_t bytes_left = 0; |
| +#endif /* MBS_SUPPORT */ |
| + size_t offset = 0; |
| + if (use_dfa) |
| + offset = dfaexec (&dfa, beg, buflim - beg, &backref); |
| if (offset == (size_t) -1) |
| break; |
| /* Narrow down to the line we've found. */ |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1 && !using_utf8) |
| + { |
| + bytes_left = offset; |
| + while (bytes_left) |
| + { |
| + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| + if (mlen == (size_t) -1 || mlen == 0) |
| + { |
| + /* Incomplete character: treat as single-byte. */ |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + beg++; |
| + bytes_left--; |
| + continue; |
| + } |
| + |
| + if (mlen == (size_t) -2) |
| + /* Offset points inside multibyte character: |
| + * no good. */ |
| + break; |
| + |
| + beg += mlen; |
| + bytes_left -= mlen; |
| + } |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| beg += offset; |
| end = memchr (beg, eol, buflim - beg); |
| end++; |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1 && bytes_left) |
| + continue; |
| +#endif /* MBS_SUPPORT */ |
| while (beg > buf && beg[-1] != eol) |
| --beg; |
| } |
| /* Successful, no backreferences encountered! */ |
| - if (!backref) |
| - goto success; |
| + if (use_dfa && !backref) |
| + goto success_in_beg_and_end; |
| } |
| else |
| end = beg + size; |
| @@ -413,14 +500,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| end - beg - 1, &(patterns[i].regs)))) |
| { |
| len = patterns[i].regs.end[0] - start; |
| - if (exact) |
| - { |
| - *match_size = len; |
| - return start; |
| - } |
| + if (exact && !match_words) |
| + goto success_in_start_and_len; |
| if ((!match_lines && !match_words) |
| || (match_lines && len == end - beg - 1)) |
| - goto success; |
| + goto success_in_beg_and_end; |
| /* If -w, check if the match aligns with word boundaries. |
| We do this iteratively because: |
| (a) the line may contain more than one occurence of the |
| @@ -431,10 +515,114 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| if (match_words) |
| while (start >= 0) |
| { |
| - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) |
| - && (len == end - beg - 1 |
| - || !WCHAR ((unsigned char) beg[start + len]))) |
| - goto success; |
| + int lword_match = 0; |
| + if (start == 0) |
| + lword_match = 1; |
| + else |
| + { |
| + assert (start > 0); |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1) |
| + { |
| + const char *s; |
| + size_t mr; |
| + wchar_t pwc; |
| + |
| + /* Locate the start of the multibyte character |
| + before the match position (== beg + start). */ |
| + if (using_utf8) |
| + { |
| + /* UTF-8 is a special case: scan backwards |
| + until we find a 7-bit character or a |
| + lead byte. */ |
| + s = beg + start - 1; |
| + while (s > buf |
| + && (unsigned char) *s >= 0x80 |
| + && (unsigned char) *s <= 0xbf) |
| + --s; |
| + } |
| + else |
| + { |
| + /* Scan forwards to find the start of the |
| + last complete character before the |
| + match position. */ |
| + size_t bytes_left = start - 1; |
| + s = beg; |
| + while (bytes_left > 0) |
| + { |
| + mr = mbrlen (s, bytes_left, &mbs); |
| + if (mr == (size_t) -1 || mr == 0) |
| + { |
| + memset (&mbs, '\0', sizeof (mbs)); |
| + s++; |
| + bytes_left--; |
| + continue; |
| + } |
| + if (mr == (size_t) -2) |
| + { |
| + memset (&mbs, '\0', sizeof (mbs)); |
| + break; |
| + } |
| + s += mr; |
| + bytes_left -= mr; |
| + } |
| + } |
| + mr = mbrtowc (&pwc, s, beg + start - s, &mbs); |
| + if (mr == (size_t) -2 || mr == (size_t) -1 || |
| + mr == 0) |
| + { |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + lword_match = 1; |
| + } |
| + else if (!(iswalnum (pwc) || pwc == L'_') |
| + && mr == beg + start - s) |
| + lword_match = 1; |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| + if (!WCHAR ((unsigned char) beg[start - 1])) |
| + lword_match = 1; |
| + } |
| + |
| + if (lword_match) |
| + { |
| + int rword_match = 0; |
| + if (start + len == end - beg - 1) |
| + rword_match = 1; |
| + else |
| + { |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1) |
| + { |
| + wchar_t nwc; |
| + int mr; |
| + |
| + mr = mbtowc (&nwc, beg + start + len, |
| + end - beg - start - len - 1); |
| + if (mr <= 0) |
| + { |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + rword_match = 1; |
| + } |
| + else if (!iswalnum (nwc) && nwc != L'_') |
| + rword_match = 1; |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| + if (!WCHAR ((unsigned char) beg[start + len])) |
| + rword_match = 1; |
| + } |
| + |
| + if (rword_match) |
| + { |
| + if (!exact) |
| + /* Returns the whole line. */ |
| + goto success_in_beg_and_end; |
| + else |
| + /* Returns just this word match. */ |
| + goto success_in_start_and_len; |
| + } |
| + } |
| if (len > 0) |
| { |
| /* Try a shorter length anchored at the same place. */ |
| @@ -461,26 +649,154 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| } |
| } /* for Regex patterns. */ |
| } /* for (beg = end ..) */ |
| -#ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1 && mb_properties) |
| - free (mb_properties); |
| -#endif /* MBS_SUPPORT */ |
| + |
| + failure: |
| return (size_t) -1; |
| |
| - success: |
| -#ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1 && mb_properties) |
| - free (mb_properties); |
| -#endif /* MBS_SUPPORT */ |
| - *match_size = end - beg; |
| - return beg - buf; |
| + success_in_beg_and_end: |
| + len = end - beg; |
| + start = beg - buf; |
| + /* FALLTHROUGH */ |
| + |
| + success_in_start_and_len: |
| + *match_size = len; |
| + return start; |
| } |
| |
| +#ifdef MBS_SUPPORT |
| +static int f_i_multibyte; /* whether we're using the new -Fi MB method */ |
| +static struct |
| +{ |
| + wchar_t **patterns; |
| + size_t count, maxlen; |
| + unsigned char *match; |
| +} Fimb; |
| +#endif |
| + |
| static void |
| Fcompile (char const *pattern, size_t size) |
| { |
| + int mb_cur_max = MB_CUR_MAX; |
| char const *beg, *lim, *err; |
| |
| + check_utf8 (); |
| +#ifdef MBS_SUPPORT |
| + /* Support -F -i for UTF-8 input. */ |
| + if (match_icase && mb_cur_max > 1) |
| + { |
| + mbstate_t mbs; |
| + wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); |
| + const char *patternend = pattern; |
| + size_t wcsize; |
| + kwset_t fimb_kwset = NULL; |
| + char *starts = NULL; |
| + wchar_t *wcbeg, *wclim; |
| + size_t allocated = 0; |
| + |
| + memset (&mbs, '\0', sizeof (mbs)); |
| +# ifdef __GNU_LIBRARY__ |
| + wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); |
| + if (patternend != pattern + size) |
| + wcsize = (size_t) -1; |
| +# else |
| + { |
| + char *patterncopy = xmalloc (size + 1); |
| + |
| + memcpy (patterncopy, pattern, size); |
| + patterncopy[size] = '\0'; |
| + patternend = patterncopy; |
| + wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); |
| + if (patternend != patterncopy + size) |
| + wcsize = (size_t) -1; |
| + free (patterncopy); |
| + } |
| +# endif |
| + if (wcsize + 2 <= 2) |
| + { |
| +fimb_fail: |
| + free (wcpattern); |
| + free (starts); |
| + if (fimb_kwset) |
| + kwsfree (fimb_kwset); |
| + free (Fimb.patterns); |
| + Fimb.patterns = NULL; |
| + } |
| + else |
| + { |
| + if (!(fimb_kwset = kwsalloc (NULL))) |
| + error (2, 0, _("memory exhausted")); |
| + |
| + starts = xmalloc (mb_cur_max * 3); |
| + wcbeg = wcpattern; |
| + do |
| + { |
| + int i; |
| + size_t wclen; |
| + |
| + if (Fimb.count >= allocated) |
| + { |
| + if (allocated == 0) |
| + allocated = 128; |
| + else |
| + allocated *= 2; |
| + Fimb.patterns = xrealloc (Fimb.patterns, |
| + sizeof (wchar_t *) * allocated); |
| + } |
| + Fimb.patterns[Fimb.count++] = wcbeg; |
| + for (wclim = wcbeg; |
| + wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) |
| + *wclim = towlower (*wclim); |
| + *wclim = L'\0'; |
| + wclen = wclim - wcbeg; |
| + if (wclen > Fimb.maxlen) |
| + Fimb.maxlen = wclen; |
| + if (wclen > 3) |
| + wclen = 3; |
| + if (wclen == 0) |
| + { |
| + if ((err = kwsincr (fimb_kwset, "", 0)) != 0) |
| + error (2, 0, err); |
| + } |
| + else |
| + for (i = 0; i < (1 << wclen); i++) |
| + { |
| + char *p = starts; |
| + int j, k; |
| + |
| + for (j = 0; j < wclen; ++j) |
| + { |
| + wchar_t wc = wcbeg[j]; |
| + if (i & (1 << j)) |
| + { |
| + wc = towupper (wc); |
| + if (wc == wcbeg[j]) |
| + continue; |
| + } |
| + k = wctomb (p, wc); |
| + if (k <= 0) |
| + goto fimb_fail; |
| + p += k; |
| + } |
| + if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) |
| + error (2, 0, err); |
| + } |
| + if (wclim < wcpattern + wcsize) |
| + ++wclim; |
| + wcbeg = wclim; |
| + } |
| + while (wcbeg < wcpattern + wcsize); |
| + f_i_multibyte = 1; |
| + kwset = fimb_kwset; |
| + free (starts); |
| + Fimb.match = xmalloc (Fimb.count); |
| + if ((err = kwsprep (kwset)) != 0) |
| + error (2, 0, err); |
| + return; |
| + } |
| + } |
| +#endif /* MBS_SUPPORT */ |
| + |
| + |
| kwsinit (); |
| beg = pattern; |
| do |
| @@ -499,6 +815,76 @@ Fcompile (char const *pattern, size_t size) |
| error (2, 0, err); |
| } |
| |
| +#ifdef MBS_SUPPORT |
| +static int |
| +Fimbexec (const char *buf, size_t size, size_t *plen, int exact) |
| +{ |
| + size_t len, letter, i; |
| + int ret = -1; |
| + mbstate_t mbs; |
| + wchar_t wc; |
| + int patterns_left; |
| + |
| + assert (match_icase && f_i_multibyte == 1); |
| + assert (MB_CUR_MAX > 1); |
| + |
| + memset (&mbs, '\0', sizeof (mbs)); |
| + memset (Fimb.match, '\1', Fimb.count); |
| + letter = len = 0; |
| + patterns_left = 1; |
| + while (patterns_left && len <= size) |
| + { |
| + size_t c; |
| + |
| + patterns_left = 0; |
| + if (len < size) |
| + { |
| + c = mbrtowc (&wc, buf + len, size - len, &mbs); |
| + if (c + 2 <= 2) |
| + return ret; |
| + |
| + wc = towlower (wc); |
| + } |
| + else |
| + { |
| + c = 1; |
| + wc = L'\0'; |
| + } |
| + |
| + for (i = 0; i < Fimb.count; i++) |
| + { |
| + if (Fimb.match[i]) |
| + { |
| + if (Fimb.patterns[i][letter] == L'\0') |
| + { |
| + /* Found a match. */ |
| + *plen = len; |
| + if (!exact && !match_words) |
| + return 0; |
| + else |
| + { |
| + /* For -w or exact look for longest match. */ |
| + ret = 0; |
| + Fimb.match[i] = '\0'; |
| + continue; |
| + } |
| + } |
| + |
| + if (Fimb.patterns[i][letter] == wc) |
| + patterns_left = 1; |
| + else |
| + Fimb.match[i] = '\0'; |
| + } |
| + } |
| + |
| + len += c; |
| + letter++; |
| + } |
| + |
| + return ret; |
| +} |
| +#endif /* MBS_SUPPORT */ |
| + |
| static size_t |
| Fexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| { |
| @@ -506,88 +892,268 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| register size_t len; |
| char eol = eolbyte; |
| struct kwsmatch kwsmatch; |
| + size_t ret_val; |
| #ifdef MBS_SUPPORT |
| - char *mb_properties; |
| - if (MB_CUR_MAX > 1) |
| - mb_properties = check_multibyte_string (buf, size); |
| + int mb_cur_max = MB_CUR_MAX; |
| + mbstate_t mbs; |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + const char *last_char = NULL; |
| #endif /* MBS_SUPPORT */ |
| |
| - for (beg = buf; beg <= buf + size; ++beg) |
| + for (beg = buf; beg < buf + size; ++beg) |
| { |
| - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); |
| + size_t offset; |
| + offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); |
| + |
| if (offset == (size_t) -1) |
| - { |
| + goto failure; |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free(mb_properties); |
| -#endif /* MBS_SUPPORT */ |
| - return offset; |
| + if (mb_cur_max > 1 && !using_utf8) |
| + { |
| + size_t bytes_left = offset; |
| + while (bytes_left) |
| + { |
| + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| + |
| + last_char = beg; |
| + if (mlen == (size_t) -1 || mlen == 0) |
| + { |
| + /* Incomplete character: treat as single-byte. */ |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + beg++; |
| + bytes_left--; |
| + continue; |
| + } |
| + |
| + if (mlen == (size_t) -2) |
| + /* Offset points inside multibyte character: no good. */ |
| + break; |
| + |
| + beg += mlen; |
| + bytes_left -= mlen; |
| + } |
| + |
| + if (bytes_left) |
| + continue; |
| } |
| -#ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) |
| - continue; /* It is a part of multibyte character. */ |
| + else |
| #endif /* MBS_SUPPORT */ |
| beg += offset; |
| - len = kwsmatch.size[0]; |
| - if (exact) |
| - { |
| - *match_size = len; |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free (mb_properties); |
| + /* For f_i_multibyte, the string at beg now matches first 3 chars of |
| + one of the search strings (less if there are shorter search strings). |
| + See if this is a real match. */ |
| + if (f_i_multibyte |
| + && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact)) |
| + goto next_char; |
| #endif /* MBS_SUPPORT */ |
| - return beg - buf; |
| - } |
| + len = kwsmatch.size[0]; |
| + if (exact && !match_words) |
| + goto success_in_beg_and_len; |
| if (match_lines) |
| { |
| if (beg > buf && beg[-1] != eol) |
| - continue; |
| + goto next_char; |
| if (beg + len < buf + size && beg[len] != eol) |
| - continue; |
| + goto next_char; |
| goto success; |
| } |
| else if (match_words) |
| - for (try = beg; len; ) |
| - { |
| - if (try > buf && WCHAR((unsigned char) try[-1])) |
| - break; |
| - if (try + len < buf + size && WCHAR((unsigned char) try[len])) |
| - { |
| - offset = kwsexec (kwset, beg, --len, &kwsmatch); |
| - if (offset == (size_t) -1) |
| - { |
| + { |
| + while (len) |
| + { |
| + int word_match = 0; |
| + if (beg > buf) |
| + { |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free (mb_properties); |
| + if (mb_cur_max > 1) |
| + { |
| + const char *s; |
| + int mr; |
| + wchar_t pwc; |
| + |
| + if (using_utf8) |
| + { |
| + s = beg - 1; |
| + while (s > buf |
| + && (unsigned char) *s >= 0x80 |
| + && (unsigned char) *s <= 0xbf) |
| + --s; |
| + } |
| + else |
| + s = last_char; |
| + mr = mbtowc (&pwc, s, beg - s); |
| + if (mr <= 0) |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + else if ((iswalnum (pwc) || pwc == L'_') |
| + && mr == (int) (beg - s)) |
| + goto next_char; |
| + } |
| + else |
| #endif /* MBS_SUPPORT */ |
| - return offset; |
| - } |
| - try = beg + offset; |
| - len = kwsmatch.size[0]; |
| - } |
| - else |
| - goto success; |
| - } |
| + if (WCHAR ((unsigned char) beg[-1])) |
| + goto next_char; |
| + } |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1) |
| + { |
| + wchar_t nwc; |
| + int mr; |
| + |
| + mr = mbtowc (&nwc, beg + len, buf + size - beg - len); |
| + if (mr <= 0) |
| + { |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + word_match = 1; |
| + } |
| + else if (!iswalnum (nwc) && nwc != L'_') |
| + word_match = 1; |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| + if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) |
| + word_match = 1; |
| + if (word_match) |
| + { |
| + if (!exact) |
| + /* Returns the whole line now that we know there's a word match. */
| + goto success; |
| + else |
| + /* Returns just this word match. */ |
| + goto success_in_beg_and_len; |
| + } |
| + if (len > 0) |
| + { |
| + /* Try a shorter length anchored at the same place. */ |
| + --len; |
| + offset = kwsexec (kwset, beg, len, &kwsmatch); |
| + |
| + if (offset == -1) |
| + goto next_char; /* Try a different anchor. */ |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1 && !using_utf8) |
| + { |
| + size_t bytes_left = offset; |
| + while (bytes_left) |
| + { |
| + size_t mlen = mbrlen (beg, bytes_left, &mbs); |
| + |
| + last_char = beg; |
| + if (mlen == (size_t) -1 || mlen == 0) |
| + { |
| + /* Incomplete character: treat as single-byte. */ |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + beg++; |
| + bytes_left--; |
| + continue; |
| + } |
| + |
| + if (mlen == (size_t) -2) |
| + { |
| + /* Offset points inside multibyte character: |
| + * no good. */ |
| + break; |
| + } |
| + |
| + beg += mlen; |
| + bytes_left -= mlen; |
| + } |
| + |
| + if (bytes_left) |
| + { |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + goto next_char; /* Try a different anchor. */ |
| + } |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| + beg += offset; |
| +#ifdef MBS_SUPPORT |
| + /* The string at beg now matches first 3 chars of one of |
| + the search strings (less if there are shorter search |
| + strings). See if this is a real match. */ |
| + if (f_i_multibyte |
| + && Fimbexec (beg, len - offset, &kwsmatch.size[0], |
| + exact)) |
| + goto next_char; |
| +#endif /* MBS_SUPPORT */ |
| + len = kwsmatch.size[0]; |
| + } |
| + } |
| + } |
| else |
| goto success; |
| - } |
| - |
| +next_char:; |
| #ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free (mb_properties); |
| + /* Advance to the next character. For the MB_CUR_MAX == 1 case this is
| + handled by ++beg above. */
| + if (mb_cur_max > 1) |
| + { |
| + if (using_utf8) |
| + { |
| + unsigned char c = *beg; |
| + if (c >= 0xc2) |
| + { |
| + if (c < 0xe0) |
| + ++beg; |
| + else if (c < 0xf0) |
| + beg += 2; |
| + else if (c < 0xf8) |
| + beg += 3; |
| + else if (c < 0xfc) |
| + beg += 4; |
| + else if (c < 0xfe) |
| + beg += 5; |
| + } |
| + } |
| + else |
| + { |
| + size_t l = mbrlen (beg, buf + size - beg, &mbs); |
| + |
| + last_char = beg; |
| + if (l + 2 >= 2) |
| + beg += l - 1; |
| + else |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + } |
| + } |
| #endif /* MBS_SUPPORT */ |
| + } |
| + |
| + failure: |
| return -1; |
| |
| success: |
| +#ifdef MBS_SUPPORT |
| + if (mb_cur_max > 1 && !using_utf8) |
| + { |
| + end = beg + len; |
| + while (end < buf + size) |
| + { |
| + size_t mlen = mbrlen (end, buf + size - end, &mbs); |
| + if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) |
| + { |
| + memset (&mbs, '\0', sizeof (mbstate_t)); |
| + mlen = 1; |
| + } |
| + if (mlen == 1 && *end == eol) |
| + break; |
| + |
| + end += mlen; |
| + } |
| + } |
| + else |
| +#endif /* MBS_SUPPORT */ |
| end = memchr (beg + len, eol, (buf + size) - (beg + len)); |
| + |
| end++; |
| while (buf < beg && beg[-1] != eol) |
| --beg; |
| - *match_size = end - beg; |
| -#ifdef MBS_SUPPORT |
| - if (MB_CUR_MAX > 1) |
| - free (mb_properties); |
| -#endif /* MBS_SUPPORT */ |
| + len = end - beg; |
| + /* FALLTHROUGH */ |
| + |
| + success_in_beg_and_len: |
| + *match_size = len; |
| return beg - buf; |
| } |
| |
| @@ -701,8 +1267,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int exact) |
| char eol = eolbyte; |
| if (!exact) |
| { |
| - end = memchr (end, eol, buflim - end); |
| - end++; |
| + while (end < buflim) |
| + if (*end++ == eol) |
| + break; |
| while (buf < beg && beg[-1] != eol) |
| --beg; |
| } |
| -- |
| 1.8.4.2 |
| |