blob: 327ee56402275f6b75c5a4bcddf652801e501c33 [file] [log] [blame]
From c884dd12ec062569335702848fc5f29f436c28fa Mon Sep 17 00:00:00 2001
From: Li xin <lixin.fnst@cn.fujitsu.com>
Date: Mon, 25 May 2015 10:15:57 +0900
Subject: [PATCH] grep egrep fgrep: Fix LSB NG cases.
The LSB core test requires grep egrep and fgrep can
perform pattern matching in searches without regard
to case if -i option is specified.
Upstream-Status: backport.
Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com>
---
lib/posix/regex.h | 4 +
src/dfa.c | 22 +-
src/grep.c | 96 ++++---
src/search.c | 833 +++++++++++++++++++++++++++++++++++++++++++++---------
4 files changed, 768 insertions(+), 187 deletions(-)
diff --git a/lib/posix/regex.h b/lib/posix/regex.h
index 63c2fef..7bb2b0e 100644
--- a/lib/posix/regex.h
+++ b/lib/posix/regex.h
@@ -109,6 +109,10 @@ typedef unsigned long int reg_syntax_t;
If not set, \{, \}, {, and } are literals. */
#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
+/* If this bit is set, then ignore case when matching.
+ If not set, then case is significant. */
+#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
+
/* If this bit is set, +, ? and | aren't recognized as operators.
If not set, they are. */
#define RE_LIMITED_OPS (RE_INTERVALS << 1)
diff --git a/src/dfa.c b/src/dfa.c
index 590bfa7..27c876a 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -414,7 +414,7 @@ update_mb_len_index (unsigned char const *p, int len)
/* This function fetch a wide character, and update cur_mb_len,
used only if the current locale is a multibyte environment. */
-static wchar_t
+static wint_t
fetch_wc (char const *eoferr)
{
wchar_t wc;
@@ -423,7 +423,7 @@ fetch_wc (char const *eoferr)
if (eoferr != 0)
dfaerror (eoferr);
else
- return -1;
+ return WEOF;
}
cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
@@ -459,7 +459,7 @@ fetch_wc (char const *eoferr)
static void
parse_bracket_exp_mb ()
{
- wchar_t wc, wc1, wc2;
+ wint_t wc, wc1, wc2;
/* Work area to build a mb_char_classes. */
struct mb_char_classes *work_mbc;
@@ -496,7 +496,7 @@ parse_bracket_exp_mb ()
work_mbc->invert = 0;
do
{
- wc1 = -1; /* mark wc1 is not initialized". */
+ wc1 = WEOF; /* mark wc1 is not initialized". */
/* Note that if we're looking at some other [:...:] construct,
we just treat it as a bunch of ordinary characters. We can do
@@ -586,7 +586,7 @@ parse_bracket_exp_mb ()
work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
}
}
- wc = -1;
+ wc1 = wc = WEOF;
}
else
/* We treat '[' as a normal character here. */
@@ -600,7 +600,7 @@ parse_bracket_exp_mb ()
wc = fetch_wc(("Unbalanced ["));
}
- if (wc1 == -1)
+ if (wc1 == WEOF)
wc1 = fetch_wc(_("Unbalanced ["));
if (wc1 == L'-')
@@ -630,17 +630,17 @@ parse_bracket_exp_mb ()
}
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = wc;
+ work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = wc2;
+ work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
}
- else if (wc != -1)
+ else if (wc != WEOF)
/* build normal characters. */
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = wc;
+ work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
}
}
while ((wc = wc1) != L']');
@@ -2552,6 +2552,8 @@ match_mb_charset (struct dfa *d, int s, position pos, int index)
}
/* match with a character? */
+ if (case_fold)
+ wc = towlower (wc);
for (i = 0; i<work_mbc->nchars; i++)
{
if (wc == work_mbc->chars[i])
diff --git a/src/grep.c b/src/grep.c
index 2fb2fac..3fd4b47 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -30,6 +30,12 @@
# include <sys/time.h>
# include <sys/resource.h>
#endif
+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
+/* We can handle multibyte string. */
+# define MBS_SUPPORT
+# include <wchar.h>
+# include <wctype.h>
+#endif
#include <stdio.h>
#include "system.h"
#include "getopt.h"
@@ -255,19 +261,6 @@ reset (int fd, char const *file, struct stats *stats)
bufbeg[-1] = eolbyte;
bufdesc = fd;
- if (fstat (fd, &stats->stat) != 0)
- {
- error (0, errno, "fstat");
- return 0;
- }
- if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode))
- return 0;
-#ifndef DJGPP
- if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode)))
-#else
- if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode)))
-#endif
- return 0;
if (S_ISREG (stats->stat.st_mode))
{
if (file)
@@ -558,33 +551,6 @@ prline (char const *beg, char const *lim, int sep)
{
size_t match_size;
size_t match_offset;
- if(match_icase)
- {
- /* Yuck, this is tricky */
- char *buf = (char*) xmalloc (lim - beg);
- char *ibeg = buf;
- char *ilim = ibeg + (lim - beg);
- int i;
- for (i = 0; i < lim - beg; i++)
- ibeg[i] = tolower (beg[i]);
- while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1))
- != (size_t) -1)
- {
- char const *b = beg + match_offset;
- if (b == lim)
- break;
- fwrite (beg, sizeof (char), match_offset, stdout);
- printf ("\33[%sm", grep_color);
- fwrite (b, sizeof (char), match_size, stdout);
- fputs ("\33[00m", stdout);
- beg = b + match_size;
- ibeg = ibeg + match_offset + match_size;
- }
- fwrite (beg, 1, lim - beg, stdout);
- free (buf);
- lastout = lim;
- return;
- }
while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1))
!= (size_t) -1)
{
@@ -601,6 +567,7 @@ prline (char const *beg, char const *lim, int sep)
fputs ("\33[00m", stdout);
beg = b + match_size;
}
+ fputs ("\33[K", stdout);
}
fwrite (beg, 1, lim - beg, stdout);
if (ferror (stdout))
@@ -623,7 +590,7 @@ prpending (char const *lim)
size_t match_size;
--pending;
if (outleft
- || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1)
+ || (((*execute) (lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1)
== !out_invert))
prline (lastout, nl + 1, '-');
else
@@ -895,6 +862,19 @@ grepfile (char const *file, struct stats *stats)
}
else
{
+ if (stat (file, &stats->stat) != 0)
+ {
+ suppressible_error (file, errno);
+ return 1;
+ }
+ if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode))
+ return 1;
+#ifndef DJGPP
+ if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode) || S_ISFIFO(stats->stat.st_mode)))
+#else
+ if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode)))
+#endif
+ return 1;
while ((desc = open (file, O_RDONLY)) < 0 && errno == EINTR)
continue;
@@ -1681,9 +1661,6 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"))
out_invert ^= 1;
match_lines = match_words = 0;
}
- else
- /* Strip trailing newline. */
- --keycc;
}
else
if (optind < argc)
@@ -1697,6 +1674,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"))
if (!install_matcher (matcher) && !install_matcher ("default"))
abort ();
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX != 1 && match_icase)
+ {
+ wchar_t wc;
+ mbstate_t cur_state, prev_state;
+ int i, len = strlen(keys);
+
+ memset(&cur_state, 0, sizeof(mbstate_t));
+ for (i = 0; i <= len ;)
+ {
+ size_t mbclen;
+ mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ We treat it as a singlebyte character. */
+ mbclen = 1;
+ }
+ else
+ {
+ if (iswupper((wint_t)wc))
+ {
+ wc = towlower((wint_t)wc);
+ wcrtomb(keys + i, wc, &cur_state);
+ }
+ }
+ i += mbclen;
+ }
+ }
+#endif /* MBS_SUPPORT */
+
(*compile)(keys, keycc);
if ((argc - optind > 1 && !no_filenames) || with_filenames)
diff --git a/src/search.c b/src/search.c
index 7bd233f..3c6a485 100644
--- a/src/search.c
+++ b/src/search.c
@@ -18,9 +18,13 @@
/* Written August 1992 by Mike Haertel. */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
+#include <assert.h>
#include <sys/types.h>
#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
/* We can handle multibyte string. */
@@ -31,7 +35,7 @@
#include "system.h"
#include "grep.h"
-#include "regex.h"
+#include <regex.h>
#include "dfa.h"
#include "kwset.h"
#include "error.h"
@@ -39,6 +43,9 @@
#ifdef HAVE_LIBPCRE
# include <pcre.h>
#endif
+#ifdef HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
#define NCHAR (UCHAR_MAX + 1)
@@ -70,9 +77,10 @@ static kwset_t kwset;
call the regexp matcher at all. */
static int kwset_exact_matches;
-#if defined(MBS_SUPPORT)
-static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
-#endif
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+ assume in a multibyte encoding. */
+static int using_utf8;
+
static void kwsinit PARAMS ((void));
static void kwsmusts PARAMS ((void));
static void Gcompile PARAMS ((char const *, size_t));
@@ -84,6 +92,15 @@ static void Pcompile PARAMS ((char const *, size_t ));
static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+ using_utf8 = 1;
+#endif
+}
+
+void
dfaerror (char const *mesg)
{
error (2, 0, mesg);
@@ -141,38 +158,6 @@ kwsmusts (void)
}
}
-#ifdef MBS_SUPPORT
-/* This function allocate the array which correspond to "buf".
- Then this check multibyte string and mark on the positions which
- are not singlebyte character nor the first byte of a multibyte
- character. Caller must free the array. */
-static char*
-check_multibyte_string(char const *buf, size_t size)
-{
- char *mb_properties = malloc(size);
- mbstate_t cur_state;
- int i;
- memset(&cur_state, 0, sizeof(mbstate_t));
- memset(mb_properties, 0, sizeof(char)*size);
- for (i = 0; i < size ;)
- {
- size_t mbclen;
- mbclen = mbrlen(buf + i, size - i, &cur_state);
-
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multibyte character.
- We treat it as a singlebyte character. */
- mbclen = 1;
- }
- mb_properties[i] = mbclen;
- i += mbclen;
- }
-
- return mb_properties;
-}
-#endif
-
static void
Gcompile (char const *pattern, size_t size)
{
@@ -181,7 +166,8 @@ Gcompile (char const *pattern, size_t size)
size_t total = size;
char const *motif = pattern;
- re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+ check_utf8 ();
+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
/* For GNU regex compiler we have to pass the patterns separately to detect
@@ -218,6 +204,10 @@ Gcompile (char const *pattern, size_t size)
motif = sep;
} while (sep && total != 0);
+ /* Strip trailing newline. */
+ if (size && pattern[size - 1] == '\n')
+ size--;
+
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
Then if DFA succeeds we do some hairy stuff using the regex matcher
@@ -233,7 +223,7 @@ Gcompile (char const *pattern, size_t size)
static char const line_end[] = "\\)$";
static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
size_t i;
strcpy (n, match_lines ? line_beg : word_beg);
i = strlen (n);
@@ -257,14 +247,15 @@ Ecompile (char const *pattern, size_t size)
size_t total = size;
char const *motif = pattern;
+ check_utf8 ();
if (strcmp (matcher, "awk") == 0)
{
- re_set_syntax (RE_SYNTAX_AWK);
+ re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
}
else
{
- re_set_syntax (RE_SYNTAX_POSIX_EGREP);
+ re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
}
@@ -301,6 +292,10 @@ Ecompile (char const *pattern, size_t size)
motif = sep;
} while (sep && total != 0);
+ /* Strip trailing newline. */
+ if (size && pattern[size - 1] == '\n')
+ size--;
+
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
Then if DFA succeeds we do some hairy stuff using the regex matcher
@@ -316,7 +311,7 @@ Ecompile (char const *pattern, size_t size)
static char const line_end[] = ")$";
static char const word_beg[] = "(^|[^[:alnum:]_])(";
static char const word_end[] = ")([^[:alnum:]_]|$)";
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
size_t i;
strcpy (n, match_lines ? line_beg : word_beg);
i = strlen(n);
@@ -339,15 +334,34 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
char eol = eolbyte;
int backref, start, len;
struct kwsmatch kwsm;
- size_t i;
+ size_t i, ret_val;
+ static int use_dfa;
+ static int use_dfa_checked = 0;
#ifdef MBS_SUPPORT
- char *mb_properties = NULL;
+ int mb_cur_max = MB_CUR_MAX;
+ mbstate_t mbs;
+ memset (&mbs, '\0', sizeof (mbstate_t));
#endif /* MBS_SUPPORT */
+ if (!use_dfa_checked)
+ {
+ char *grep_use_dfa = getenv ("GREP_USE_DFA");
+ if (!grep_use_dfa)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && kwset)
- mb_properties = check_multibyte_string(buf, size);
+ /* Turn off DFA when processing multibyte input. */
+ use_dfa = (MB_CUR_MAX == 1);
+#else
+ use_dfa = 1;
#endif /* MBS_SUPPORT */
+ }
+ else
+ {
+ use_dfa = atoi (grep_use_dfa);
+ }
+
+ use_dfa_checked = 1;
+ }
buflim = buf + size;
@@ -358,47 +372,120 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
if (kwset)
{
/* Find a possible match using the KWset matcher. */
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+#ifdef MBS_SUPPORT
+ size_t bytes_left = 0;
+#endif /* MBS_SUPPORT */
+ size_t offset;
+#ifdef MBS_SUPPORT
+ /* kwsexec doesn't work with match_icase and multibyte input. */
+ if (match_icase && mb_cur_max > 1)
+ /* Avoid kwset */
+ offset = 0;
+ else
+#endif /* MBS_SUPPORT */
+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
if (offset == (size_t) -1)
- {
+ goto failure;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
-#endif
- return (size_t)-1;
+ if (mb_cur_max > 1 && !using_utf8)
+ {
+ bytes_left = offset;
+ while (bytes_left)
+ {
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
+ if (mlen == (size_t) -1 || mlen == 0)
+ {
+ /* Incomplete character: treat as single-byte. */
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ beg++;
+ bytes_left--;
+ continue;
+ }
+
+ if (mlen == (size_t) -2)
+ /* Offset points inside multibyte character:
+ * no good. */
+ break;
+
+ beg += mlen;
+ bytes_left -= mlen;
+ }
}
+ else
+#endif /* MBS_SUPPORT */
beg += offset;
/* Narrow down to the line containing the candidate, and
run it through DFA. */
end = memchr(beg, eol, buflim - beg);
end++;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
+ if (mb_cur_max > 1 && bytes_left)
continue;
-#endif
+#endif /* MBS_SUPPORT */
while (beg > buf && beg[-1] != eol)
--beg;
- if (kwsm.index < kwset_exact_matches)
- goto success;
- if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
+ if (
+#ifdef MBS_SUPPORT
+ !(match_icase && mb_cur_max > 1) &&
+#endif /* MBS_SUPPORT */
+ (kwsm.index < kwset_exact_matches))
+ goto success_in_beg_and_end;
+ if (use_dfa &&
+ dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
continue;
}
else
{
/* No good fixed strings; start with DFA. */
- size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
+#ifdef MBS_SUPPORT
+ size_t bytes_left = 0;
+#endif /* MBS_SUPPORT */
+ size_t offset = 0;
+ if (use_dfa)
+ offset = dfaexec (&dfa, beg, buflim - beg, &backref);
if (offset == (size_t) -1)
break;
/* Narrow down to the line we've found. */
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1 && !using_utf8)
+ {
+ bytes_left = offset;
+ while (bytes_left)
+ {
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
+ if (mlen == (size_t) -1 || mlen == 0)
+ {
+ /* Incomplete character: treat as single-byte. */
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ beg++;
+ bytes_left--;
+ continue;
+ }
+
+ if (mlen == (size_t) -2)
+ /* Offset points inside multibyte character:
+ * no good. */
+ break;
+
+ beg += mlen;
+ bytes_left -= mlen;
+ }
+ }
+ else
+#endif /* MBS_SUPPORT */
beg += offset;
end = memchr (beg, eol, buflim - beg);
end++;
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1 && bytes_left)
+ continue;
+#endif /* MBS_SUPPORT */
while (beg > buf && beg[-1] != eol)
--beg;
}
/* Successful, no backreferences encountered! */
- if (!backref)
- goto success;
+ if (use_dfa && !backref)
+ goto success_in_beg_and_end;
}
else
end = beg + size;
@@ -413,14 +500,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
end - beg - 1, &(patterns[i].regs))))
{
len = patterns[i].regs.end[0] - start;
- if (exact)
- {
- *match_size = len;
- return start;
- }
+ if (exact && !match_words)
+ goto success_in_start_and_len;
if ((!match_lines && !match_words)
|| (match_lines && len == end - beg - 1))
- goto success;
+ goto success_in_beg_and_end;
/* If -w, check if the match aligns with word boundaries.
We do this iteratively because:
(a) the line may contain more than one occurence of the
@@ -431,10 +515,114 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
if (match_words)
while (start >= 0)
{
- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
- && (len == end - beg - 1
- || !WCHAR ((unsigned char) beg[start + len])))
- goto success;
+ int lword_match = 0;
+ if (start == 0)
+ lword_match = 1;
+ else
+ {
+ assert (start > 0);
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1)
+ {
+ const char *s;
+ size_t mr;
+ wchar_t pwc;
+
+ /* Locate the start of the multibyte character
+ before the match position (== beg + start). */
+ if (using_utf8)
+ {
+ /* UTF-8 is a special case: scan backwards
+ until we find a 7-bit character or a
+ lead byte. */
+ s = beg + start - 1;
+ while (s > buf
+ && (unsigned char) *s >= 0x80
+ && (unsigned char) *s <= 0xbf)
+ --s;
+ }
+ else
+ {
+ /* Scan forwards to find the start of the
+ last complete character before the
+ match position. */
+ size_t bytes_left = start - 1;
+ s = beg;
+ while (bytes_left > 0)
+ {
+ mr = mbrlen (s, bytes_left, &mbs);
+ if (mr == (size_t) -1 || mr == 0)
+ {
+ memset (&mbs, '\0', sizeof (mbs));
+ s++;
+ bytes_left--;
+ continue;
+ }
+ if (mr == (size_t) -2)
+ {
+ memset (&mbs, '\0', sizeof (mbs));
+ break;
+ }
+ s += mr;
+ bytes_left -= mr;
+ }
+ }
+ mr = mbrtowc (&pwc, s, beg + start - s, &mbs);
+ if (mr == (size_t) -2 || mr == (size_t) -1 ||
+ mr == 0)
+ {
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ lword_match = 1;
+ }
+ else if (!(iswalnum (pwc) || pwc == L'_')
+ && mr == beg + start - s)
+ lword_match = 1;
+ }
+ else
+#endif /* MBS_SUPPORT */
+ if (!WCHAR ((unsigned char) beg[start - 1]))
+ lword_match = 1;
+ }
+
+ if (lword_match)
+ {
+ int rword_match = 0;
+ if (start + len == end - beg - 1)
+ rword_match = 1;
+ else
+ {
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1)
+ {
+ wchar_t nwc;
+ int mr;
+
+ mr = mbtowc (&nwc, beg + start + len,
+ end - beg - start - len - 1);
+ if (mr <= 0)
+ {
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ rword_match = 1;
+ }
+ else if (!iswalnum (nwc) && nwc != L'_')
+ rword_match = 1;
+ }
+ else
+#endif /* MBS_SUPPORT */
+ if (!WCHAR ((unsigned char) beg[start + len]))
+ rword_match = 1;
+ }
+
+ if (rword_match)
+ {
+ if (!exact)
+ /* Returns the whole line. */
+ goto success_in_beg_and_end;
+ else
+ /* Returns just this word match. */
+ goto success_in_start_and_len;
+ }
+ }
if (len > 0)
{
/* Try a shorter length anchored at the same place. */
@@ -461,26 +649,154 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
}
} /* for Regex patterns. */
} /* for (beg = end ..) */
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties)
- free (mb_properties);
-#endif /* MBS_SUPPORT */
+
+ failure:
return (size_t) -1;
- success:
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties)
- free (mb_properties);
-#endif /* MBS_SUPPORT */
- *match_size = end - beg;
- return beg - buf;
+ success_in_beg_and_end:
+ len = end - beg;
+ start = beg - buf;
+ /* FALLTHROUGH */
+
+ success_in_start_and_len:
+ *match_size = len;
+ return start;
}
+#ifdef MBS_SUPPORT
+static int f_i_multibyte; /* whether we're using the new -Fi MB method */
+static struct
+{
+ wchar_t **patterns;
+ size_t count, maxlen;
+ unsigned char *match;
+} Fimb;
+#endif
+
static void
Fcompile (char const *pattern, size_t size)
{
+ int mb_cur_max = MB_CUR_MAX;
char const *beg, *lim, *err;
+ check_utf8 ();
+#ifdef MBS_SUPPORT
+ /* Support -F -i for UTF-8 input. */
+ if (match_icase && mb_cur_max > 1)
+ {
+ mbstate_t mbs;
+ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
+ const char *patternend = pattern;
+ size_t wcsize;
+ kwset_t fimb_kwset = NULL;
+ char *starts = NULL;
+ wchar_t *wcbeg, *wclim;
+ size_t allocated = 0;
+
+ memset (&mbs, '\0', sizeof (mbs));
+# ifdef __GNU_LIBRARY__
+ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
+ if (patternend != pattern + size)
+ wcsize = (size_t) -1;
+# else
+ {
+ char *patterncopy = xmalloc (size + 1);
+
+ memcpy (patterncopy, pattern, size);
+ patterncopy[size] = '\0';
+ patternend = patterncopy;
+ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
+ if (patternend != patterncopy + size)
+ wcsize = (size_t) -1;
+ free (patterncopy);
+ }
+# endif
+ if (wcsize + 2 <= 2)
+ {
+fimb_fail:
+ free (wcpattern);
+ free (starts);
+ if (fimb_kwset)
+ kwsfree (fimb_kwset);
+ free (Fimb.patterns);
+ Fimb.patterns = NULL;
+ }
+ else
+ {
+ if (!(fimb_kwset = kwsalloc (NULL)))
+ error (2, 0, _("memory exhausted"));
+
+ starts = xmalloc (mb_cur_max * 3);
+ wcbeg = wcpattern;
+ do
+ {
+ int i;
+ size_t wclen;
+
+ if (Fimb.count >= allocated)
+ {
+ if (allocated == 0)
+ allocated = 128;
+ else
+ allocated *= 2;
+ Fimb.patterns = xrealloc (Fimb.patterns,
+ sizeof (wchar_t *) * allocated);
+ }
+ Fimb.patterns[Fimb.count++] = wcbeg;
+ for (wclim = wcbeg;
+ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
+ *wclim = towlower (*wclim);
+ *wclim = L'\0';
+ wclen = wclim - wcbeg;
+ if (wclen > Fimb.maxlen)
+ Fimb.maxlen = wclen;
+ if (wclen > 3)
+ wclen = 3;
+ if (wclen == 0)
+ {
+ if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
+ error (2, 0, err);
+ }
+ else
+ for (i = 0; i < (1 << wclen); i++)
+ {
+ char *p = starts;
+ int j, k;
+
+ for (j = 0; j < wclen; ++j)
+ {
+ wchar_t wc = wcbeg[j];
+ if (i & (1 << j))
+ {
+ wc = towupper (wc);
+ if (wc == wcbeg[j])
+ continue;
+ }
+ k = wctomb (p, wc);
+ if (k <= 0)
+ goto fimb_fail;
+ p += k;
+ }
+ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
+ error (2, 0, err);
+ }
+ if (wclim < wcpattern + wcsize)
+ ++wclim;
+ wcbeg = wclim;
+ }
+ while (wcbeg < wcpattern + wcsize);
+ f_i_multibyte = 1;
+ kwset = fimb_kwset;
+ free (starts);
+ Fimb.match = xmalloc (Fimb.count);
+ if ((err = kwsprep (kwset)) != 0)
+ error (2, 0, err);
+ return;
+ }
+ }
+#endif /* MBS_SUPPORT */
+
+
kwsinit ();
beg = pattern;
do
@@ -499,6 +815,76 @@ Fcompile (char const *pattern, size_t size)
error (2, 0, err);
}
+#ifdef MBS_SUPPORT
+static int
+Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
+{
+ size_t len, letter, i;
+ int ret = -1;
+ mbstate_t mbs;
+ wchar_t wc;
+ int patterns_left;
+
+ assert (match_icase && f_i_multibyte == 1);
+ assert (MB_CUR_MAX > 1);
+
+ memset (&mbs, '\0', sizeof (mbs));
+ memset (Fimb.match, '\1', Fimb.count);
+ letter = len = 0;
+ patterns_left = 1;
+ while (patterns_left && len <= size)
+ {
+ size_t c;
+
+ patterns_left = 0;
+ if (len < size)
+ {
+ c = mbrtowc (&wc, buf + len, size - len, &mbs);
+ if (c + 2 <= 2)
+ return ret;
+
+ wc = towlower (wc);
+ }
+ else
+ {
+ c = 1;
+ wc = L'\0';
+ }
+
+ for (i = 0; i < Fimb.count; i++)
+ {
+ if (Fimb.match[i])
+ {
+ if (Fimb.patterns[i][letter] == L'\0')
+ {
+ /* Found a match. */
+ *plen = len;
+ if (!exact && !match_words)
+ return 0;
+ else
+ {
+ /* For -w or exact look for longest match. */
+ ret = 0;
+ Fimb.match[i] = '\0';
+ continue;
+ }
+ }
+
+ if (Fimb.patterns[i][letter] == wc)
+ patterns_left = 1;
+ else
+ Fimb.match[i] = '\0';
+ }
+ }
+
+ len += c;
+ letter++;
+ }
+
+ return ret;
+}
+#endif /* MBS_SUPPORT */
+
static size_t
Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
@@ -506,88 +892,268 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
register size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
+ size_t ret_val;
#ifdef MBS_SUPPORT
- char *mb_properties;
- if (MB_CUR_MAX > 1)
- mb_properties = check_multibyte_string (buf, size);
+ int mb_cur_max = MB_CUR_MAX;
+ mbstate_t mbs;
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ const char *last_char = NULL;
#endif /* MBS_SUPPORT */
- for (beg = buf; beg <= buf + size; ++beg)
+ for (beg = buf; beg < buf + size; ++beg)
{
- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+ size_t offset;
+ offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+
if (offset == (size_t) -1)
- {
+ goto failure;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
-#endif /* MBS_SUPPORT */
- return offset;
+ if (mb_cur_max > 1 && !using_utf8)
+ {
+ size_t bytes_left = offset;
+ while (bytes_left)
+ {
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
+
+ last_char = beg;
+ if (mlen == (size_t) -1 || mlen == 0)
+ {
+ /* Incomplete character: treat as single-byte. */
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ beg++;
+ bytes_left--;
+ continue;
+ }
+
+ if (mlen == (size_t) -2)
+ /* Offset points inside multibyte character: no good. */
+ break;
+
+ beg += mlen;
+ bytes_left -= mlen;
+ }
+
+ if (bytes_left)
+ continue;
}
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
- continue; /* It is a part of multibyte character. */
+ else
#endif /* MBS_SUPPORT */
beg += offset;
- len = kwsmatch.size[0];
- if (exact)
- {
- *match_size = len;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
+ /* For f_i_multibyte, the string at beg now matches first 3 chars of
+ one of the search strings (less if there are shorter search strings).
+ See if this is a real match. */
+ if (f_i_multibyte
+ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
+ goto next_char;
#endif /* MBS_SUPPORT */
- return beg - buf;
- }
+ len = kwsmatch.size[0];
+ if (exact && !match_words)
+ goto success_in_beg_and_len;
if (match_lines)
{
if (beg > buf && beg[-1] != eol)
- continue;
+ goto next_char;
if (beg + len < buf + size && beg[len] != eol)
- continue;
+ goto next_char;
goto success;
}
else if (match_words)
- for (try = beg; len; )
- {
- if (try > buf && WCHAR((unsigned char) try[-1]))
- break;
- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
- {
- offset = kwsexec (kwset, beg, --len, &kwsmatch);
- if (offset == (size_t) -1)
- {
+ {
+ while (len)
+ {
+ int word_match = 0;
+ if (beg > buf)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
+ if (mb_cur_max > 1)
+ {
+ const char *s;
+ int mr;
+ wchar_t pwc;
+
+ if (using_utf8)
+ {
+ s = beg - 1;
+ while (s > buf
+ && (unsigned char) *s >= 0x80
+ && (unsigned char) *s <= 0xbf)
+ --s;
+ }
+ else
+ s = last_char;
+ mr = mbtowc (&pwc, s, beg - s);
+ if (mr <= 0)
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ else if ((iswalnum (pwc) || pwc == L'_')
+ && mr == (int) (beg - s))
+ goto next_char;
+ }
+ else
#endif /* MBS_SUPPORT */
- return offset;
- }
- try = beg + offset;
- len = kwsmatch.size[0];
- }
- else
- goto success;
- }
+ if (WCHAR ((unsigned char) beg[-1]))
+ goto next_char;
+ }
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1)
+ {
+ wchar_t nwc;
+ int mr;
+
+ mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
+ if (mr <= 0)
+ {
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ word_match = 1;
+ }
+ else if (!iswalnum (nwc) && nwc != L'_')
+ word_match = 1;
+ }
+ else
+#endif /* MBS_SUPPORT */
+ if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
+ word_match = 1;
+ if (word_match)
+ {
+ if (!exact)
+ /* Returns the whole line now we know there's a word match. */
+ goto success;
+ else
+ /* Returns just this word match. */
+ goto success_in_beg_and_len;
+ }
+ if (len > 0)
+ {
+ /* Try a shorter length anchored at the same place. */
+ --len;
+ offset = kwsexec (kwset, beg, len, &kwsmatch);
+
+ if (offset == -1)
+ goto next_char; /* Try a different anchor. */
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1 && !using_utf8)
+ {
+ size_t bytes_left = offset;
+ while (bytes_left)
+ {
+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
+
+ last_char = beg;
+ if (mlen == (size_t) -1 || mlen == 0)
+ {
+ /* Incomplete character: treat as single-byte. */
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ beg++;
+ bytes_left--;
+ continue;
+ }
+
+ if (mlen == (size_t) -2)
+ {
+ /* Offset points inside multibyte character:
+ * no good. */
+ break;
+ }
+
+ beg += mlen;
+ bytes_left -= mlen;
+ }
+
+ if (bytes_left)
+ {
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ goto next_char; /* Try a different anchor. */
+ }
+ }
+ else
+#endif /* MBS_SUPPORT */
+ beg += offset;
+#ifdef MBS_SUPPORT
+ /* The string at beg now matches first 3 chars of one of
+ the search strings (less if there are shorter search
+ strings). See if this is a real match. */
+ if (f_i_multibyte
+ && Fimbexec (beg, len - offset, &kwsmatch.size[0],
+ exact))
+ goto next_char;
+#endif /* MBS_SUPPORT */
+ len = kwsmatch.size[0];
+ }
+ }
+ }
else
goto success;
- }
-
+next_char:;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
+ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
+ by ++beg above. */
+ if (mb_cur_max > 1)
+ {
+ if (using_utf8)
+ {
+ unsigned char c = *beg;
+ if (c >= 0xc2)
+ {
+ if (c < 0xe0)
+ ++beg;
+ else if (c < 0xf0)
+ beg += 2;
+ else if (c < 0xf8)
+ beg += 3;
+ else if (c < 0xfc)
+ beg += 4;
+ else if (c < 0xfe)
+ beg += 5;
+ }
+ }
+ else
+ {
+ size_t l = mbrlen (beg, buf + size - beg, &mbs);
+
+ last_char = beg;
+ if (l + 2 >= 2)
+ beg += l - 1;
+ else
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ }
+ }
#endif /* MBS_SUPPORT */
+ }
+
+ failure:
return -1;
success:
+#ifdef MBS_SUPPORT
+ if (mb_cur_max > 1 && !using_utf8)
+ {
+ end = beg + len;
+ while (end < buf + size)
+ {
+ size_t mlen = mbrlen (end, buf + size - end, &mbs);
+ if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+ {
+ memset (&mbs, '\0', sizeof (mbstate_t));
+ mlen = 1;
+ }
+ if (mlen == 1 && *end == eol)
+ break;
+
+ end += mlen;
+ }
+ }
+ else
+#endif /* MBS_SUPPORT */
end = memchr (beg + len, eol, (buf + size) - (beg + len));
+
end++;
while (buf < beg && beg[-1] != eol)
--beg;
- *match_size = end - beg;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
-#endif /* MBS_SUPPORT */
+ len = end - beg;
+ /* FALLTHROUGH */
+
+ success_in_beg_and_len:
+ *match_size = len;
return beg - buf;
}
@@ -701,8 +1267,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
char eol = eolbyte;
if (!exact)
{
- end = memchr (end, eol, buflim - end);
- end++;
+ while (end < buflim)
+ if (*end++ == eol)
+ break;
while (buf < beg && beg[-1] != eol)
--beg;
}
--
1.8.4.2