blob: 327ee56402275f6b75c5a4bcddf652801e501c33 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001From c884dd12ec062569335702848fc5f29f436c28fa Mon Sep 17 00:00:00 2001
2From: Li xin <lixin.fnst@cn.fujitsu.com>
3Date: Mon, 25 May 2015 10:15:57 +0900
4Subject: [PATCH] grep egrep fgrep: Fix LSB NG cases.
5
6The LSB core test requires that grep, egrep and fgrep
7perform pattern matching without regard to case when
8the -i option is specified.
9
10Upstream-Status: Backport
11
12Signed-off-by: Li Xin <lixin.fnst@cn.fujitsu.com>
13---
14 lib/posix/regex.h | 4 +
15 src/dfa.c | 22 +-
16 src/grep.c | 96 ++++---
17 src/search.c | 833 +++++++++++++++++++++++++++++++++++++++++++++---------
18 4 files changed, 768 insertions(+), 187 deletions(-)
19
20diff --git a/lib/posix/regex.h b/lib/posix/regex.h
21index 63c2fef..7bb2b0e 100644
22--- a/lib/posix/regex.h
23+++ b/lib/posix/regex.h
24@@ -109,6 +109,10 @@ typedef unsigned long int reg_syntax_t;
25 If not set, \{, \}, {, and } are literals. */
26 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
27
28+/* If this bit is set, then ignore case when matching.
29+ If not set, then case is significant. */
30+#define RE_ICASE (RE_INVALID_INTERVAL_ORD << 1)
31+
32 /* If this bit is set, +, ? and | aren't recognized as operators.
33 If not set, they are. */
34 #define RE_LIMITED_OPS (RE_INTERVALS << 1)
35diff --git a/src/dfa.c b/src/dfa.c
36index 590bfa7..27c876a 100644
37--- a/src/dfa.c
38+++ b/src/dfa.c
39@@ -414,7 +414,7 @@ update_mb_len_index (unsigned char const *p, int len)
40
41 /* This function fetch a wide character, and update cur_mb_len,
42 used only if the current locale is a multibyte environment. */
43-static wchar_t
44+static wint_t
45 fetch_wc (char const *eoferr)
46 {
47 wchar_t wc;
48@@ -423,7 +423,7 @@ fetch_wc (char const *eoferr)
49 if (eoferr != 0)
50 dfaerror (eoferr);
51 else
52- return -1;
53+ return WEOF;
54 }
55
56 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
57@@ -459,7 +459,7 @@ fetch_wc (char const *eoferr)
58 static void
59 parse_bracket_exp_mb ()
60 {
61- wchar_t wc, wc1, wc2;
62+ wint_t wc, wc1, wc2;
63
64 /* Work area to build a mb_char_classes. */
65 struct mb_char_classes *work_mbc;
66@@ -496,7 +496,7 @@ parse_bracket_exp_mb ()
67 work_mbc->invert = 0;
68 do
69 {
70- wc1 = -1; /* mark wc1 is not initialized". */
71+ wc1 = WEOF; /* mark wc1 is not initialized". */
72
73 /* Note that if we're looking at some other [:...:] construct,
74 we just treat it as a bunch of ordinary characters. We can do
75@@ -586,7 +586,7 @@ parse_bracket_exp_mb ()
76 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
77 }
78 }
79- wc = -1;
80+ wc1 = wc = WEOF;
81 }
82 else
83 /* We treat '[' as a normal character here. */
84@@ -600,7 +600,7 @@ parse_bracket_exp_mb ()
85 wc = fetch_wc(("Unbalanced ["));
86 }
87
88- if (wc1 == -1)
89+ if (wc1 == WEOF)
90 wc1 = fetch_wc(_("Unbalanced ["));
91
92 if (wc1 == L'-')
93@@ -630,17 +630,17 @@ parse_bracket_exp_mb ()
94 }
95 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
96 range_sts_al, work_mbc->nranges + 1);
97- work_mbc->range_sts[work_mbc->nranges] = wc;
98+ work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
99 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
100 range_ends_al, work_mbc->nranges + 1);
101- work_mbc->range_ends[work_mbc->nranges++] = wc2;
102+ work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
103 }
104- else if (wc != -1)
105+ else if (wc != WEOF)
106 /* build normal characters. */
107 {
108 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
109 work_mbc->nchars + 1);
110- work_mbc->chars[work_mbc->nchars++] = wc;
111+ work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
112 }
113 }
114 while ((wc = wc1) != L']');
115@@ -2552,6 +2552,8 @@ match_mb_charset (struct dfa *d, int s, position pos, int index)
116 }
117
118 /* match with a character? */
119+ if (case_fold)
120+ wc = towlower (wc);
121 for (i = 0; i<work_mbc->nchars; i++)
122 {
123 if (wc == work_mbc->chars[i])
124diff --git a/src/grep.c b/src/grep.c
125index 2fb2fac..3fd4b47 100644
126--- a/src/grep.c
127+++ b/src/grep.c
128@@ -30,6 +30,12 @@
129 # include <sys/time.h>
130 # include <sys/resource.h>
131 #endif
132+#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
133+/* We can handle multibyte string. */
134+# define MBS_SUPPORT
135+# include <wchar.h>
136+# include <wctype.h>
137+#endif
138 #include <stdio.h>
139 #include "system.h"
140 #include "getopt.h"
141@@ -255,19 +261,6 @@ reset (int fd, char const *file, struct stats *stats)
142 bufbeg[-1] = eolbyte;
143 bufdesc = fd;
144
145- if (fstat (fd, &stats->stat) != 0)
146- {
147- error (0, errno, "fstat");
148- return 0;
149- }
150- if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode))
151- return 0;
152-#ifndef DJGPP
153- if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode)))
154-#else
155- if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode)))
156-#endif
157- return 0;
158 if (S_ISREG (stats->stat.st_mode))
159 {
160 if (file)
161@@ -558,33 +551,6 @@ prline (char const *beg, char const *lim, int sep)
162 {
163 size_t match_size;
164 size_t match_offset;
165- if(match_icase)
166- {
167- /* Yuck, this is tricky */
168- char *buf = (char*) xmalloc (lim - beg);
169- char *ibeg = buf;
170- char *ilim = ibeg + (lim - beg);
171- int i;
172- for (i = 0; i < lim - beg; i++)
173- ibeg[i] = tolower (beg[i]);
174- while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1))
175- != (size_t) -1)
176- {
177- char const *b = beg + match_offset;
178- if (b == lim)
179- break;
180- fwrite (beg, sizeof (char), match_offset, stdout);
181- printf ("\33[%sm", grep_color);
182- fwrite (b, sizeof (char), match_size, stdout);
183- fputs ("\33[00m", stdout);
184- beg = b + match_size;
185- ibeg = ibeg + match_offset + match_size;
186- }
187- fwrite (beg, 1, lim - beg, stdout);
188- free (buf);
189- lastout = lim;
190- return;
191- }
192 while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1))
193 != (size_t) -1)
194 {
195@@ -601,6 +567,7 @@ prline (char const *beg, char const *lim, int sep)
196 fputs ("\33[00m", stdout);
197 beg = b + match_size;
198 }
199+ fputs ("\33[K", stdout);
200 }
201 fwrite (beg, 1, lim - beg, stdout);
202 if (ferror (stdout))
203@@ -623,7 +590,7 @@ prpending (char const *lim)
204 size_t match_size;
205 --pending;
206 if (outleft
207- || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1)
208+ || (((*execute) (lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1)
209 == !out_invert))
210 prline (lastout, nl + 1, '-');
211 else
212@@ -895,6 +862,19 @@ grepfile (char const *file, struct stats *stats)
213 }
214 else
215 {
216+ if (stat (file, &stats->stat) != 0)
217+ {
218+ suppressible_error (file, errno);
219+ return 1;
220+ }
221+ if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode))
222+ return 1;
223+#ifndef DJGPP
224+ if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode) || S_ISFIFO(stats->stat.st_mode)))
225+#else
226+ if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode)))
227+#endif
228+ return 1;
229 while ((desc = open (file, O_RDONLY)) < 0 && errno == EINTR)
230 continue;
231
232@@ -1681,9 +1661,6 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"))
233 out_invert ^= 1;
234 match_lines = match_words = 0;
235 }
236- else
237- /* Strip trailing newline. */
238- --keycc;
239 }
240 else
241 if (optind < argc)
242@@ -1697,6 +1674,37 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"))
243 if (!install_matcher (matcher) && !install_matcher ("default"))
244 abort ();
245
246+#ifdef MBS_SUPPORT
247+ if (MB_CUR_MAX != 1 && match_icase)
248+ {
249+ wchar_t wc;
250+ mbstate_t cur_state, prev_state;
251+ int i, len = strlen(keys);
252+
253+ memset(&cur_state, 0, sizeof(mbstate_t));
254+ for (i = 0; i <= len ;)
255+ {
256+ size_t mbclen;
257+ mbclen = mbrtowc(&wc, keys + i, len - i, &cur_state);
258+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
259+ {
260+ /* An invalid sequence, or a truncated multibyte character.
261+ We treat it as a singlebyte character. */
262+ mbclen = 1;
263+ }
264+ else
265+ {
266+ if (iswupper((wint_t)wc))
267+ {
268+ wc = towlower((wint_t)wc);
269+ wcrtomb(keys + i, wc, &cur_state);
270+ }
271+ }
272+ i += mbclen;
273+ }
274+ }
275+#endif /* MBS_SUPPORT */
276+
277 (*compile)(keys, keycc);
278
279 if ((argc - optind > 1 && !no_filenames) || with_filenames)
280diff --git a/src/search.c b/src/search.c
281index 7bd233f..3c6a485 100644
282--- a/src/search.c
283+++ b/src/search.c
284@@ -18,9 +18,13 @@
285
286 /* Written August 1992 by Mike Haertel. */
287
288+#ifndef _GNU_SOURCE
289+# define _GNU_SOURCE 1
290+#endif
291 #ifdef HAVE_CONFIG_H
292 # include <config.h>
293 #endif
294+#include <assert.h>
295 #include <sys/types.h>
296 #if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
297 /* We can handle multibyte string. */
298@@ -31,7 +35,7 @@
299
300 #include "system.h"
301 #include "grep.h"
302-#include "regex.h"
303+#include <regex.h>
304 #include "dfa.h"
305 #include "kwset.h"
306 #include "error.h"
307@@ -39,6 +43,9 @@
308 #ifdef HAVE_LIBPCRE
309 # include <pcre.h>
310 #endif
311+#ifdef HAVE_LANGINFO_CODESET
312+# include <langinfo.h>
313+#endif
314
315 #define NCHAR (UCHAR_MAX + 1)
316
317@@ -70,9 +77,10 @@ static kwset_t kwset;
318 call the regexp matcher at all. */
319 static int kwset_exact_matches;
320
321-#if defined(MBS_SUPPORT)
322-static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
323-#endif
324+/* UTF-8 encoding allows some optimizations that we can't otherwise
325+ assume in a multibyte encoding. */
326+static int using_utf8;
327+
328 static void kwsinit PARAMS ((void));
329 static void kwsmusts PARAMS ((void));
330 static void Gcompile PARAMS ((char const *, size_t));
331@@ -84,6 +92,15 @@ static void Pcompile PARAMS ((char const *, size_t ));
332 static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
333
334 void
335+check_utf8 (void)
336+{
337+#ifdef HAVE_LANGINFO_CODESET
338+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
339+ using_utf8 = 1;
340+#endif
341+}
342+
343+void
344 dfaerror (char const *mesg)
345 {
346 error (2, 0, mesg);
347@@ -141,38 +158,6 @@ kwsmusts (void)
348 }
349 }
350
351-#ifdef MBS_SUPPORT
352-/* This function allocate the array which correspond to "buf".
353- Then this check multibyte string and mark on the positions which
354- are not singlebyte character nor the first byte of a multibyte
355- character. Caller must free the array. */
356-static char*
357-check_multibyte_string(char const *buf, size_t size)
358-{
359- char *mb_properties = malloc(size);
360- mbstate_t cur_state;
361- int i;
362- memset(&cur_state, 0, sizeof(mbstate_t));
363- memset(mb_properties, 0, sizeof(char)*size);
364- for (i = 0; i < size ;)
365- {
366- size_t mbclen;
367- mbclen = mbrlen(buf + i, size - i, &cur_state);
368-
369- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
370- {
371- /* An invalid sequence, or a truncated multibyte character.
372- We treat it as a singlebyte character. */
373- mbclen = 1;
374- }
375- mb_properties[i] = mbclen;
376- i += mbclen;
377- }
378-
379- return mb_properties;
380-}
381-#endif
382-
383 static void
384 Gcompile (char const *pattern, size_t size)
385 {
386@@ -181,7 +166,8 @@ Gcompile (char const *pattern, size_t size)
387 size_t total = size;
388 char const *motif = pattern;
389
390- re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
391+ check_utf8 ();
392+ re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? RE_ICASE : 0));
393 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
394
395 /* For GNU regex compiler we have to pass the patterns separately to detect
396@@ -218,6 +204,10 @@ Gcompile (char const *pattern, size_t size)
397 motif = sep;
398 } while (sep && total != 0);
399
400+ /* Strip trailing newline. */
401+ if (size && pattern[size - 1] == '\n')
402+ size--;
403+
404 /* In the match_words and match_lines cases, we use a different pattern
405 for the DFA matcher that will quickly throw out cases that won't work.
406 Then if DFA succeeds we do some hairy stuff using the regex matcher
407@@ -233,7 +223,7 @@ Gcompile (char const *pattern, size_t size)
408 static char const line_end[] = "\\)$";
409 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
410 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
411- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
412+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
413 size_t i;
414 strcpy (n, match_lines ? line_beg : word_beg);
415 i = strlen (n);
416@@ -257,14 +247,15 @@ Ecompile (char const *pattern, size_t size)
417 size_t total = size;
418 char const *motif = pattern;
419
420+ check_utf8 ();
421 if (strcmp (matcher, "awk") == 0)
422 {
423- re_set_syntax (RE_SYNTAX_AWK);
424+ re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0));
425 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
426 }
427 else
428 {
429- re_set_syntax (RE_SYNTAX_POSIX_EGREP);
430+ re_set_syntax (RE_SYNTAX_POSIX_EGREP | (match_icase ? RE_ICASE : 0));
431 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
432 }
433
434@@ -301,6 +292,10 @@ Ecompile (char const *pattern, size_t size)
435 motif = sep;
436 } while (sep && total != 0);
437
438+ /* Strip trailing newline. */
439+ if (size && pattern[size - 1] == '\n')
440+ size--;
441+
442 /* In the match_words and match_lines cases, we use a different pattern
443 for the DFA matcher that will quickly throw out cases that won't work.
444 Then if DFA succeeds we do some hairy stuff using the regex matcher
445@@ -316,7 +311,7 @@ Ecompile (char const *pattern, size_t size)
446 static char const line_end[] = ")$";
447 static char const word_beg[] = "(^|[^[:alnum:]_])(";
448 static char const word_end[] = ")([^[:alnum:]_]|$)";
449- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
450+ char *n = xmalloc (sizeof word_beg - 1 + size + sizeof word_end);
451 size_t i;
452 strcpy (n, match_lines ? line_beg : word_beg);
453 i = strlen(n);
454@@ -339,15 +334,34 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
455 char eol = eolbyte;
456 int backref, start, len;
457 struct kwsmatch kwsm;
458- size_t i;
459+ size_t i, ret_val;
460+ static int use_dfa;
461+ static int use_dfa_checked = 0;
462 #ifdef MBS_SUPPORT
463- char *mb_properties = NULL;
464+ int mb_cur_max = MB_CUR_MAX;
465+ mbstate_t mbs;
466+ memset (&mbs, '\0', sizeof (mbstate_t));
467 #endif /* MBS_SUPPORT */
468
469+ if (!use_dfa_checked)
470+ {
471+ char *grep_use_dfa = getenv ("GREP_USE_DFA");
472+ if (!grep_use_dfa)
473+ {
474 #ifdef MBS_SUPPORT
475- if (MB_CUR_MAX > 1 && kwset)
476- mb_properties = check_multibyte_string(buf, size);
477+ /* Turn off DFA when processing multibyte input. */
478+ use_dfa = (MB_CUR_MAX == 1);
479+#else
480+ use_dfa = 1;
481 #endif /* MBS_SUPPORT */
482+ }
483+ else
484+ {
485+ use_dfa = atoi (grep_use_dfa);
486+ }
487+
488+ use_dfa_checked = 1;
489+ }
490
491 buflim = buf + size;
492
493@@ -358,47 +372,120 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
494 if (kwset)
495 {
496 /* Find a possible match using the KWset matcher. */
497- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
498+#ifdef MBS_SUPPORT
499+ size_t bytes_left = 0;
500+#endif /* MBS_SUPPORT */
501+ size_t offset;
502+#ifdef MBS_SUPPORT
503+ /* kwsexec doesn't work with match_icase and multibyte input. */
504+ if (match_icase && mb_cur_max > 1)
505+ /* Avoid kwset */
506+ offset = 0;
507+ else
508+#endif /* MBS_SUPPORT */
509+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
510 if (offset == (size_t) -1)
511- {
512+ goto failure;
513 #ifdef MBS_SUPPORT
514- if (MB_CUR_MAX > 1)
515- free(mb_properties);
516-#endif
517- return (size_t)-1;
518+ if (mb_cur_max > 1 && !using_utf8)
519+ {
520+ bytes_left = offset;
521+ while (bytes_left)
522+ {
523+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
524+ if (mlen == (size_t) -1 || mlen == 0)
525+ {
526+ /* Incomplete character: treat as single-byte. */
527+ memset (&mbs, '\0', sizeof (mbstate_t));
528+ beg++;
529+ bytes_left--;
530+ continue;
531+ }
532+
533+ if (mlen == (size_t) -2)
534+ /* Offset points inside multibyte character:
535+ * no good. */
536+ break;
537+
538+ beg += mlen;
539+ bytes_left -= mlen;
540+ }
541 }
542+ else
543+#endif /* MBS_SUPPORT */
544 beg += offset;
545 /* Narrow down to the line containing the candidate, and
546 run it through DFA. */
547 end = memchr(beg, eol, buflim - beg);
548 end++;
549 #ifdef MBS_SUPPORT
550- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
551+ if (mb_cur_max > 1 && bytes_left)
552 continue;
553-#endif
554+#endif /* MBS_SUPPORT */
555 while (beg > buf && beg[-1] != eol)
556 --beg;
557- if (kwsm.index < kwset_exact_matches)
558- goto success;
559- if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
560+ if (
561+#ifdef MBS_SUPPORT
562+ !(match_icase && mb_cur_max > 1) &&
563+#endif /* MBS_SUPPORT */
564+ (kwsm.index < kwset_exact_matches))
565+ goto success_in_beg_and_end;
566+ if (use_dfa &&
567+ dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
568 continue;
569 }
570 else
571 {
572 /* No good fixed strings; start with DFA. */
573- size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
574+#ifdef MBS_SUPPORT
575+ size_t bytes_left = 0;
576+#endif /* MBS_SUPPORT */
577+ size_t offset = 0;
578+ if (use_dfa)
579+ offset = dfaexec (&dfa, beg, buflim - beg, &backref);
580 if (offset == (size_t) -1)
581 break;
582 /* Narrow down to the line we've found. */
583+#ifdef MBS_SUPPORT
584+ if (mb_cur_max > 1 && !using_utf8)
585+ {
586+ bytes_left = offset;
587+ while (bytes_left)
588+ {
589+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
590+ if (mlen == (size_t) -1 || mlen == 0)
591+ {
592+ /* Incomplete character: treat as single-byte. */
593+ memset (&mbs, '\0', sizeof (mbstate_t));
594+ beg++;
595+ bytes_left--;
596+ continue;
597+ }
598+
599+ if (mlen == (size_t) -2)
600+ /* Offset points inside multibyte character:
601+ * no good. */
602+ break;
603+
604+ beg += mlen;
605+ bytes_left -= mlen;
606+ }
607+ }
608+ else
609+#endif /* MBS_SUPPORT */
610 beg += offset;
611 end = memchr (beg, eol, buflim - beg);
612 end++;
613+#ifdef MBS_SUPPORT
614+ if (mb_cur_max > 1 && bytes_left)
615+ continue;
616+#endif /* MBS_SUPPORT */
617 while (beg > buf && beg[-1] != eol)
618 --beg;
619 }
620 /* Successful, no backreferences encountered! */
621- if (!backref)
622- goto success;
623+ if (use_dfa && !backref)
624+ goto success_in_beg_and_end;
625 }
626 else
627 end = beg + size;
628@@ -413,14 +500,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
629 end - beg - 1, &(patterns[i].regs))))
630 {
631 len = patterns[i].regs.end[0] - start;
632- if (exact)
633- {
634- *match_size = len;
635- return start;
636- }
637+ if (exact && !match_words)
638+ goto success_in_start_and_len;
639 if ((!match_lines && !match_words)
640 || (match_lines && len == end - beg - 1))
641- goto success;
642+ goto success_in_beg_and_end;
643 /* If -w, check if the match aligns with word boundaries.
644 We do this iteratively because:
645 (a) the line may contain more than one occurence of the
646@@ -431,10 +515,114 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
647 if (match_words)
648 while (start >= 0)
649 {
650- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
651- && (len == end - beg - 1
652- || !WCHAR ((unsigned char) beg[start + len])))
653- goto success;
654+ int lword_match = 0;
655+ if (start == 0)
656+ lword_match = 1;
657+ else
658+ {
659+ assert (start > 0);
660+#ifdef MBS_SUPPORT
661+ if (mb_cur_max > 1)
662+ {
663+ const char *s;
664+ size_t mr;
665+ wchar_t pwc;
666+
667+ /* Locate the start of the multibyte character
668+ before the match position (== beg + start). */
669+ if (using_utf8)
670+ {
671+ /* UTF-8 is a special case: scan backwards
672+ until we find a 7-bit character or a
673+ lead byte. */
674+ s = beg + start - 1;
675+ while (s > buf
676+ && (unsigned char) *s >= 0x80
677+ && (unsigned char) *s <= 0xbf)
678+ --s;
679+ }
680+ else
681+ {
682+ /* Scan forwards to find the start of the
683+ last complete character before the
684+ match position. */
685+ size_t bytes_left = start - 1;
686+ s = beg;
687+ while (bytes_left > 0)
688+ {
689+ mr = mbrlen (s, bytes_left, &mbs);
690+ if (mr == (size_t) -1 || mr == 0)
691+ {
692+ memset (&mbs, '\0', sizeof (mbs));
693+ s++;
694+ bytes_left--;
695+ continue;
696+ }
697+ if (mr == (size_t) -2)
698+ {
699+ memset (&mbs, '\0', sizeof (mbs));
700+ break;
701+ }
702+ s += mr;
703+ bytes_left -= mr;
704+ }
705+ }
706+ mr = mbrtowc (&pwc, s, beg + start - s, &mbs);
707+ if (mr == (size_t) -2 || mr == (size_t) -1 ||
708+ mr == 0)
709+ {
710+ memset (&mbs, '\0', sizeof (mbstate_t));
711+ lword_match = 1;
712+ }
713+ else if (!(iswalnum (pwc) || pwc == L'_')
714+ && mr == beg + start - s)
715+ lword_match = 1;
716+ }
717+ else
718+#endif /* MBS_SUPPORT */
719+ if (!WCHAR ((unsigned char) beg[start - 1]))
720+ lword_match = 1;
721+ }
722+
723+ if (lword_match)
724+ {
725+ int rword_match = 0;
726+ if (start + len == end - beg - 1)
727+ rword_match = 1;
728+ else
729+ {
730+#ifdef MBS_SUPPORT
731+ if (mb_cur_max > 1)
732+ {
733+ wchar_t nwc;
734+ int mr;
735+
736+ mr = mbtowc (&nwc, beg + start + len,
737+ end - beg - start - len - 1);
738+ if (mr <= 0)
739+ {
740+ memset (&mbs, '\0', sizeof (mbstate_t));
741+ rword_match = 1;
742+ }
743+ else if (!iswalnum (nwc) && nwc != L'_')
744+ rword_match = 1;
745+ }
746+ else
747+#endif /* MBS_SUPPORT */
748+ if (!WCHAR ((unsigned char) beg[start + len]))
749+ rword_match = 1;
750+ }
751+
752+ if (rword_match)
753+ {
754+ if (!exact)
755+ /* Returns the whole line. */
756+ goto success_in_beg_and_end;
757+ else
758+ /* Returns just this word match. */
759+ goto success_in_start_and_len;
760+ }
761+ }
762 if (len > 0)
763 {
764 /* Try a shorter length anchored at the same place. */
765@@ -461,26 +649,154 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
766 }
767 } /* for Regex patterns. */
768 } /* for (beg = end ..) */
769-#ifdef MBS_SUPPORT
770- if (MB_CUR_MAX > 1 && mb_properties)
771- free (mb_properties);
772-#endif /* MBS_SUPPORT */
773+
774+ failure:
775 return (size_t) -1;
776
777- success:
778-#ifdef MBS_SUPPORT
779- if (MB_CUR_MAX > 1 && mb_properties)
780- free (mb_properties);
781-#endif /* MBS_SUPPORT */
782- *match_size = end - beg;
783- return beg - buf;
784+ success_in_beg_and_end:
785+ len = end - beg;
786+ start = beg - buf;
787+ /* FALLTHROUGH */
788+
789+ success_in_start_and_len:
790+ *match_size = len;
791+ return start;
792 }
793
794+#ifdef MBS_SUPPORT
795+static int f_i_multibyte; /* whether we're using the new -Fi MB method */
796+static struct
797+{
798+ wchar_t **patterns;
799+ size_t count, maxlen;
800+ unsigned char *match;
801+} Fimb;
802+#endif
803+
804 static void
805 Fcompile (char const *pattern, size_t size)
806 {
807+ int mb_cur_max = MB_CUR_MAX;
808 char const *beg, *lim, *err;
809
810+ check_utf8 ();
811+#ifdef MBS_SUPPORT
812+ /* Support -F -i for UTF-8 input. */
813+ if (match_icase && mb_cur_max > 1)
814+ {
815+ mbstate_t mbs;
816+ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t));
817+ const char *patternend = pattern;
818+ size_t wcsize;
819+ kwset_t fimb_kwset = NULL;
820+ char *starts = NULL;
821+ wchar_t *wcbeg, *wclim;
822+ size_t allocated = 0;
823+
824+ memset (&mbs, '\0', sizeof (mbs));
825+# ifdef __GNU_LIBRARY__
826+ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs);
827+ if (patternend != pattern + size)
828+ wcsize = (size_t) -1;
829+# else
830+ {
831+ char *patterncopy = xmalloc (size + 1);
832+
833+ memcpy (patterncopy, pattern, size);
834+ patterncopy[size] = '\0';
835+ patternend = patterncopy;
836+ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs);
837+ if (patternend != patterncopy + size)
838+ wcsize = (size_t) -1;
839+ free (patterncopy);
840+ }
841+# endif
842+ if (wcsize + 2 <= 2)
843+ {
844+fimb_fail:
845+ free (wcpattern);
846+ free (starts);
847+ if (fimb_kwset)
848+ kwsfree (fimb_kwset);
849+ free (Fimb.patterns);
850+ Fimb.patterns = NULL;
851+ }
852+ else
853+ {
854+ if (!(fimb_kwset = kwsalloc (NULL)))
855+ error (2, 0, _("memory exhausted"));
856+
857+ starts = xmalloc (mb_cur_max * 3);
858+ wcbeg = wcpattern;
859+ do
860+ {
861+ int i;
862+ size_t wclen;
863+
864+ if (Fimb.count >= allocated)
865+ {
866+ if (allocated == 0)
867+ allocated = 128;
868+ else
869+ allocated *= 2;
870+ Fimb.patterns = xrealloc (Fimb.patterns,
871+ sizeof (wchar_t *) * allocated);
872+ }
873+ Fimb.patterns[Fimb.count++] = wcbeg;
874+ for (wclim = wcbeg;
875+ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim)
876+ *wclim = towlower (*wclim);
877+ *wclim = L'\0';
878+ wclen = wclim - wcbeg;
879+ if (wclen > Fimb.maxlen)
880+ Fimb.maxlen = wclen;
881+ if (wclen > 3)
882+ wclen = 3;
883+ if (wclen == 0)
884+ {
885+ if ((err = kwsincr (fimb_kwset, "", 0)) != 0)
886+ error (2, 0, err);
887+ }
888+ else
889+ for (i = 0; i < (1 << wclen); i++)
890+ {
891+ char *p = starts;
892+ int j, k;
893+
894+ for (j = 0; j < wclen; ++j)
895+ {
896+ wchar_t wc = wcbeg[j];
897+ if (i & (1 << j))
898+ {
899+ wc = towupper (wc);
900+ if (wc == wcbeg[j])
901+ continue;
902+ }
903+ k = wctomb (p, wc);
904+ if (k <= 0)
905+ goto fimb_fail;
906+ p += k;
907+ }
908+ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0)
909+ error (2, 0, err);
910+ }
911+ if (wclim < wcpattern + wcsize)
912+ ++wclim;
913+ wcbeg = wclim;
914+ }
915+ while (wcbeg < wcpattern + wcsize);
916+ f_i_multibyte = 1;
917+ kwset = fimb_kwset;
918+ free (starts);
919+ Fimb.match = xmalloc (Fimb.count);
920+ if ((err = kwsprep (kwset)) != 0)
921+ error (2, 0, err);
922+ return;
923+ }
924+ }
925+#endif /* MBS_SUPPORT */
926+
927+
928 kwsinit ();
929 beg = pattern;
930 do
931@@ -499,6 +815,76 @@ Fcompile (char const *pattern, size_t size)
932 error (2, 0, err);
933 }
934
935+#ifdef MBS_SUPPORT
936+static int
937+Fimbexec (const char *buf, size_t size, size_t *plen, int exact)
938+{
939+ size_t len, letter, i;
940+ int ret = -1;
941+ mbstate_t mbs;
942+ wchar_t wc;
943+ int patterns_left;
944+
945+ assert (match_icase && f_i_multibyte == 1);
946+ assert (MB_CUR_MAX > 1);
947+
948+ memset (&mbs, '\0', sizeof (mbs));
949+ memset (Fimb.match, '\1', Fimb.count);
950+ letter = len = 0;
951+ patterns_left = 1;
952+ while (patterns_left && len <= size)
953+ {
954+ size_t c;
955+
956+ patterns_left = 0;
957+ if (len < size)
958+ {
959+ c = mbrtowc (&wc, buf + len, size - len, &mbs);
960+ if (c + 2 <= 2)
961+ return ret;
962+
963+ wc = towlower (wc);
964+ }
965+ else
966+ {
967+ c = 1;
968+ wc = L'\0';
969+ }
970+
971+ for (i = 0; i < Fimb.count; i++)
972+ {
973+ if (Fimb.match[i])
974+ {
975+ if (Fimb.patterns[i][letter] == L'\0')
976+ {
977+ /* Found a match. */
978+ *plen = len;
979+ if (!exact && !match_words)
980+ return 0;
981+ else
982+ {
983+ /* For -w or exact look for longest match. */
984+ ret = 0;
985+ Fimb.match[i] = '\0';
986+ continue;
987+ }
988+ }
989+
990+ if (Fimb.patterns[i][letter] == wc)
991+ patterns_left = 1;
992+ else
993+ Fimb.match[i] = '\0';
994+ }
995+ }
996+
997+ len += c;
998+ letter++;
999+ }
1000+
1001+ return ret;
1002+}
1003+#endif /* MBS_SUPPORT */
1004+
1005 static size_t
1006 Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
1007 {
1008@@ -506,88 +892,268 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
1009 register size_t len;
1010 char eol = eolbyte;
1011 struct kwsmatch kwsmatch;
1012+ size_t ret_val;
1013 #ifdef MBS_SUPPORT
1014- char *mb_properties;
1015- if (MB_CUR_MAX > 1)
1016- mb_properties = check_multibyte_string (buf, size);
1017+ int mb_cur_max = MB_CUR_MAX;
1018+ mbstate_t mbs;
1019+ memset (&mbs, '\0', sizeof (mbstate_t));
1020+ const char *last_char = NULL;
1021 #endif /* MBS_SUPPORT */
1022
1023- for (beg = buf; beg <= buf + size; ++beg)
1024+ for (beg = buf; beg < buf + size; ++beg)
1025 {
1026- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
1027+ size_t offset;
1028+ offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
1029+
1030 if (offset == (size_t) -1)
1031- {
1032+ goto failure;
1033 #ifdef MBS_SUPPORT
1034- if (MB_CUR_MAX > 1)
1035- free(mb_properties);
1036-#endif /* MBS_SUPPORT */
1037- return offset;
1038+ if (mb_cur_max > 1 && !using_utf8)
1039+ {
1040+ size_t bytes_left = offset;
1041+ while (bytes_left)
1042+ {
1043+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
1044+
1045+ last_char = beg;
1046+ if (mlen == (size_t) -1 || mlen == 0)
1047+ {
1048+ /* Incomplete character: treat as single-byte. */
1049+ memset (&mbs, '\0', sizeof (mbstate_t));
1050+ beg++;
1051+ bytes_left--;
1052+ continue;
1053+ }
1054+
1055+ if (mlen == (size_t) -2)
1056+ /* Offset points inside multibyte character: no good. */
1057+ break;
1058+
1059+ beg += mlen;
1060+ bytes_left -= mlen;
1061+ }
1062+
1063+ if (bytes_left)
1064+ continue;
1065 }
1066-#ifdef MBS_SUPPORT
1067- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
1068- continue; /* It is a part of multibyte character. */
1069+ else
1070 #endif /* MBS_SUPPORT */
1071 beg += offset;
1072- len = kwsmatch.size[0];
1073- if (exact)
1074- {
1075- *match_size = len;
1076 #ifdef MBS_SUPPORT
1077- if (MB_CUR_MAX > 1)
1078- free (mb_properties);
1079+ /* For f_i_multibyte, the string at beg now matches first 3 chars of
1080+ one of the search strings (less if there are shorter search strings).
1081+ See if this is a real match. */
1082+ if (f_i_multibyte
1083+ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], exact))
1084+ goto next_char;
1085 #endif /* MBS_SUPPORT */
1086- return beg - buf;
1087- }
1088+ len = kwsmatch.size[0];
1089+ if (exact && !match_words)
1090+ goto success_in_beg_and_len;
1091 if (match_lines)
1092 {
1093 if (beg > buf && beg[-1] != eol)
1094- continue;
1095+ goto next_char;
1096 if (beg + len < buf + size && beg[len] != eol)
1097- continue;
1098+ goto next_char;
1099 goto success;
1100 }
1101 else if (match_words)
1102- for (try = beg; len; )
1103- {
1104- if (try > buf && WCHAR((unsigned char) try[-1]))
1105- break;
1106- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
1107- {
1108- offset = kwsexec (kwset, beg, --len, &kwsmatch);
1109- if (offset == (size_t) -1)
1110- {
1111+ {
1112+ while (len)
1113+ {
1114+ int word_match = 0;
1115+ if (beg > buf)
1116+ {
1117 #ifdef MBS_SUPPORT
1118- if (MB_CUR_MAX > 1)
1119- free (mb_properties);
1120+ if (mb_cur_max > 1)
1121+ {
1122+ const char *s;
1123+ int mr;
1124+ wchar_t pwc;
1125+
1126+ if (using_utf8)
1127+ {
1128+ s = beg - 1;
1129+ while (s > buf
1130+ && (unsigned char) *s >= 0x80
1131+ && (unsigned char) *s <= 0xbf)
1132+ --s;
1133+ }
1134+ else
1135+ s = last_char;
1136+ mr = mbtowc (&pwc, s, beg - s);
1137+ if (mr <= 0)
1138+ memset (&mbs, '\0', sizeof (mbstate_t));
1139+ else if ((iswalnum (pwc) || pwc == L'_')
1140+ && mr == (int) (beg - s))
1141+ goto next_char;
1142+ }
1143+ else
1144 #endif /* MBS_SUPPORT */
1145- return offset;
1146- }
1147- try = beg + offset;
1148- len = kwsmatch.size[0];
1149- }
1150- else
1151- goto success;
1152- }
1153+ if (WCHAR ((unsigned char) beg[-1]))
1154+ goto next_char;
1155+ }
1156+#ifdef MBS_SUPPORT
1157+ if (mb_cur_max > 1)
1158+ {
1159+ wchar_t nwc;
1160+ int mr;
1161+
1162+ mr = mbtowc (&nwc, beg + len, buf + size - beg - len);
1163+ if (mr <= 0)
1164+ {
1165+ memset (&mbs, '\0', sizeof (mbstate_t));
1166+ word_match = 1;
1167+ }
1168+ else if (!iswalnum (nwc) && nwc != L'_')
1169+ word_match = 1;
1170+ }
1171+ else
1172+#endif /* MBS_SUPPORT */
1173+ if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len]))
1174+ word_match = 1;
1175+ if (word_match)
1176+ {
1177+ if (!exact)
1178+ /* Returns the whole line now that we know there's a word match. */
1179+ goto success;
1180+ else
1181+ /* Returns just this word match. */
1182+ goto success_in_beg_and_len;
1183+ }
1184+ if (len > 0)
1185+ {
1186+ /* Try a shorter length anchored at the same place. */
1187+ --len;
1188+ offset = kwsexec (kwset, beg, len, &kwsmatch);
1189+
1190+ if (offset == -1)
1191+ goto next_char; /* Try a different anchor. */
1192+#ifdef MBS_SUPPORT
1193+ if (mb_cur_max > 1 && !using_utf8)
1194+ {
1195+ size_t bytes_left = offset;
1196+ while (bytes_left)
1197+ {
1198+ size_t mlen = mbrlen (beg, bytes_left, &mbs);
1199+
1200+ last_char = beg;
1201+ if (mlen == (size_t) -1 || mlen == 0)
1202+ {
1203+ /* Invalid sequence or null character: treat as single-byte. */
1204+ memset (&mbs, '\0', sizeof (mbstate_t));
1205+ beg++;
1206+ bytes_left--;
1207+ continue;
1208+ }
1209+
1210+ if (mlen == (size_t) -2)
1211+ {
1212+ /* Offset points inside multibyte character:
1213+ * no good. */
1214+ break;
1215+ }
1216+
1217+ beg += mlen;
1218+ bytes_left -= mlen;
1219+ }
1220+
1221+ if (bytes_left)
1222+ {
1223+ memset (&mbs, '\0', sizeof (mbstate_t));
1224+ goto next_char; /* Try a different anchor. */
1225+ }
1226+ }
1227+ else
1228+#endif /* MBS_SUPPORT */
1229+ beg += offset;
1230+#ifdef MBS_SUPPORT
1231+ /* The string at beg now matches first 3 chars of one of
1232+ the search strings (less if there are shorter search
1233+ strings). See if this is a real match. */
1234+ if (f_i_multibyte
1235+ && Fimbexec (beg, len - offset, &kwsmatch.size[0],
1236+ exact))
1237+ goto next_char;
1238+#endif /* MBS_SUPPORT */
1239+ len = kwsmatch.size[0];
1240+ }
1241+ }
1242+ }
1243 else
1244 goto success;
1245- }
1246-
1247+next_char:;
1248 #ifdef MBS_SUPPORT
1249- if (MB_CUR_MAX > 1)
1250- free (mb_properties);
1251+ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled
1252+ by ++beg above. */
1253+ if (mb_cur_max > 1)
1254+ {
1255+ if (using_utf8)
1256+ {
1257+ unsigned char c = *beg;
1258+ if (c >= 0xc2)
1259+ {
1260+ if (c < 0xe0)
1261+ ++beg;
1262+ else if (c < 0xf0)
1263+ beg += 2;
1264+ else if (c < 0xf8)
1265+ beg += 3;
1266+ else if (c < 0xfc)
1267+ beg += 4;
1268+ else if (c < 0xfe)
1269+ beg += 5;
1270+ }
1271+ }
1272+ else
1273+ {
1274+ size_t l = mbrlen (beg, buf + size - beg, &mbs);
1275+
1276+ last_char = beg;
1277+ if (l + 2 >= 2)
1278+ beg += l - 1;
1279+ else
1280+ memset (&mbs, '\0', sizeof (mbstate_t));
1281+ }
1282+ }
1283 #endif /* MBS_SUPPORT */
1284+ }
1285+
1286+ failure:
1287 return -1;
1288
1289 success:
1290+#ifdef MBS_SUPPORT
1291+ if (mb_cur_max > 1 && !using_utf8)
1292+ {
1293+ end = beg + len;
1294+ while (end < buf + size)
1295+ {
1296+ size_t mlen = mbrlen (end, buf + size - end, &mbs);
1297+ if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
1298+ {
1299+ memset (&mbs, '\0', sizeof (mbstate_t));
1300+ mlen = 1;
1301+ }
1302+ if (mlen == 1 && *end == eol)
1303+ break;
1304+
1305+ end += mlen;
1306+ }
1307+ }
1308+ else
1309+#endif /* MBS_SUPPORT */
1310 end = memchr (beg + len, eol, (buf + size) - (beg + len));
1311+
1312 end++;
1313 while (buf < beg && beg[-1] != eol)
1314 --beg;
1315- *match_size = end - beg;
1316-#ifdef MBS_SUPPORT
1317- if (MB_CUR_MAX > 1)
1318- free (mb_properties);
1319-#endif /* MBS_SUPPORT */
1320+ len = end - beg;
1321+ /* FALLTHROUGH */
1322+
1323+ success_in_beg_and_len:
1324+ *match_size = len;
1325 return beg - buf;
1326 }
1327
1328@@ -701,8 +1267,9 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
1329 char eol = eolbyte;
1330 if (!exact)
1331 {
1332- end = memchr (end, eol, buflim - end);
1333- end++;
1334+ while (end < buflim)
1335+ if (*end++ == eol)
1336+ break;
1337 while (buf < beg && beg[-1] != eol)
1338 --beg;
1339 }
1340--
13411.8.4.2
1342