Ruby  3.1.4p223 (2023-03-30 revision HEAD)
regparse.c
1 /**********************************************************************
2  regparse.c - Onigmo (Oniguruma-mod) (regular expression library)
3 **********************************************************************/
4 /*-
5  * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6  * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include "regparse.h"
32 #include <stdarg.h>
33 
/* Size constant for a warning buffer; its users are outside this view
 * (presumably the warning-formatting helpers -- TODO confirm). */
34 #define WARN_BUFSIZE 256
35 
/* Compile-time switch; the code it enables is not visible in this chunk. */
36 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
37 
38 
/*
 * Syntax description for Ruby-flavoured patterns.
 * The initializer lists, in order: the ONIG_SYN_OP_* operator flags, the
 * ONIG_SYN_OP2_* operator flags, the behaviour flags, the option flags and
 * the meta character table.  (Field names of OnigSyntaxType are declared
 * elsewhere -- NOTE(review): confirm the order against the header.)
 */
39 const OnigSyntaxType OnigSyntaxRuby = {
/* ONIG_SYN_OP_* group: GNU regex operators plus extras, with
 * ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END masked out. */
40  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY |
41  ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 |
42  ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS |
43  ONIG_SYN_OP_ESC_C_CONTROL )
44  & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END )
/* ONIG_SYN_OP2_* group. */
45  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT |
46  ONIG_SYN_OP2_OPTION_RUBY |
47  ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF |
48  ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
49  ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY |
50  ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT |
51  ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
52  ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
53  ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
54  ONIG_SYN_OP2_ESC_H_XDIGIT |
/* ONIG_SYN_OP2_ESC_U_HEX4 is only included when RUBY is not defined. */
55 #ifndef RUBY
56  ONIG_SYN_OP2_ESC_U_HEX4 |
57 #endif
58  ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER |
59  ONIG_SYN_OP2_QMARK_LPAREN_CONDITION |
60  ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK |
61  ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP |
62  ONIG_SYN_OP2_QMARK_TILDE_ABSENT )
/* Behaviour flags (ONIG_SYN_* without an OP prefix), including the warning switches. */
63  , ( SYN_GNU_REGEX_BV |
64  ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
65  ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
66  ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
67  ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
68  ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
69  ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
70  ONIG_SYN_WARN_CC_DUP |
71  ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
/* ONIG_OPTION_* flags. */
72  , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE |
73  ONIG_OPTION_WORD_BOUND_ALL_RANGE )
74  ,
/* Meta character table: only the escape character is set; every other
 * slot is ONIG_INEFFECTIVE_META_CHAR. */
75  {
76  (OnigCodePoint )'\\' /* esc */
77  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
78  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
79  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
80  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
81  , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
82  }
83 };
84 
/* Default syntax pointer, initialised to ONIG_SYNTAX_RUBY (macro defined elsewhere). */
85 const OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
86 
87 extern void onig_null_warn(const char* s ARG_UNUSED) { }
88 
/* Current warning handler: DEFAULT_WARN_FUNCTION when the build defines it,
 * otherwise the no-op onig_null_warn.  Replaced by onig_set_warn_func(). */
89 #ifdef DEFAULT_WARN_FUNCTION
90 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION;
91 #else
92 static OnigWarnFunc onig_warn = onig_null_warn;
93 #endif
94 
/* Current "verbose" warning handler, selected the same way from
 * DEFAULT_VERB_WARN_FUNCTION.  Replaced by onig_set_verb_warn_func(). */
95 #ifdef DEFAULT_VERB_WARN_FUNCTION
96 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION;
97 #else
98 static OnigWarnFunc onig_verb_warn = onig_null_warn;
99 #endif
100 
101 extern void onig_set_warn_func(OnigWarnFunc f)
102 {
103  onig_warn = f;
104 }
105 
106 extern void onig_set_verb_warn_func(OnigWarnFunc f)
107 {
108  onig_verb_warn = f;
109 }
110 
/* Forward declaration: needed because BITSET_SET_BIT_CHKDUP below expands
 * to a call of CC_DUP_WARN before its definition appears. */
111 static void CC_DUP_WARN(ScanEnv *env, OnigCodePoint from, OnigCodePoint to);
112 
113 
/* File-wide parse depth limit; read and written through
 * onig_get_parse_depth_limit() / onig_set_parse_depth_limit(). */
114 static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
115 
116 extern unsigned int
117 onig_get_parse_depth_limit(void)
118 {
119  return ParseDepthLimit;
120 }
121 
122 extern int
123 onig_set_parse_depth_limit(unsigned int depth)
124 {
125  if (depth == 0)
126  ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT;
127  else
128  ParseDepthLimit = depth;
129  return 0;
130 }
131 
132 
133 static void
134 bbuf_free(BBuf* bbuf)
135 {
136  if (IS_NOT_NULL(bbuf)) {
137  if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p);
138  xfree(bbuf);
139  }
140 }
141 
142 static int
143 bbuf_clone(BBuf** rto, BBuf* from)
144 {
145  int r;
146  BBuf *to;
147 
148  *rto = to = (BBuf* )xmalloc(sizeof(BBuf));
149  CHECK_NULL_RETURN_MEMERR(to);
150  r = BBUF_INIT(to, from->alloc);
151  if (r != 0) return r;
152  to->used = from->used;
153  xmemcpy(to->p, from->p, from->used);
154  return 0;
155 }
156 
/* Convert a relative back-reference number into an absolute group number
 * based on the number of groups seen so far (env->num_mem). */
#define BACKREF_REL_TO_ABS(rel_no, env) \
  ((env)->num_mem + 1 + (rel_no))

/* Clear bits f in v when `negative` is non-zero, otherwise set them.
 * The whole expansion is parenthesised so it can be used safely inside a
 * larger expression. */
#define ONOFF(v,f,negative)  ((negative) ? ((v) &= ~(f)) : ((v) |= (f)))
161 
/* First code point of the multibyte range: 0 when the encoding's minimum
 * character length is greater than one byte, otherwise 0x80. */
162 #define MBCODE_START_POS(enc) \
163  (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
164 
/* Add [MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT] to pbuf.
 * NOTE: expands to a reference to a variable named `env` in the caller. */
165 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
166  add_code_range_to_buf(pbuf, env, MBCODE_START_POS(enc), ONIG_LAST_CODE_POINT)
167 
/* For encodings that are not single-byte, add the full multibyte range to
 * mbuf.  Assigns the caller's `r` and returns from the enclosing function
 * when r is non-zero. */
168 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
169  if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
170  r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
171  if (r) return r;\
172  }\
173 } while (0)
174 
175 
/* Set bit `pos` in bs; if it was already set, report it through
 * CC_DUP_WARN first.  Uses a variable named `env` from the caller's scope
 * and evaluates `pos` more than once. */
176 #define BITSET_SET_BIT_CHKDUP(bs, pos) do { \
177  if (BITSET_AT(bs, pos)) CC_DUP_WARN(env, pos, pos); \
178  BS_ROOM(bs, pos) |= BS_BIT(pos); \
179 } while (0)
180 
/* Store 1 in `empty` when every word of bs is zero, otherwise 0.
 * Declares its own loop variable `i` inside the do-block. */
181 #define BITSET_IS_EMPTY(bs,empty) do {\
182  int i;\
183  empty = 1;\
184  for (i = 0; i < BITSET_SIZE; i++) {\
185  if ((bs)[i] != 0) {\
186  empty = 0; break;\
187  }\
188  }\
189 } while (0)
190 
191 static void
192 bitset_set_range(ScanEnv *env, BitSetRef bs, int from, int to)
193 {
194  int i;
195  for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) {
196  BITSET_SET_BIT_CHKDUP(bs, i);
197  }
198 }
199 
#if 0
/* Disabled: would set every bit of the bit set. */
static void
bitset_set_all(BitSetRef bs)
{
  int word = 0;

  while (word < BITSET_SIZE) {
    bs[word] = ~((Bits )0);
    word++;
  }
}
#endif
208 
209 static void
210 bitset_invert(BitSetRef bs)
211 {
212  int i;
213  for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~(bs[i]); }
214 }
215 
216 static void
217 bitset_invert_to(BitSetRef from, BitSetRef to)
218 {
219  int i;
220  for (i = 0; i < BITSET_SIZE; i++) { to[i] = ~(from[i]); }
221 }
222 
223 static void
224 bitset_and(BitSetRef dest, BitSetRef bs)
225 {
226  int i;
227  for (i = 0; i < BITSET_SIZE; i++) { dest[i] &= bs[i]; }
228 }
229 
230 static void
231 bitset_or(BitSetRef dest, BitSetRef bs)
232 {
233  int i;
234  for (i = 0; i < BITSET_SIZE; i++) { dest[i] |= bs[i]; }
235 }
236 
237 static void
238 bitset_copy(BitSetRef dest, BitSetRef bs)
239 {
240  int i;
241  for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; }
242 }
243 
#if defined(USE_NAMED_GROUP) && !defined(USE_ST_LIBRARY)
/*
 * Compare the first n bytes of s1 and s2.
 * Returns 0 when they are equal; otherwise the difference s2[i] - s1[i]
 * at the first mismatch (note the operand order: s2 minus s1).
 */
extern int
onig_strncmp(const UChar* s1, const UChar* s2, int n)
{
  int i;

  for (i = 0; i < n; i++) {
    int diff = s2[i] - s1[i];
    if (diff != 0) return diff;
  }
  return 0;
}
#endif
257 
258 extern void
259 onig_strcpy(UChar* dest, const UChar* src, const UChar* end)
260 {
261  ptrdiff_t len = end - src;
262  if (len > 0) {
263  xmemcpy(dest, src, len);
264  dest[len] = (UChar )0;
265  }
266 }
267 
#ifdef USE_NAMED_GROUP
/*
 * Return a heap copy of [s, end) followed by ONIGENC_MBC_MINLEN(enc) zero
 * bytes as terminator.  Returns NULL (via CHECK_NULL_RETURN) when the
 * allocation fails.  The caller owns the result.
 */
static UChar*
strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
{
  ptrdiff_t body_len = end - s;
  int nul_len = ONIGENC_MBC_MINLEN(enc);
  int k = 0;
  UChar* copy;

  copy = (UChar* )xmalloc(body_len + nul_len);
  CHECK_NULL_RETURN(copy);
  xmemcpy(copy, s, body_len);

  while (k < nul_len) {
    copy[body_len + k] = (UChar )0;
    k++;
  }

  return copy;
}
#endif
289 
290 /* scan pattern methods */
/* All macros below operate on variables named `p`, `end`, `enc` and
 * (where noted) `pfetch_prev` that must exist in the expanding scope. */
/* Value PPEEK yields when the cursor is already at the end. */
291 #define PEND_VALUE 0
292 
/* PFETCH_READY declares pfetch_prev, the cursor position before the most
 * recent PFETCH/PINC; PUNFETCH rewinds to it. */
293 #ifdef __GNUC__
294 /* get rid of Wunused-but-set-variable and Wuninitialized */
295 # define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev
296 #else
297 # define PFETCH_READY UChar* pfetch_prev
298 #endif
/* PEND: 1 when the cursor has reached `end`, otherwise 0. */
299 #define PEND (p < end ? 0 : 1)
300 #define PUNFETCH p = pfetch_prev
/* PINC: remember the current position, then skip one character. */
301 #define PINC do { \
302  pfetch_prev = p; \
303  p += enclen(enc, p, end); \
304 } while (0)
/* PFETCH: read the character at the cursor into c (raw byte when the
 * encoding's max_enc_len is 1), remember the position, then advance. */
305 #define PFETCH(c) do { \
306  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
307  pfetch_prev = p; \
308  p += enclen(enc, p, end); \
309 } while (0)
310 
/* _S variants: same as PINC / PFETCH but without recording pfetch_prev,
 * so PUNFETCH cannot be used after them. */
311 #define PINC_S do { \
312  p += enclen(enc, p, end); \
313 } while (0)
314 #define PFETCH_S(c) do { \
315  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
316  p += enclen(enc, p, end); \
317 } while (0)
318 
/* PPEEK: code point at the cursor without advancing, or PEND_VALUE at end.
 * PPEEK_IS(c): compare that value with c cast to OnigCodePoint. */
319 #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
320 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
321 
322 static UChar*
323 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end,
324  size_t capa)
325 {
326  UChar* r;
327 
328  if (dest)
329  r = (UChar* )xrealloc(dest, capa + 1);
330  else
331  r = (UChar* )xmalloc(capa + 1);
332 
333  CHECK_NULL_RETURN(r);
334  onig_strcpy(r + (dest_end - dest), src, src_end);
335  return r;
336 }
337 
338 /* dest on static area */
339 static UChar*
340 strcat_capa_from_static(UChar* dest, UChar* dest_end,
341  const UChar* src, const UChar* src_end, size_t capa)
342 {
343  UChar* r;
344 
345  r = (UChar* )xmalloc(capa + 1);
346  CHECK_NULL_RETURN(r);
347  onig_strcpy(r, dest, dest_end);
348  onig_strcpy(r + (dest_end - dest), src, src_end);
349  return r;
350 }
351 
352 
353 #ifdef USE_ST_LIBRARY
354 
355 # ifdef RUBY
356 # include "ruby/st.h"
357 # else
358 # include "st.h"
359 # endif
360 
361 typedef struct {
362  const UChar* s;
363  const UChar* end;
365 
366 static int
367 str_end_cmp(st_data_t xp, st_data_t yp)
368 {
369  const st_str_end_key *x, *y;
370  const UChar *p, *q;
371  int c;
372 
373  x = (const st_str_end_key *)xp;
374  y = (const st_str_end_key *)yp;
375  if ((x->end - x->s) != (y->end - y->s))
376  return 1;
377 
378  p = x->s;
379  q = y->s;
380  while (p < x->end) {
381  c = (int )*p - (int )*q;
382  if (c != 0) return c;
383 
384  p++; q++;
385  }
386 
387  return 0;
388 }
389 
390 static st_index_t
391 str_end_hash(st_data_t xp)
392 {
393  const st_str_end_key *x = (const st_str_end_key *)xp;
394  const UChar *p;
395  st_index_t val = 0;
396 
397  p = x->s;
398  while (p < x->end) {
399  val = val * 997 + (int )*p++;
400  }
401 
402  return val + (val >> 5);
403 }
404 
405 extern hash_table_type*
406 onig_st_init_strend_table_with_size(st_index_t size)
407 {
408  static const struct st_hash_type hashType = {
409  str_end_cmp,
410  str_end_hash,
411  };
412 
413  return (hash_table_type* )
414  onig_st_init_table_with_size(&hashType, size);
415 }
416 
417 extern int
418 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key,
419  const UChar* end_key, hash_data_type *value)
420 {
421  st_str_end_key key;
422 
423  key.s = (UChar* )str_key;
424  key.end = (UChar* )end_key;
425 
426  return onig_st_lookup(table, (st_data_t )(&key), value);
427 }
428 
429 extern int
430 onig_st_insert_strend(hash_table_type* table, const UChar* str_key,
431  const UChar* end_key, hash_data_type value)
432 {
433  st_str_end_key* key;
434  int result;
435 
436  key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key));
437  key->s = (UChar* )str_key;
438  key->end = (UChar* )end_key;
439  result = onig_st_insert(table, (st_data_t )key, value);
440  if (result) {
441  xfree(key);
442  }
443  return result;
444 }
445 
446 #endif /* USE_ST_LIBRARY */
447 
448 
449 #ifdef USE_NAMED_GROUP
450 
/* Initial capacity of NameEntry.back_refs when a name gets its second group. */
451 # define INIT_NAME_BACKREFS_ALLOC_NUM 8
452 
/* One group name together with the group numbers defined under it. */
453 typedef struct {
/* heap copy of the name (created with strdup_with_null in name_add) */
454  UChar* name;
455  size_t name_len; /* byte length */
456  int back_num; /* number of backrefs */
/* capacity of back_refs, in ints */
457  int back_alloc;
/* the group number while back_num == 1 */
458  int back_ref1;
/* group numbers once back_num >= 2 (heap array), otherwise NULL */
459  int* back_refs;
460 } NameEntry;
461 
462 # ifdef USE_ST_LIBRARY
463 
/* With USE_ST_LIBRARY the name table is an st hash table. */
464 typedef st_table NameTable;
465 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */
466 
467 # ifdef ONIG_DEBUG
468 static int
469 i_print_name_entry(UChar* key, NameEntry* e, void* arg)
470 {
471  int i;
472  FILE* fp = (FILE* )arg;
473 
474  fprintf(fp, "%s: ", e->name);
475  if (e->back_num == 0)
476  fputs("-", fp);
477  else if (e->back_num == 1)
478  fprintf(fp, "%d", e->back_ref1);
479  else {
480  for (i = 0; i < e->back_num; i++) {
481  if (i > 0) fprintf(fp, ", ");
482  fprintf(fp, "%d", e->back_refs[i]);
483  }
484  }
485  fputs("\n", fp);
486  return ST_CONTINUE;
487 }
488 
489 extern int
490 onig_print_names(FILE* fp, regex_t* reg)
491 {
492  NameTable* t = (NameTable* )reg->name_table;
493 
494  if (IS_NOT_NULL(t)) {
495  fprintf(fp, "name table\n");
496  onig_st_foreach(t, (st_foreach_callback_func *)i_print_name_entry, (HashDataType )fp);
497  fputs("\n", fp);
498  }
499  return 0;
500 }
501 # endif /* ONIG_DEBUG */
502 
503 static int
504 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED)
505 {
506  xfree(e->name);
507  if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
508  xfree(key);
509  xfree(e);
510  return ST_DELETE;
511 }
512 
513 static int
514 names_clear(regex_t* reg)
515 {
516  NameTable* t = (NameTable* )reg->name_table;
517 
518  if (IS_NOT_NULL(t)) {
519  onig_st_foreach(t, (st_foreach_callback_func *)i_free_name_entry, 0);
520  }
521  return 0;
522 }
523 
524 extern int
525 onig_names_free(regex_t* reg)
526 {
527  int r;
528  NameTable* t;
529 
530  r = names_clear(reg);
531  if (r) return r;
532 
533  t = (NameTable* )reg->name_table;
534  if (IS_NOT_NULL(t)) onig_st_free_table(t);
535  reg->name_table = (void* )NULL;
536  return 0;
537 }
538 
539 static NameEntry*
540 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
541 {
542  NameEntry* e;
543  NameTable* t = (NameTable* )reg->name_table;
544 
545  e = (NameEntry* )NULL;
546  if (IS_NOT_NULL(t)) {
547  onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e)));
548  }
549  return e;
550 }
551 
/* Context handed to i_names() while onig_foreach_name() walks the table. */
552 typedef struct {
/* user callback invoked once per name */
553  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*);
554  regex_t* reg;
/* opaque user argument forwarded to func */
555  void* arg;
/* first non-zero result returned by func, else 0 */
556  int ret;
557  OnigEncoding enc;
558 } INamesArg;
559 
560 static int
561 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg)
562 {
563  int r = (*(arg->func))(e->name,
564  e->name + e->name_len,
565  e->back_num,
566  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
567  arg->reg, arg->arg);
568  if (r != 0) {
569  arg->ret = r;
570  return ST_STOP;
571  }
572  return ST_CONTINUE;
573 }
574 
575 extern int
576 onig_foreach_name(regex_t* reg,
577  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
578 {
579  INamesArg narg;
580  NameTable* t = (NameTable* )reg->name_table;
581 
582  narg.ret = 0;
583  if (IS_NOT_NULL(t)) {
584  narg.func = func;
585  narg.reg = reg;
586  narg.arg = arg;
587  narg.enc = reg->enc; /* should be pattern encoding. */
588  onig_st_foreach(t, (st_foreach_callback_func *)i_names, (HashDataType )&narg);
589  }
590  return narg.ret;
591 }
592 
593 static int
594 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map)
595 {
596  int i;
597 
598  if (e->back_num > 1) {
599  for (i = 0; i < e->back_num; i++) {
600  e->back_refs[i] = map[e->back_refs[i]].new_val;
601  }
602  }
603  else if (e->back_num == 1) {
604  e->back_ref1 = map[e->back_ref1].new_val;
605  }
606 
607  return ST_CONTINUE;
608 }
609 
610 extern int
611 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map)
612 {
613  NameTable* t = (NameTable* )reg->name_table;
614 
615  if (IS_NOT_NULL(t)) {
616  onig_st_foreach(t, (st_foreach_callback_func *)i_renumber_name, (HashDataType )map);
617  }
618  return 0;
619 }
620 
621 
622 extern int
623 onig_number_of_names(const regex_t* reg)
624 {
625  NameTable* t = (NameTable* )reg->name_table;
626 
627  if (IS_NOT_NULL(t))
628  return (int )t->num_entries;
629  else
630  return 0;
631 }
632 
633 # else /* USE_ST_LIBRARY */
634 
/* Initial number of NameEntry slots allocated by name_add(). */
635 # define INIT_NAMES_ALLOC_NUM 8
636 
/* Without USE_ST_LIBRARY the name table is a growable array of entries. */
637 typedef struct {
/* entry array (heap) */
638  NameEntry* e;
/* entries in use */
639  int num;
/* allocated slots in e */
640  int alloc;
641 } NameTable;
642 
643 # ifdef ONIG_DEBUG
644 extern int
645 onig_print_names(FILE* fp, regex_t* reg)
646 {
647  int i, j;
648  NameEntry* e;
649  NameTable* t = (NameTable* )reg->name_table;
650 
651  if (IS_NOT_NULL(t) && t->num > 0) {
652  fprintf(fp, "name table\n");
653  for (i = 0; i < t->num; i++) {
654  e = &(t->e[i]);
655  fprintf(fp, "%s: ", e->name);
656  if (e->back_num == 0) {
657  fputs("-", fp);
658  }
659  else if (e->back_num == 1) {
660  fprintf(fp, "%d", e->back_ref1);
661  }
662  else {
663  for (j = 0; j < e->back_num; j++) {
664  if (j > 0) fprintf(fp, ", ");
665  fprintf(fp, "%d", e->back_refs[j]);
666  }
667  }
668  fputs("\n", fp);
669  }
670  fputs("\n", fp);
671  }
672  return 0;
673 }
674 # endif
675 
676 static int
677 names_clear(regex_t* reg)
678 {
679  int i;
680  NameEntry* e;
681  NameTable* t = (NameTable* )reg->name_table;
682 
683  if (IS_NOT_NULL(t)) {
684  for (i = 0; i < t->num; i++) {
685  e = &(t->e[i]);
686  if (IS_NOT_NULL(e->name)) {
687  xfree(e->name);
688  e->name = NULL;
689  e->name_len = 0;
690  e->back_num = 0;
691  e->back_alloc = 0;
692  if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs);
693  e->back_refs = (int* )NULL;
694  }
695  }
696  if (IS_NOT_NULL(t->e)) {
697  xfree(t->e);
698  t->e = NULL;
699  }
700  t->num = 0;
701  }
702  return 0;
703 }
704 
705 extern int
706 onig_names_free(regex_t* reg)
707 {
708  int r;
709  NameTable* t;
710 
711  r = names_clear(reg);
712  if (r) return r;
713 
714  t = (NameTable* )reg->name_table;
715  if (IS_NOT_NULL(t)) xfree(t);
716  reg->name_table = NULL;
717  return 0;
718 }
719 
720 static NameEntry*
721 name_find(regex_t* reg, const UChar* name, const UChar* name_end)
722 {
723  int i, len;
724  NameEntry* e;
725  NameTable* t = (NameTable* )reg->name_table;
726 
727  if (IS_NOT_NULL(t)) {
728  len = name_end - name;
729  for (i = 0; i < t->num; i++) {
730  e = &(t->e[i]);
731  if (len == e->name_len && onig_strncmp(name, e->name, len) == 0)
732  return e;
733  }
734  }
735  return (NameEntry* )NULL;
736 }
737 
738 extern int
739 onig_foreach_name(regex_t* reg,
740  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
741 {
742  int i, r;
743  NameEntry* e;
744  NameTable* t = (NameTable* )reg->name_table;
745 
746  if (IS_NOT_NULL(t)) {
747  for (i = 0; i < t->num; i++) {
748  e = &(t->e[i]);
749  r = (*func)(e->name, e->name + e->name_len, e->back_num,
750  (e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
751  reg, arg);
752  if (r != 0) return r;
753  }
754  }
755  return 0;
756 }
757 
758 extern int
759 onig_number_of_names(const regex_t* reg)
760 {
761  NameTable* t = (NameTable* )reg->name_table;
762 
763  if (IS_NOT_NULL(t))
764  return t->num;
765  else
766  return 0;
767 }
768 
769 # endif /* else USE_ST_LIBRARY */
770 
771 static int
772 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env)
773 {
774  int alloc;
775  NameEntry* e;
776  NameTable* t = (NameTable* )reg->name_table;
777 
778  if (name_end - name <= 0)
779  return ONIGERR_EMPTY_GROUP_NAME;
780 
781  e = name_find(reg, name, name_end);
782  if (IS_NULL(e)) {
783 # ifdef USE_ST_LIBRARY
784  if (IS_NULL(t)) {
785  t = onig_st_init_strend_table_with_size(5);
786  reg->name_table = (void* )t;
787  }
788  e = (NameEntry* )xmalloc(sizeof(NameEntry));
789  CHECK_NULL_RETURN_MEMERR(e);
790 
791  e->name = strdup_with_null(reg->enc, name, name_end);
792  if (IS_NULL(e->name)) {
793  xfree(e);
794  return ONIGERR_MEMORY;
795  }
796  onig_st_insert_strend(t, e->name, (e->name + (name_end - name)),
797  (HashDataType )e);
798 
799  e->name_len = name_end - name;
800  e->back_num = 0;
801  e->back_alloc = 0;
802  e->back_refs = (int* )NULL;
803 
804 # else
805 
806  if (IS_NULL(t)) {
807  alloc = INIT_NAMES_ALLOC_NUM;
808  t = (NameTable* )xmalloc(sizeof(NameTable));
809  CHECK_NULL_RETURN_MEMERR(t);
810  t->e = NULL;
811  t->alloc = 0;
812  t->num = 0;
813 
814  t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc);
815  if (IS_NULL(t->e)) {
816  xfree(t);
817  return ONIGERR_MEMORY;
818  }
819  t->alloc = alloc;
820  reg->name_table = t;
821  goto clear;
822  }
823  else if (t->num == t->alloc) {
824  int i;
825  NameEntry* p;
826 
827  alloc = t->alloc * 2;
828  p = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc);
829  CHECK_NULL_RETURN_MEMERR(p);
830  t->e = p;
831  t->alloc = alloc;
832 
833  clear:
834  for (i = t->num; i < t->alloc; i++) {
835  t->e[i].name = NULL;
836  t->e[i].name_len = 0;
837  t->e[i].back_num = 0;
838  t->e[i].back_alloc = 0;
839  t->e[i].back_refs = (int* )NULL;
840  }
841  }
842  e = &(t->e[t->num]);
843  t->num++;
844  e->name = strdup_with_null(reg->enc, name, name_end);
845  if (IS_NULL(e->name)) return ONIGERR_MEMORY;
846  e->name_len = name_end - name;
847 # endif
848  }
849 
850  if (e->back_num >= 1 &&
851  ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) {
852  onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME,
853  name, name_end);
854  return ONIGERR_MULTIPLEX_DEFINED_NAME;
855  }
856 
857  e->back_num++;
858  if (e->back_num == 1) {
859  e->back_ref1 = backref;
860  }
861  else {
862  if (e->back_num == 2) {
863  alloc = INIT_NAME_BACKREFS_ALLOC_NUM;
864  e->back_refs = (int* )xmalloc(sizeof(int) * alloc);
865  CHECK_NULL_RETURN_MEMERR(e->back_refs);
866  e->back_alloc = alloc;
867  e->back_refs[0] = e->back_ref1;
868  e->back_refs[1] = backref;
869  }
870  else {
871  if (e->back_num > e->back_alloc) {
872  int* p;
873  alloc = e->back_alloc * 2;
874  p = (int* )xrealloc(e->back_refs, sizeof(int) * alloc);
875  CHECK_NULL_RETURN_MEMERR(p);
876  e->back_refs = p;
877  e->back_alloc = alloc;
878  }
879  e->back_refs[e->back_num - 1] = backref;
880  }
881  }
882 
883  return 0;
884 }
885 
886 extern int
887 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
888  const UChar* name_end, int** nums)
889 {
890  NameEntry* e = name_find(reg, name, name_end);
891 
892  if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE;
893 
894  switch (e->back_num) {
895  case 0:
896  *nums = 0;
897  break;
898  case 1:
899  *nums = &(e->back_ref1);
900  break;
901  default:
902  *nums = e->back_refs;
903  break;
904  }
905  return e->back_num;
906 }
907 
908 extern int
909 onig_name_to_backref_number(regex_t* reg, const UChar* name,
910  const UChar* name_end, const OnigRegion *region)
911 {
912  int i, n, *nums;
913 
914  n = onig_name_to_group_numbers(reg, name, name_end, &nums);
915  if (n < 0)
916  return n;
917  else if (n == 0)
918  return ONIGERR_PARSER_BUG;
919  else if (n == 1)
920  return nums[0];
921  else {
922  if (IS_NOT_NULL(region)) {
923  for (i = n - 1; i >= 0; i--) {
924  if (region->beg[nums[i]] != ONIG_REGION_NOTPOS)
925  return nums[i];
926  }
927  }
928  return nums[n - 1];
929  }
930 }
931 
932 #else /* USE_NAMED_GROUP */
933 
934 extern int
935 onig_name_to_group_numbers(regex_t* reg, const UChar* name,
936  const UChar* name_end, int** nums)
937 {
938  return ONIG_NO_SUPPORT_CONFIG;
939 }
940 
941 extern int
942 onig_name_to_backref_number(regex_t* reg, const UChar* name,
943  const UChar* name_end, const OnigRegion* region)
944 {
945  return ONIG_NO_SUPPORT_CONFIG;
946 }
947 
948 extern int
949 onig_foreach_name(regex_t* reg,
950  int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg)
951 {
952  return ONIG_NO_SUPPORT_CONFIG;
953 }
954 
955 extern int
956 onig_number_of_names(const regex_t* reg)
957 {
958  return 0;
959 }
960 #endif /* else USE_NAMED_GROUP */
961 
962 extern int
963 onig_noname_group_capture_is_active(const regex_t* reg)
964 {
965  if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP))
966  return 0;
967 
968 #ifdef USE_NAMED_GROUP
969  if (onig_number_of_names(reg) > 0 &&
970  IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
971  !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
972  return 0;
973  }
974 #endif
975 
976  return 1;
977 }
978 
979 
/* Slot count of the first heap allocation made by scan_env_add_mem_entry()
 * when the static mem_nodes array is outgrown. */
980 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16
981 
982 static void
983 scan_env_clear(ScanEnv* env)
984 {
985  int i;
986 
987  BIT_STATUS_CLEAR(env->capture_history);
988  BIT_STATUS_CLEAR(env->bt_mem_start);
989  BIT_STATUS_CLEAR(env->bt_mem_end);
990  BIT_STATUS_CLEAR(env->backrefed_mem);
991  env->error = (UChar* )NULL;
992  env->error_end = (UChar* )NULL;
993  env->num_call = 0;
994  env->num_mem = 0;
995 #ifdef USE_NAMED_GROUP
996  env->num_named = 0;
997 #endif
998  env->mem_alloc = 0;
999  env->mem_nodes_dynamic = (Node** )NULL;
1000 
1001  for (i = 0; i < SCANENV_MEMNODES_SIZE; i++)
1002  env->mem_nodes_static[i] = NULL_NODE;
1003 
1004 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1005  env->num_comb_exp_check = 0;
1006  env->comb_exp_max_regnum = 0;
1007  env->curr_max_regnum = 0;
1008  env->has_recursion = 0;
1009 #endif
1010  env->parse_depth = 0;
1011  env->warnings_flag = 0;
1012 }
1013 
1014 static int
1015 scan_env_add_mem_entry(ScanEnv* env)
1016 {
1017  int i, need, alloc;
1018  Node** p;
1019 
1020  need = env->num_mem + 1;
1021  if (need > ONIG_MAX_CAPTURE_GROUP_NUM)
1022  return ONIGERR_TOO_MANY_CAPTURE_GROUPS;
1023  if (need >= SCANENV_MEMNODES_SIZE) {
1024  if (env->mem_alloc <= need) {
1025  if (IS_NULL(env->mem_nodes_dynamic)) {
1026  alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE;
1027  p = (Node** )xmalloc(sizeof(Node*) * alloc);
1028  CHECK_NULL_RETURN_MEMERR(p);
1029  xmemcpy(p, env->mem_nodes_static,
1030  sizeof(Node*) * SCANENV_MEMNODES_SIZE);
1031  }
1032  else {
1033  alloc = env->mem_alloc * 2;
1034  p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc);
1035  CHECK_NULL_RETURN_MEMERR(p);
1036  }
1037 
1038  for (i = env->num_mem + 1; i < alloc; i++)
1039  p[i] = NULL_NODE;
1040 
1041  env->mem_nodes_dynamic = p;
1042  env->mem_alloc = alloc;
1043  }
1044  }
1045 
1046  env->num_mem++;
1047  return env->num_mem;
1048 }
1049 
1050 static int
1051 scan_env_set_mem_node(ScanEnv* env, int num, Node* node)
1052 {
1053  if (env->num_mem >= num)
1054  SCANENV_MEM_NODES(env)[num] = node;
1055  else
1056  return ONIGERR_PARSER_BUG;
1057  return 0;
1058 }
1059 
1060 
1061 extern void
1062 onig_node_free(Node* node)
1063 {
1064  start:
1065  if (IS_NULL(node)) return ;
1066 
1067  switch (NTYPE(node)) {
1068  case NT_STR:
1069  if (NSTR(node)->capa != 0 &&
1070  IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1071  xfree(NSTR(node)->s);
1072  }
1073  break;
1074 
1075  case NT_LIST:
1076  case NT_ALT:
1077  onig_node_free(NCAR(node));
1078  {
1079  Node* next_node = NCDR(node);
1080 
1081  xfree(node);
1082  node = next_node;
1083  goto start;
1084  }
1085  break;
1086 
1087  case NT_CCLASS:
1088  {
1089  CClassNode* cc = NCCLASS(node);
1090 
1091  if (cc->mbuf)
1092  bbuf_free(cc->mbuf);
1093  }
1094  break;
1095 
1096  case NT_QTFR:
1097  if (NQTFR(node)->target)
1098  onig_node_free(NQTFR(node)->target);
1099  break;
1100 
1101  case NT_ENCLOSE:
1102  if (NENCLOSE(node)->target)
1103  onig_node_free(NENCLOSE(node)->target);
1104  break;
1105 
1106  case NT_BREF:
1107  if (IS_NOT_NULL(NBREF(node)->back_dynamic))
1108  xfree(NBREF(node)->back_dynamic);
1109  break;
1110 
1111  case NT_ANCHOR:
1112  if (NANCHOR(node)->target)
1113  onig_node_free(NANCHOR(node)->target);
1114  break;
1115  }
1116 
1117  xfree(node);
1118 }
1119 
1120 static Node*
1121 node_new(void)
1122 {
1123  Node* node;
1124 
1125  node = (Node* )xmalloc(sizeof(Node));
1126  /* xmemset(node, 0, sizeof(Node)); */
1127  return node;
1128 }
1129 
1130 static void
1131 initialize_cclass(CClassNode* cc)
1132 {
1133  BITSET_CLEAR(cc->bs);
1134  /* cc->base.flags = 0; */
1135  cc->flags = 0;
1136  cc->mbuf = NULL;
1137 }
1138 
1139 static Node*
1140 node_new_cclass(void)
1141 {
1142  Node* node = node_new();
1143  CHECK_NULL_RETURN(node);
1144 
1145  SET_NTYPE(node, NT_CCLASS);
1146  initialize_cclass(NCCLASS(node));
1147  return node;
1148 }
1149 
1150 static Node*
1151 node_new_ctype(int type, int not, int ascii_range)
1152 {
1153  Node* node = node_new();
1154  CHECK_NULL_RETURN(node);
1155 
1156  SET_NTYPE(node, NT_CTYPE);
1157  NCTYPE(node)->ctype = type;
1158  NCTYPE(node)->not = not;
1159  NCTYPE(node)->ascii_range = ascii_range;
1160  return node;
1161 }
1162 
1163 static Node*
1164 node_new_anychar(void)
1165 {
1166  Node* node = node_new();
1167  CHECK_NULL_RETURN(node);
1168 
1169  SET_NTYPE(node, NT_CANY);
1170  return node;
1171 }
1172 
1173 static Node*
1174 node_new_list(Node* left, Node* right)
1175 {
1176  Node* node = node_new();
1177  CHECK_NULL_RETURN(node);
1178 
1179  SET_NTYPE(node, NT_LIST);
1180  NCAR(node) = left;
1181  NCDR(node) = right;
1182  return node;
1183 }
1184 
1185 extern Node*
1186 onig_node_new_list(Node* left, Node* right)
1187 {
1188  return node_new_list(left, right);
1189 }
1190 
1191 extern Node*
1192 onig_node_list_add(Node* list, Node* x)
1193 {
1194  Node *n;
1195 
1196  n = onig_node_new_list(x, NULL);
1197  if (IS_NULL(n)) return NULL_NODE;
1198 
1199  if (IS_NOT_NULL(list)) {
1200  while (IS_NOT_NULL(NCDR(list)))
1201  list = NCDR(list);
1202 
1203  NCDR(list) = n;
1204  }
1205 
1206  return n;
1207 }
1208 
1209 extern Node*
1210 onig_node_new_alt(Node* left, Node* right)
1211 {
1212  Node* node = node_new();
1213  CHECK_NULL_RETURN(node);
1214 
1215  SET_NTYPE(node, NT_ALT);
1216  NCAR(node) = left;
1217  NCDR(node) = right;
1218  return node;
1219 }
1220 
1221 extern Node*
1222 onig_node_new_anchor(int type)
1223 {
1224  Node* node = node_new();
1225  CHECK_NULL_RETURN(node);
1226 
1227  SET_NTYPE(node, NT_ANCHOR);
1228  NANCHOR(node)->type = type;
1229  NANCHOR(node)->target = NULL;
1230  NANCHOR(node)->char_len = -1;
1231  NANCHOR(node)->ascii_range = 0;
1232  return node;
1233 }
1234 
1235 static Node*
1236 node_new_backref(int back_num, int* backrefs, int by_name,
1237 #ifdef USE_BACKREF_WITH_LEVEL
1238  int exist_level, int nest_level,
1239 #endif
1240  ScanEnv* env)
1241 {
1242  int i;
1243  Node* node = node_new();
1244 
1245  CHECK_NULL_RETURN(node);
1246 
1247  SET_NTYPE(node, NT_BREF);
1248  NBREF(node)->state = 0;
1249  NBREF(node)->back_num = back_num;
1250  NBREF(node)->back_dynamic = (int* )NULL;
1251  if (by_name != 0)
1252  NBREF(node)->state |= NST_NAME_REF;
1253 
1254 #ifdef USE_BACKREF_WITH_LEVEL
1255  if (exist_level != 0) {
1256  NBREF(node)->state |= NST_NEST_LEVEL;
1257  NBREF(node)->nest_level = nest_level;
1258  }
1259 #endif
1260 
1261  for (i = 0; i < back_num; i++) {
1262  if (backrefs[i] <= env->num_mem &&
1263  IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) {
1264  NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */
1265  break;
1266  }
1267  }
1268 
1269  if (back_num <= NODE_BACKREFS_SIZE) {
1270  for (i = 0; i < back_num; i++)
1271  NBREF(node)->back_static[i] = backrefs[i];
1272  }
1273  else {
1274  int* p = (int* )xmalloc(sizeof(int) * back_num);
1275  if (IS_NULL(p)) {
1276  onig_node_free(node);
1277  return NULL;
1278  }
1279  NBREF(node)->back_dynamic = p;
1280  for (i = 0; i < back_num; i++)
1281  p[i] = backrefs[i];
1282  }
1283  return node;
1284 }
1285 
#ifdef USE_SUBEXP_CALL
/* Create a subexpression-call node with no target yet.  The name range is
 * stored as given (not copied); NULL on allocation failure. */
static Node*
node_new_call(UChar* name, UChar* name_end, int gnum)
{
  Node* call = node_new();

  CHECK_NULL_RETURN(call);
  SET_NTYPE(call, NT_CALL);
  NCALL(call)->state     = 0;
  NCALL(call)->target    = NULL_NODE;
  NCALL(call)->name      = name;
  NCALL(call)->name_end  = name_end;
  NCALL(call)->group_num = gnum;  /* call by number if gnum != 0 */
  return call;
}
#endif
1302 
1303 static Node*
1304 node_new_quantifier(int lower, int upper, int by_number)
1305 {
1306  Node* node = node_new();
1307  CHECK_NULL_RETURN(node);
1308 
1309  SET_NTYPE(node, NT_QTFR);
1310  NQTFR(node)->state = 0;
1311  NQTFR(node)->target = NULL;
1312  NQTFR(node)->lower = lower;
1313  NQTFR(node)->upper = upper;
1314  NQTFR(node)->greedy = 1;
1315  NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY;
1316  NQTFR(node)->head_exact = NULL_NODE;
1317  NQTFR(node)->next_head_exact = NULL_NODE;
1318  NQTFR(node)->is_referred = 0;
1319  if (by_number != 0)
1320  NQTFR(node)->state |= NST_BY_NUMBER;
1321 
1322 #ifdef USE_COMBINATION_EXPLOSION_CHECK
1323  NQTFR(node)->comb_exp_check_num = 0;
1324 #endif
1325 
1326  return node;
1327 }
1328 
1329 static Node*
1330 node_new_enclose(int type)
1331 {
1332  Node* node = node_new();
1333  CHECK_NULL_RETURN(node);
1334 
1335  SET_NTYPE(node, NT_ENCLOSE);
1336  NENCLOSE(node)->type = type;
1337  NENCLOSE(node)->state = 0;
1338  NENCLOSE(node)->regnum = 0;
1339  NENCLOSE(node)->option = 0;
1340  NENCLOSE(node)->target = NULL;
1341  NENCLOSE(node)->call_addr = -1;
1342  NENCLOSE(node)->opt_count = 0;
1343  return node;
1344 }
1345 
1346 extern Node*
1347 onig_node_new_enclose(int type)
1348 {
1349  return node_new_enclose(type);
1350 }
1351 
1352 static Node*
1353 node_new_enclose_memory(OnigOptionType option, int is_named)
1354 {
1355  Node* node = node_new_enclose(ENCLOSE_MEMORY);
1356  CHECK_NULL_RETURN(node);
1357  if (is_named != 0)
1358  SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP);
1359 
1360 #ifdef USE_SUBEXP_CALL
1361  NENCLOSE(node)->option = option;
1362 #endif
1363  return node;
1364 }
1365 
1366 static Node*
1367 node_new_option(OnigOptionType option)
1368 {
1369  Node* node = node_new_enclose(ENCLOSE_OPTION);
1370  CHECK_NULL_RETURN(node);
1371  NENCLOSE(node)->option = option;
1372  return node;
1373 }
1374 
1375 extern int
1376 onig_node_str_cat(Node* node, const UChar* s, const UChar* end)
1377 {
1378  ptrdiff_t addlen = end - s;
1379 
1380  if (addlen > 0) {
1381  ptrdiff_t len = NSTR(node)->end - NSTR(node)->s;
1382 
1383  if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) {
1384  UChar* p;
1385  ptrdiff_t capa = len + addlen + NODE_STR_MARGIN;
1386 
1387  if (capa <= NSTR(node)->capa) {
1388  onig_strcpy(NSTR(node)->s + len, s, end);
1389  }
1390  else {
1391  if (NSTR(node)->s == NSTR(node)->buf)
1392  p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end,
1393  s, end, capa);
1394  else
1395  p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa);
1396 
1397  CHECK_NULL_RETURN_MEMERR(p);
1398  NSTR(node)->s = p;
1399  NSTR(node)->capa = (int )capa;
1400  }
1401  }
1402  else {
1403  onig_strcpy(NSTR(node)->s + len, s, end);
1404  }
1405  NSTR(node)->end = NSTR(node)->s + len + addlen;
1406  }
1407 
1408  return 0;
1409 }
1410 
1411 extern int
1412 onig_node_str_set(Node* node, const UChar* s, const UChar* end)
1413 {
1414  onig_node_str_clear(node);
1415  return onig_node_str_cat(node, s, end);
1416 }
1417 
1418 static int
1419 node_str_cat_char(Node* node, UChar c)
1420 {
1421  UChar s[1];
1422 
1423  s[0] = c;
1424  return onig_node_str_cat(node, s, s + 1);
1425 }
1426 
1427 static int
1428 node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c)
1429 {
1430  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
1431  int num = ONIGENC_CODE_TO_MBC(enc, c, buf);
1432  if (num < 0) return num;
1433  return onig_node_str_cat(node, buf, buf + num);
1434 }
1435 
#if 0
/*
 * (Compiled out.)  Turn `node` into an empty NT_STR node carrying
 * `flag`, with s/end pointing at node->buf.
 */
extern void
onig_node_conv_to_str_node(Node* node, int flag)
{
  SET_NTYPE(node, NT_STR);
  NSTR(node)->s    = NSTR(node)->buf;
  NSTR(node)->end  = NSTR(node)->buf;
  NSTR(node)->capa = 0;
  NSTR(node)->flag = flag;
}
#endif
1447 
1448 extern void
1449 onig_node_str_clear(Node* node)
1450 {
1451  if (NSTR(node)->capa != 0 &&
1452  IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) {
1453  xfree(NSTR(node)->s);
1454  }
1455 
1456  NSTR(node)->capa = 0;
1457  NSTR(node)->flag = 0;
1458  NSTR(node)->s = NSTR(node)->buf;
1459  NSTR(node)->end = NSTR(node)->buf;
1460 }
1461 
1462 static Node*
1463 node_new_str(const UChar* s, const UChar* end)
1464 {
1465  Node* node = node_new();
1466  CHECK_NULL_RETURN(node);
1467 
1468  SET_NTYPE(node, NT_STR);
1469  NSTR(node)->capa = 0;
1470  NSTR(node)->flag = 0;
1471  NSTR(node)->s = NSTR(node)->buf;
1472  NSTR(node)->end = NSTR(node)->buf;
1473  if (onig_node_str_cat(node, s, end)) {
1474  onig_node_free(node);
1475  return NULL;
1476  }
1477  return node;
1478 }
1479 
1480 extern Node*
1481 onig_node_new_str(const UChar* s, const UChar* end)
1482 {
1483  return node_new_str(s, end);
1484 }
1485 
1486 static Node*
1487 node_new_str_raw(UChar* s, UChar* end)
1488 {
1489  Node* node = node_new_str(s, end);
1490  if (IS_NOT_NULL(node))
1491  NSTRING_SET_RAW(node);
1492  return node;
1493 }
1494 
1495 static Node*
1496 node_new_empty(void)
1497 {
1498  return node_new_str(NULL, NULL);
1499 }
1500 
1501 static Node*
1502 node_new_str_raw_char(UChar c)
1503 {
1504  UChar p[1];
1505 
1506  p[0] = c;
1507  return node_new_str_raw(p, p + 1);
1508 }
1509 
1510 static Node*
1511 str_node_split_last_char(StrNode* sn, OnigEncoding enc)
1512 {
1513  const UChar *p;
1514  Node* n = NULL_NODE;
1515 
1516  if (sn->end > sn->s) {
1517  p = onigenc_get_prev_char_head(enc, sn->s, sn->end, sn->end);
1518  if (p && p > sn->s) { /* can be split. */
1519  n = node_new_str(p, sn->end);
1520  if (IS_NOT_NULL(n) && (sn->flag & NSTR_RAW) != 0)
1521  NSTRING_SET_RAW(n);
1522  sn->end = (UChar* )p;
1523  }
1524  }
1525  return n;
1526 }
1527 
1528 static int
1529 str_node_can_be_split(StrNode* sn, OnigEncoding enc)
1530 {
1531  if (sn->end > sn->s) {
1532  return ((enclen(enc, sn->s, sn->end) < sn->end - sn->s) ? 1 : 0);
1533  }
1534  return 0;
1535 }
1536 
#ifdef USE_PAD_TO_SHORT_BYTE_CHAR
/*
 * Shift the string `num` bytes to the right and fill the freed head
 * bytes with `val`.
 *
 * The current contents are staged in a NODE_STR_BUF_SIZE byte
 * temporary; no bounds checks are made here.
 * NOTE(review): this assumes the string fits in NODE_STR_BUF_SIZE bytes
 * and that sn->s has room for `num` more bytes -- confirm at callers.
 *
 * Returns 0.  (The function is declared int but previously fell off
 * the end without returning a value.)
 */
static int
node_str_head_pad(StrNode* sn, int num, UChar val)
{
  UChar buf[NODE_STR_BUF_SIZE];
  int i, len;

  len = (int )(sn->end - sn->s);
  onig_strcpy(buf, sn->s, sn->end);
  onig_strcpy(&(sn->s[num]), buf, buf + len);
  sn->end += num;

  for (i = 0; i < num; i++) {
    sn->s[i] = val;
  }

  return 0;
}
#endif
1554 
1555 extern int
1556 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)
1557 {
1558  unsigned int num, val;
1559  OnigCodePoint c;
1560  UChar* p = *src;
1561  PFETCH_READY;
1562 
1563  num = 0;
1564  while (!PEND) {
1565  PFETCH(c);
1566  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
1567  val = (unsigned int )DIGITVAL(c);
1568  if ((INT_MAX_LIMIT - val) / 10UL < num)
1569  return -1; /* overflow */
1570 
1571  num = num * 10 + val;
1572  }
1573  else {
1574  PUNFETCH;
1575  break;
1576  }
1577  }
1578  *src = p;
1579  return num;
1580 }
1581 
1582 static int
1583 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen,
1584  int maxlen, OnigEncoding enc)
1585 {
1586  OnigCodePoint c;
1587  unsigned int num, val;
1588  int restlen;
1589  UChar* p = *src;
1590  PFETCH_READY;
1591 
1592  restlen = maxlen - minlen;
1593  num = 0;
1594  while (!PEND && maxlen-- != 0) {
1595  PFETCH(c);
1596  if (ONIGENC_IS_CODE_XDIGIT(enc, c)) {
1597  val = (unsigned int )XDIGITVAL(enc,c);
1598  if ((INT_MAX_LIMIT - val) / 16UL < num)
1599  return -1; /* overflow */
1600 
1601  num = (num << 4) + XDIGITVAL(enc,c);
1602  }
1603  else {
1604  PUNFETCH;
1605  maxlen++;
1606  break;
1607  }
1608  }
1609  if (maxlen > restlen)
1610  return -2; /* not enough digits */
1611  *src = p;
1612  return num;
1613 }
1614 
1615 static int
1616 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
1617  OnigEncoding enc)
1618 {
1619  OnigCodePoint c;
1620  unsigned int num, val;
1621  UChar* p = *src;
1622  PFETCH_READY;
1623 
1624  num = 0;
1625  while (!PEND && maxlen-- != 0) {
1626  PFETCH(c);
1627  if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') {
1628  val = ODIGITVAL(c);
1629  if ((INT_MAX_LIMIT - val) / 8UL < num)
1630  return -1; /* overflow */
1631 
1632  num = (num << 3) + val;
1633  }
1634  else {
1635  PUNFETCH;
1636  break;
1637  }
1638  }
1639  *src = p;
1640  return num;
1641 }
1642 
1643 
1644 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \
1645  BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT)
1646 
1647 /* data format:
1648  [n][from-1][to-1][from-2][to-2] ... [from-n][to-n]
1649  (all data size is OnigCodePoint)
1650  */
1651 static int
1652 new_code_range(BBuf** pbuf)
1653 {
1654 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5)
1655  int r;
1656  OnigCodePoint n;
1657  BBuf* bbuf;
1658 
1659  bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf));
1660  CHECK_NULL_RETURN_MEMERR(*pbuf);
1661  r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE);
1662  if (r) return r;
1663 
1664  n = 0;
1665  BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1666  return 0;
1667 }
1668 
1669 static int
1670 add_code_range_to_buf0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to,
1671  int checkdup)
1672 {
1673  int r, inc_n, pos;
1674  OnigCodePoint low, high, bound, x;
1675  OnigCodePoint n, *data;
1676  BBuf* bbuf;
1677 
1678  if (from > to) {
1679  n = from; from = to; to = n;
1680  }
1681 
1682  if (IS_NULL(*pbuf)) {
1683  r = new_code_range(pbuf);
1684  if (r) return r;
1685  bbuf = *pbuf;
1686  n = 0;
1687  }
1688  else {
1689  bbuf = *pbuf;
1690  GET_CODE_POINT(n, bbuf->p);
1691  }
1692  data = (OnigCodePoint* )(bbuf->p);
1693  data++;
1694 
1695  bound = (from == 0) ? 0 : n;
1696  for (low = 0; low < bound; ) {
1697  x = (low + bound) >> 1;
1698  if (from - 1 > data[x*2 + 1])
1699  low = x + 1;
1700  else
1701  bound = x;
1702  }
1703 
1704  high = (to == ONIG_LAST_CODE_POINT) ? n : low;
1705  for (bound = n; high < bound; ) {
1706  x = (high + bound) >> 1;
1707  if (to + 1 >= data[x*2])
1708  high = x + 1;
1709  else
1710  bound = x;
1711  }
1712  /* data[(low-1)*2+1] << from <= data[low*2]
1713  * data[(high-1)*2+1] <= to << data[high*2]
1714  */
1715 
1716  inc_n = low + 1 - high;
1717  if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM)
1718  return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES;
1719 
1720  if (inc_n != 1) {
1721  if (checkdup && from <= data[low*2+1]
1722  && (data[low*2] <= from || data[low*2+1] <= to))
1723  CC_DUP_WARN(env, from, to);
1724  if (from > data[low*2])
1725  from = data[low*2];
1726  if (to < data[(high - 1)*2 + 1])
1727  to = data[(high - 1)*2 + 1];
1728  }
1729 
1730  if (inc_n != 0) {
1731  int from_pos = SIZE_CODE_POINT * (1 + high * 2);
1732  int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2);
1733 
1734  if (inc_n > 0) {
1735  if (high < n) {
1736  int size = (n - high) * 2 * SIZE_CODE_POINT;
1737  BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size);
1738  }
1739  }
1740  else {
1741  BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos);
1742  }
1743  }
1744 
1745  pos = SIZE_CODE_POINT * (1 + low * 2);
1746  BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2);
1747  BBUF_WRITE_CODE_POINT(bbuf, pos, from);
1748  BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to);
1749  n += inc_n;
1750  BBUF_WRITE_CODE_POINT(bbuf, 0, n);
1751 
1752  return 0;
1753 }
1754 
1755 static int
1756 add_code_range_to_buf(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1757 {
1758  return add_code_range_to_buf0(pbuf, env, from, to, 1);
1759 }
1760 
1761 static int
1762 add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, int checkdup)
1763 {
1764  if (from > to) {
1765  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
1766  return 0;
1767  else
1768  return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
1769  }
1770 
1771  return add_code_range_to_buf0(pbuf, env, from, to, checkdup);
1772 }
1773 
1774 static int
1775 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
1776 {
1777  return add_code_range0(pbuf, env, from, to, 1);
1778 }
1779 
1780 static int
1781 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf, ScanEnv* env)
1782 {
1783  int r, i, n;
1784  OnigCodePoint pre, from, *data, to = 0;
1785 
1786  *pbuf = (BBuf* )NULL;
1787  if (IS_NULL(bbuf)) {
1788  set_all:
1789  return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1790  }
1791 
1792  data = (OnigCodePoint* )(bbuf->p);
1793  GET_CODE_POINT(n, data);
1794  data++;
1795  if (n <= 0) goto set_all;
1796 
1797  r = 0;
1798  pre = MBCODE_START_POS(enc);
1799  for (i = 0; i < n; i++) {
1800  from = data[i*2];
1801  to = data[i*2+1];
1802  if (pre <= from - 1) {
1803  r = add_code_range_to_buf(pbuf, env, pre, from - 1);
1804  if (r != 0) return r;
1805  }
1806  if (to == ONIG_LAST_CODE_POINT) break;
1807  pre = to + 1;
1808  }
1809  if (to < ONIG_LAST_CODE_POINT) {
1810  r = add_code_range_to_buf(pbuf, env, to + 1, ONIG_LAST_CODE_POINT);
1811  }
1812  return r;
1813 }
1814 
/* Exchange the two (buffer, not-flag) pairs; all four arguments must
   be lvalues. */
#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\
  BBuf *tbuf = bbuf1; \
  int tnot = not1; \
  bbuf1 = bbuf2; bbuf2 = tbuf; \
  not1 = not2; not2 = tnot; \
} while (0)
1821 
1822 static int
1823 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
1824  BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
1825 {
1826  int r;
1827  OnigCodePoint i, n1, *data1;
1828  OnigCodePoint from, to;
1829 
1830  *pbuf = (BBuf* )NULL;
1831  if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
1832  if (not1 != 0 || not2 != 0)
1833  return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1834  return 0;
1835  }
1836 
1837  r = 0;
1838  if (IS_NULL(bbuf2))
1839  SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1840 
1841  if (IS_NULL(bbuf1)) {
1842  if (not1 != 0) {
1843  return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
1844  }
1845  else {
1846  if (not2 == 0) {
1847  return bbuf_clone(pbuf, bbuf2);
1848  }
1849  else {
1850  return not_code_range_buf(enc, bbuf2, pbuf, env);
1851  }
1852  }
1853  }
1854 
1855  if (not1 != 0)
1856  SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1857 
1858  data1 = (OnigCodePoint* )(bbuf1->p);
1859  GET_CODE_POINT(n1, data1);
1860  data1++;
1861 
1862  if (not2 == 0 && not1 == 0) { /* 1 OR 2 */
1863  r = bbuf_clone(pbuf, bbuf2);
1864  }
1865  else if (not1 == 0) { /* 1 OR (not 2) */
1866  r = not_code_range_buf(enc, bbuf2, pbuf, env);
1867  }
1868  if (r != 0) return r;
1869 
1870  for (i = 0; i < n1; i++) {
1871  from = data1[i*2];
1872  to = data1[i*2+1];
1873  r = add_code_range_to_buf(pbuf, env, from, to);
1874  if (r != 0) return r;
1875  }
1876  return 0;
1877 }
1878 
1879 static int
1880 and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1,
1881  OnigCodePoint* data, int n)
1882 {
1883  int i, r;
1884  OnigCodePoint from2, to2;
1885 
1886  for (i = 0; i < n; i++) {
1887  from2 = data[i*2];
1888  to2 = data[i*2+1];
1889  if (from2 < from1) {
1890  if (to2 < from1) continue;
1891  else {
1892  from1 = to2 + 1;
1893  }
1894  }
1895  else if (from2 <= to1) {
1896  if (to2 < to1) {
1897  if (from1 <= from2 - 1) {
1898  r = add_code_range_to_buf(pbuf, env, from1, from2-1);
1899  if (r != 0) return r;
1900  }
1901  from1 = to2 + 1;
1902  }
1903  else {
1904  to1 = from2 - 1;
1905  }
1906  }
1907  else {
1908  from1 = from2;
1909  }
1910  if (from1 > to1) break;
1911  }
1912  if (from1 <= to1) {
1913  r = add_code_range_to_buf(pbuf, env, from1, to1);
1914  if (r != 0) return r;
1915  }
1916  return 0;
1917 }
1918 
1919 static int
1920 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf, ScanEnv* env)
1921 {
1922  int r;
1923  OnigCodePoint i, j, n1, n2, *data1, *data2;
1924  OnigCodePoint from, to, from1, to1, from2, to2;
1925 
1926  *pbuf = (BBuf* )NULL;
1927  if (IS_NULL(bbuf1)) {
1928  if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */
1929  return bbuf_clone(pbuf, bbuf2);
1930  return 0;
1931  }
1932  else if (IS_NULL(bbuf2)) {
1933  if (not2 != 0)
1934  return bbuf_clone(pbuf, bbuf1);
1935  return 0;
1936  }
1937 
1938  if (not1 != 0)
1939  SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2);
1940 
1941  data1 = (OnigCodePoint* )(bbuf1->p);
1942  data2 = (OnigCodePoint* )(bbuf2->p);
1943  GET_CODE_POINT(n1, data1);
1944  GET_CODE_POINT(n2, data2);
1945  data1++;
1946  data2++;
1947 
1948  if (not2 == 0 && not1 == 0) { /* 1 AND 2 */
1949  for (i = 0; i < n1; i++) {
1950  from1 = data1[i*2];
1951  to1 = data1[i*2+1];
1952  for (j = 0; j < n2; j++) {
1953  from2 = data2[j*2];
1954  to2 = data2[j*2+1];
1955  if (from2 > to1) break;
1956  if (to2 < from1) continue;
1957  from = MAX(from1, from2);
1958  to = MIN(to1, to2);
1959  r = add_code_range_to_buf(pbuf, env, from, to);
1960  if (r != 0) return r;
1961  }
1962  }
1963  }
1964  else if (not1 == 0) { /* 1 AND (not 2) */
1965  for (i = 0; i < n1; i++) {
1966  from1 = data1[i*2];
1967  to1 = data1[i*2+1];
1968  r = and_code_range1(pbuf, env, from1, to1, data2, n2);
1969  if (r != 0) return r;
1970  }
1971  }
1972 
1973  return 0;
1974 }
1975 
1976 static int
1977 and_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
1978 {
1979  OnigEncoding enc = env->enc;
1980  int r, not1, not2;
1981  BBuf *buf1, *buf2, *pbuf = 0;
1982  BitSetRef bsr1, bsr2;
1983  BitSet bs1, bs2;
1984 
1985  not1 = IS_NCCLASS_NOT(dest);
1986  bsr1 = dest->bs;
1987  buf1 = dest->mbuf;
1988  not2 = IS_NCCLASS_NOT(cc);
1989  bsr2 = cc->bs;
1990  buf2 = cc->mbuf;
1991 
1992  if (not1 != 0) {
1993  bitset_invert_to(bsr1, bs1);
1994  bsr1 = bs1;
1995  }
1996  if (not2 != 0) {
1997  bitset_invert_to(bsr2, bs2);
1998  bsr2 = bs2;
1999  }
2000  bitset_and(bsr1, bsr2);
2001  if (bsr1 != dest->bs) {
2002  bitset_copy(dest->bs, bsr1);
2003  bsr1 = dest->bs;
2004  }
2005  if (not1 != 0) {
2006  bitset_invert(dest->bs);
2007  }
2008 
2009  if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2010  if (not1 != 0 && not2 != 0) {
2011  r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf, env);
2012  }
2013  else {
2014  r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf, env);
2015  if (r == 0 && not1 != 0) {
2016  BBuf *tbuf = 0;
2017  r = not_code_range_buf(enc, pbuf, &tbuf, env);
2018  bbuf_free(pbuf);
2019  pbuf = tbuf;
2020  }
2021  }
2022  if (r != 0) {
2023  bbuf_free(pbuf);
2024  return r;
2025  }
2026 
2027  dest->mbuf = pbuf;
2028  bbuf_free(buf1);
2029  return r;
2030  }
2031  return 0;
2032 }
2033 
2034 static int
2035 or_cclass(CClassNode* dest, CClassNode* cc, ScanEnv* env)
2036 {
2037  OnigEncoding enc = env->enc;
2038  int r, not1, not2;
2039  BBuf *buf1, *buf2, *pbuf = 0;
2040  BitSetRef bsr1, bsr2;
2041  BitSet bs1, bs2;
2042 
2043  not1 = IS_NCCLASS_NOT(dest);
2044  bsr1 = dest->bs;
2045  buf1 = dest->mbuf;
2046  not2 = IS_NCCLASS_NOT(cc);
2047  bsr2 = cc->bs;
2048  buf2 = cc->mbuf;
2049 
2050  if (not1 != 0) {
2051  bitset_invert_to(bsr1, bs1);
2052  bsr1 = bs1;
2053  }
2054  if (not2 != 0) {
2055  bitset_invert_to(bsr2, bs2);
2056  bsr2 = bs2;
2057  }
2058  bitset_or(bsr1, bsr2);
2059  if (bsr1 != dest->bs) {
2060  bitset_copy(dest->bs, bsr1);
2061  bsr1 = dest->bs;
2062  }
2063  if (not1 != 0) {
2064  bitset_invert(dest->bs);
2065  }
2066 
2067  if (! ONIGENC_IS_SINGLEBYTE(enc)) {
2068  if (not1 != 0 && not2 != 0) {
2069  r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf, env);
2070  }
2071  else {
2072  r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf, env);
2073  if (r == 0 && not1 != 0) {
2074  BBuf *tbuf = 0;
2075  r = not_code_range_buf(enc, pbuf, &tbuf, env);
2076  bbuf_free(pbuf);
2077  pbuf = tbuf;
2078  }
2079  }
2080  if (r != 0) {
2081  bbuf_free(pbuf);
2082  return r;
2083  }
2084 
2085  dest->mbuf = pbuf;
2086  bbuf_free(buf1);
2087  return r;
2088  }
2089  else
2090  return 0;
2091 }
2092 
2093 static void UNKNOWN_ESC_WARN(ScanEnv *env, int c);
2094 
2095 static OnigCodePoint
2096 conv_backslash_value(OnigCodePoint c, ScanEnv* env)
2097 {
2098  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
2099  switch (c) {
2100  case 'n': return '\n';
2101  case 't': return '\t';
2102  case 'r': return '\r';
2103  case 'f': return '\f';
2104  case 'a': return '\007';
2105  case 'b': return '\010';
2106  case 'e': return '\033';
2107  case 'v':
2108  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB))
2109  return '\v';
2110  break;
2111 
2112  default:
2113  if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
2114  UNKNOWN_ESC_WARN(env, c);
2115  break;
2116  }
2117  }
2118  return c;
2119 }
2120 
2121 #ifdef USE_NO_INVALID_QUANTIFIER
2122 # define is_invalid_quantifier_target(node) 0
2123 #else
2124 static int
2125 is_invalid_quantifier_target(Node* node)
2126 {
2127  switch (NTYPE(node)) {
2128  case NT_ANCHOR:
2129  return 1;
2130  break;
2131 
2132  case NT_ENCLOSE:
2133  /* allow enclosed elements */
2134  /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
2135  break;
2136 
2137  case NT_LIST:
2138  do {
2139  if (! is_invalid_quantifier_target(NCAR(node))) return 0;
2140  } while (IS_NOT_NULL(node = NCDR(node)));
2141  return 0;
2142  break;
2143 
2144  case NT_ALT:
2145  do {
2146  if (is_invalid_quantifier_target(NCAR(node))) return 1;
2147  } while (IS_NOT_NULL(node = NCDR(node)));
2148  break;
2149 
2150  default:
2151  break;
2152  }
2153  return 0;
2154 }
2155 #endif
2156 
2157 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */
2158 static int
2159 popular_quantifier_num(QtfrNode* q)
2160 {
2161  if (q->greedy) {
2162  if (q->lower == 0) {
2163  if (q->upper == 1) return 0;
2164  else if (IS_REPEAT_INFINITE(q->upper)) return 1;
2165  }
2166  else if (q->lower == 1) {
2167  if (IS_REPEAT_INFINITE(q->upper)) return 2;
2168  }
2169  }
2170  else {
2171  if (q->lower == 0) {
2172  if (q->upper == 1) return 3;
2173  else if (IS_REPEAT_INFINITE(q->upper)) return 4;
2174  }
2175  else if (q->lower == 1) {
2176  if (IS_REPEAT_INFINITE(q->upper)) return 5;
2177  }
2178  }
2179  return -1;
2180 }
2181 
2182 
/* Actions used when reducing a quantifier nested in another one. */
enum ReduceType {
  RQ_ASIS = 0,  /* as is */
  RQ_DEL  = 1,  /* delete parent */
  RQ_A    = 2,  /* to '*'    */
  RQ_AQ   = 3,  /* to '*?'   */
  RQ_QQ   = 4,  /* to '??'   */
  RQ_P_QQ = 5   /* to '+)??' */
};
2191 
2192 static enum ReduceType const ReduceTypeTable[6][6] = {
2193 /* '?', '*', '+', '??', '*?', '+?' p / c */
2194  {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */
2195  {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */
2196  {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */
2197  {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */
2198  {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */
2199  {RQ_ASIS, RQ_ASIS, RQ_ASIS, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
2200 };
2201 
2202 extern void
2203 onig_reduce_nested_quantifier(Node* pnode, Node* cnode)
2204 {
2205  int pnum, cnum;
2206  QtfrNode *p, *c;
2207 
2208  p = NQTFR(pnode);
2209  c = NQTFR(cnode);
2210  pnum = popular_quantifier_num(p);
2211  cnum = popular_quantifier_num(c);
2212  if (pnum < 0 || cnum < 0) return ;
2213 
2214  switch (ReduceTypeTable[cnum][pnum]) {
2215  case RQ_DEL:
2216  *pnode = *cnode;
2217  break;
2218  case RQ_A:
2219  p->target = c->target;
2220  p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1;
2221  break;
2222  case RQ_AQ:
2223  p->target = c->target;
2224  p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0;
2225  break;
2226  case RQ_QQ:
2227  p->target = c->target;
2228  p->lower = 0; p->upper = 1; p->greedy = 0;
2229  break;
2230  case RQ_P_QQ:
2231  p->target = cnode;
2232  p->lower = 0; p->upper = 1; p->greedy = 0;
2233  c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1;
2234  return ;
2235  break;
2236  case RQ_ASIS:
2237  p->target = cnode;
2238  return ;
2239  break;
2240  }
2241 
2242  c->target = NULL_NODE;
2243  onig_node_free(cnode);
2244 }
2245 
2246 
/* Token kinds produced by the pattern tokenizer. */
enum TokenSyms {
  TK_EOT                       = 0,   /* end of token */
  TK_RAW_BYTE                  = 1,
  TK_CHAR                      = 2,
  TK_STRING                    = 3,
  TK_CODE_POINT                = 4,
  TK_ANYCHAR                   = 5,
  TK_CHAR_TYPE                 = 6,
  TK_BACKREF                   = 7,
  TK_CALL                      = 8,
  TK_ANCHOR                    = 9,
  TK_OP_REPEAT                 = 10,
  TK_INTERVAL                  = 11,
  TK_ANYCHAR_ANYTIME           = 12,  /* SQL '%' == .* */
  TK_ALT                       = 13,
  TK_SUBEXP_OPEN               = 14,
  TK_SUBEXP_CLOSE              = 15,
  TK_CC_OPEN                   = 16,
  TK_QUOTE_OPEN                = 17,
  TK_CHAR_PROPERTY             = 18,  /* \p{...}, \P{...} */
  TK_LINEBREAK                 = 19,
  TK_EXTENDED_GRAPHEME_CLUSTER = 20,
  TK_KEEP                      = 21,
  /* in cc */
  TK_CC_CLOSE                  = 22,
  TK_CC_RANGE                  = 23,
  TK_POSIX_BRACKET_OPEN        = 24,
  TK_CC_AND                    = 25,  /* && */
  TK_CC_CC_OPEN                = 26   /* [ */
};
2277 
2278 typedef struct {
2279  enum TokenSyms type;
2280  int escaped;
2281  int base; /* is number: 8, 16 (used in [....]) */
2282  UChar* backp;
2283  union {
2284  UChar* s;
2285  int c;
2286  OnigCodePoint code;
2287  struct {
2288  int subtype;
2289  int ascii_range;
2290  } anchor;
2291  struct {
2292  int lower;
2293  int upper;
2294  int greedy;
2295  int possessive;
2296  } repeat;
2297  struct {
2298  int num;
2299  int ref1;
2300  int* refs;
2301  int by_name;
2302 #ifdef USE_BACKREF_WITH_LEVEL
2303  int exist_level;
2304  int level; /* \k<name+n> */
2305 #endif
2306  } backref;
2307  struct {
2308  UChar* name;
2309  UChar* name_end;
2310  int gnum;
2311  int rel;
2312  } call;
2313  struct {
2314  int ctype;
2315  int not;
2316  } prop;
2317  } u;
2318 } OnigToken;
2319 
2320 
2321 static int
2322 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
2323 {
2324  int low, up, syn_allow, non_low = 0;
2325  int r = 0;
2326  OnigCodePoint c;
2327  OnigEncoding enc = env->enc;
2328  UChar* p = *src;
2329  PFETCH_READY;
2330 
2331  syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
2332 
2333  if (PEND) {
2334  if (syn_allow)
2335  return 1; /* "....{" : OK! */
2336  else
2337  return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */
2338  }
2339 
2340  if (! syn_allow) {
2341  c = PPEEK;
2342  if (c == ')' || c == '(' || c == '|') {
2343  return ONIGERR_END_PATTERN_AT_LEFT_BRACE;
2344  }
2345  }
2346 
2347  low = onig_scan_unsigned_number(&p, end, env->enc);
2348  if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2349  if (low > ONIG_MAX_REPEAT_NUM)
2350  return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2351 
2352  if (p == *src) { /* can't read low */
2353  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) {
2354  /* allow {,n} as {0,n} */
2355  low = 0;
2356  non_low = 1;
2357  }
2358  else
2359  goto invalid;
2360  }
2361 
2362  if (PEND) goto invalid;
2363  PFETCH(c);
2364  if (c == ',') {
2365  UChar* prev = p;
2366  up = onig_scan_unsigned_number(&p, end, env->enc);
2367  if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2368  if (up > ONIG_MAX_REPEAT_NUM)
2369  return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE;
2370 
2371  if (p == prev) {
2372  if (non_low != 0)
2373  goto invalid;
2374  up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
2375  }
2376  }
2377  else {
2378  if (non_low != 0)
2379  goto invalid;
2380 
2381  PUNFETCH;
2382  up = low; /* {n} : exact n times */
2383  r = 2; /* fixed */
2384  }
2385 
2386  if (PEND) goto invalid;
2387  PFETCH(c);
2388  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
2389  if (c != MC_ESC(env->syntax)) goto invalid;
2390  if (PEND) goto invalid;
2391  PFETCH(c);
2392  }
2393  if (c != '}') goto invalid;
2394 
2395  if (!IS_REPEAT_INFINITE(up) && low > up) {
2396  return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE;
2397  }
2398 
2399  tok->type = TK_INTERVAL;
2400  tok->u.repeat.lower = low;
2401  tok->u.repeat.upper = up;
2402  *src = p;
2403  return r; /* 0: normal {n,m}, 2: fixed {n} */
2404 
2405  invalid:
2406  if (syn_allow)
2407  return 1; /* OK */
2408  else
2409  return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
2410 }
2411 
2412 /* \M-, \C-, \c, or \... */
2413 static int
2414 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
2415 {
2416  int v;
2417  OnigCodePoint c;
2418  OnigEncoding enc = env->enc;
2419  UChar* p = *src;
2420 
2421  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
2422 
2423  PFETCH_S(c);
2424  switch (c) {
2425  case 'M':
2426  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) {
2427  if (PEND) return ONIGERR_END_PATTERN_AT_META;
2428  PFETCH_S(c);
2429  if (c != '-') return ONIGERR_META_CODE_SYNTAX;
2430  if (PEND) return ONIGERR_END_PATTERN_AT_META;
2431  PFETCH_S(c);
2432  if (c == MC_ESC(env->syntax)) {
2433  v = fetch_escaped_value(&p, end, env, &c);
2434  if (v < 0) return v;
2435  }
2436  c = ((c & 0xff) | 0x80);
2437  }
2438  else
2439  goto backslash;
2440  break;
2441 
2442  case 'C':
2443  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) {
2444  if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2445  PFETCH_S(c);
2446  if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX;
2447  goto control;
2448  }
2449  else
2450  goto backslash;
2451 
2452  case 'c':
2453  if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) {
2454  control:
2455  if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
2456  PFETCH_S(c);
2457  if (c == '?') {
2458  c = 0177;
2459  }
2460  else {
2461  if (c == MC_ESC(env->syntax)) {
2462  v = fetch_escaped_value(&p, end, env, &c);
2463  if (v < 0) return v;
2464  }
2465  c &= 0x9f;
2466  }
2467  break;
2468  }
2469  /* fall through */
2470 
2471  default:
2472  {
2473  backslash:
2474  c = conv_backslash_value(c, env);
2475  }
2476  break;
2477  }
2478 
2479  *src = p;
2480  *val = c;
2481  return 0;
2482 }
2483 
2484 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
2485 
2486 static OnigCodePoint
2487 get_name_end_code_point(OnigCodePoint start)
2488 {
2489  switch (start) {
2490  case '<': return (OnigCodePoint )'>'; break;
2491  case '\'': return (OnigCodePoint )'\''; break;
2492  case '(': return (OnigCodePoint )')'; break;
2493  case '{': return (OnigCodePoint )'}'; break;
2494  default:
2495  break;
2496  }
2497 
2498  return (OnigCodePoint )0;
2499 }
2500 
2501 #ifdef USE_NAMED_GROUP
2502 # ifdef RUBY
2503 # define ONIGENC_IS_CODE_NAME(enc, c) TRUE
2504 # else
2505 # define ONIGENC_IS_CODE_NAME(enc, c) ONIGENC_IS_CODE_WORD(enc, c)
2506 # endif
2507 
# ifdef USE_BACKREF_WITH_LEVEL
/*
   \k<name+n>, \k<name-n>
   \k<num+n>,  \k<num-n>
   \k<-num+n>, \k<-num-n>

   Parse a group name or number with an optional nest level; *src
   points just after the opening delimiter `start_code`.

   On success *rname_end is set to the end of the name part, *rback_num
   to the signed group number (0 when the reference is by name), *src is
   advanced past the closing delimiter, and the result is 1 when a level
   was given (stored in *rlevel) or 0 when not.

   On failure a negative ONIGERR_* code is returned.  Except for the
   early "empty name" and "number too big" returns, the offending text
   is recorded with onig_scan_env_set_error_string().
*/
static int
fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
                      UChar** rname_end, ScanEnv* env,
                      int* rback_num, int* rlevel)
{
  int r = 0;
  int sign = 1;
  int is_num = 0;        /* 0: name, 1: number, 2: only '-' seen so far */
  int exist_level = 0;
  OnigCodePoint end_code;
  OnigCodePoint c = 0;
  OnigEncoding enc = env->enc;
  UChar *name_end = end;
  UChar *pnum_head = *src;
  UChar *p = *src;
  PFETCH_READY;

  *rback_num = 0;
  end_code = get_name_end_code_point(start_code);

  /* --- first character decides between name and number --- */
  if (PEND)
    return ONIGERR_EMPTY_GROUP_NAME;

  PFETCH(c);
  if (c == end_code)
    return ONIGERR_EMPTY_GROUP_NAME;

  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
    is_num = 1;
  }
  else if (c == '-') {
    is_num = 2;
    sign = -1;
    pnum_head = p;
  }
  else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
    r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
  }

  /* --- rest of the name / number --- */
  while (!PEND) {
    name_end = p;
    PFETCH(c);
    if (c == end_code || c == ')' || c == '+' || c == '-') {
      if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME;
      break;
    }

    if (is_num == 0) {
      if (!ONIGENC_IS_CODE_NAME(enc, c))
        r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
    }
    else if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
      is_num = 1;
    }
    else {
      r = ONIGERR_INVALID_GROUP_NAME;
      is_num = 0;
    }
  }

  /* --- optional level:  +n / -n  followed by the end delimiter --- */
  if (r != 0 || c == end_code)
    goto finish;

  if (c == '+' || c == '-') {
    int level;
    int flag = (c == '-' ? -1 : 1);

    if (PEND) {
      r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
      goto finish;
    }
    PFETCH(c);
    if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto bad_name;
    PUNFETCH;
    level = onig_scan_unsigned_number(&p, end, enc);
    if (level < 0) return ONIGERR_TOO_BIG_NUMBER;
    *rlevel = (level * flag);
    exist_level = 1;

    if (!PEND) {
      PFETCH(c);
      if (c == end_code)
        goto finish;
    }
  }

 bad_name:
  r = ONIGERR_INVALID_GROUP_NAME;
  name_end = end;

 finish:
  if (r != 0) {
    onig_scan_env_set_error_string(env, r, *src, name_end);
    return r;
  }

  if (is_num != 0) {
    *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
    if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
    else if (*rback_num == 0) goto bad_name;

    *rback_num *= sign;
  }

  *rname_end = name_end;
  *src = p;
  return (exist_level ? 1 : 0);
}
# endif /* USE_BACKREF_WITH_LEVEL */
2629 
2630 /*
2631  ref: 0 -> define name (don't allow number name)
2632  1 -> reference name (allow number name)
2633 */
2634 static int
2635 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2636  UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2637 {
2638  int r, is_num, sign;
2639  OnigCodePoint end_code;
2640  OnigCodePoint c = 0;
2641  OnigEncoding enc = env->enc;
2642  UChar *name_end;
2643  UChar *pnum_head;
2644  UChar *p = *src;
2645 
2646  *rback_num = 0;
2647 
2648  end_code = get_name_end_code_point(start_code);
2649 
2650  name_end = end;
2651  pnum_head = *src;
2652  r = 0;
2653  is_num = 0;
2654  sign = 1;
2655  if (PEND) {
2656  return ONIGERR_EMPTY_GROUP_NAME;
2657  }
2658  else {
2659  PFETCH_S(c);
2660  if (c == end_code)
2661  return ONIGERR_EMPTY_GROUP_NAME;
2662 
2663  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2664  if (ref == 1)
2665  is_num = 1;
2666  else {
2667  r = ONIGERR_INVALID_GROUP_NAME;
2668  is_num = 0;
2669  }
2670  }
2671  else if (c == '-') {
2672  if (ref == 1) {
2673  is_num = 2;
2674  sign = -1;
2675  pnum_head = p;
2676  }
2677  else {
2678  r = ONIGERR_INVALID_GROUP_NAME;
2679  is_num = 0;
2680  }
2681  }
2682  else if (!ONIGENC_IS_CODE_NAME(enc, c)) {
2683  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2684  }
2685  }
2686 
2687  if (r == 0) {
2688  while (!PEND) {
2689  name_end = p;
2690  PFETCH_S(c);
2691  if (c == end_code || c == ')') {
2692  if (is_num == 2) {
2693  r = ONIGERR_INVALID_GROUP_NAME;
2694  goto teardown;
2695  }
2696  break;
2697  }
2698 
2699  if (is_num != 0) {
2700  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2701  is_num = 1;
2702  }
2703  else {
2704  if (!ONIGENC_IS_CODE_WORD(enc, c))
2705  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2706  else
2707  r = ONIGERR_INVALID_GROUP_NAME;
2708  goto teardown;
2709  }
2710  }
2711  else {
2712  if (!ONIGENC_IS_CODE_NAME(enc, c)) {
2713  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2714  goto teardown;
2715  }
2716  }
2717  }
2718 
2719  if (c != end_code) {
2720  r = ONIGERR_INVALID_GROUP_NAME;
2721  name_end = end;
2722  goto err;
2723  }
2724 
2725  if (is_num != 0) {
2726  *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2727  if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2728  else if (*rback_num == 0) {
2729  r = ONIGERR_INVALID_GROUP_NAME;
2730  goto err;
2731  }
2732 
2733  *rback_num *= sign;
2734  }
2735 
2736  *rname_end = name_end;
2737  *src = p;
2738  return 0;
2739  }
2740  else {
2741 teardown:
2742  while (!PEND) {
2743  name_end = p;
2744  PFETCH_S(c);
2745  if (c == end_code || c == ')')
2746  break;
2747  }
2748  if (PEND)
2749  name_end = end;
2750 
2751  err:
2752  onig_scan_env_set_error_string(env, r, *src, name_end);
2753  return r;
2754  }
2755 }
2756 #else
2757 static int
2758 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,
2759  UChar** rname_end, ScanEnv* env, int* rback_num, int ref)
2760 {
2761  int r, is_num, sign;
2762  OnigCodePoint end_code;
2763  OnigCodePoint c = 0;
2764  UChar *name_end;
2765  OnigEncoding enc = env->enc;
2766  UChar *pnum_head;
2767  UChar *p = *src;
2768  PFETCH_READY;
2769 
2770  *rback_num = 0;
2771 
2772  end_code = get_name_end_code_point(start_code);
2773 
2774  *rname_end = name_end = end;
2775  r = 0;
2776  pnum_head = *src;
2777  is_num = 0;
2778  sign = 1;
2779 
2780  if (PEND) {
2781  return ONIGERR_EMPTY_GROUP_NAME;
2782  }
2783  else {
2784  PFETCH(c);
2785  if (c == end_code)
2786  return ONIGERR_EMPTY_GROUP_NAME;
2787 
2788  if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
2789  is_num = 1;
2790  }
2791  else if (c == '-') {
2792  is_num = 2;
2793  sign = -1;
2794  pnum_head = p;
2795  }
2796  else {
2797  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2798  }
2799  }
2800 
2801  while (!PEND) {
2802  name_end = p;
2803 
2804  PFETCH(c);
2805  if (c == end_code || c == ')') break;
2806  if (! ONIGENC_IS_CODE_DIGIT(enc, c))
2807  r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
2808  }
2809  if (r == 0 && c != end_code) {
2810  r = ONIGERR_INVALID_GROUP_NAME;
2811  name_end = end;
2812  }
2813 
2814  if (r == 0) {
2815  *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc);
2816  if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER;
2817  else if (*rback_num == 0) {
2818  r = ONIGERR_INVALID_GROUP_NAME;
2819  goto err;
2820  }
2821  *rback_num *= sign;
2822 
2823  *rname_end = name_end;
2824  *src = p;
2825  return 0;
2826  }
2827  else {
2828  err:
2829  onig_scan_env_set_error_string(env, r, *src, name_end);
2830  return r;
2831  }
2832 }
2833 #endif /* USE_NAMED_GROUP */
2834 
2835 
2836 #ifdef PRINTF_ARGS
2837 PRINTF_ARGS(static void onig_syntax_warn(ScanEnv *env, const char *fmt, ...), 2, 3);
2838 #endif
2839 
2840 static void
2841 onig_syntax_warn(ScanEnv *env, const char *fmt, ...)
2842 {
2843  va_list args;
2844  UChar buf[WARN_BUFSIZE];
2845  va_start(args, fmt);
2846  onig_vsnprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
2847  env->pattern, env->pattern_end,
2848  fmt, args);
2849  va_end(args);
2850 #ifdef RUBY
2851  if (env->sourcefile == NULL)
2852  rb_warn("%s", (char *)buf);
2853  else
2854  rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf);
2855 #else
2856  (*onig_warn)((char* )buf);
2857 #endif
2858 }
2859 
2860 static void
2861 CC_ESC_WARN(ScanEnv *env, UChar *c)
2862 {
2863  if (onig_warn == onig_null_warn) return ;
2864 
2865  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) &&
2866  IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) {
2867  onig_syntax_warn(env, "character class has '%s' without escape", c);
2868  }
2869 }
2870 
2871 static void
2872 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c)
2873 {
2874  if (onig_warn == onig_null_warn) return ;
2875 
2876  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) {
2877  onig_syntax_warn(env, "regular expression has '%s' without escape", c);
2878  }
2879 }
2880 
2881 #ifndef RTEST
2882 # define RTEST(v) 1
2883 #endif
2884 
2885 static void
2886 CC_DUP_WARN(ScanEnv *env, OnigCodePoint from ARG_UNUSED, OnigCodePoint to ARG_UNUSED)
2887 {
2888  if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
2889 
2890  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_DUP) &&
2891  !(env->warnings_flag & ONIG_SYN_WARN_CC_DUP)) {
2892 #ifdef WARN_ALL_CC_DUP
2893  onig_syntax_warn(env, "character class has duplicated range: %04x-%04x", from, to);
2894 #else
2895  env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
2896  onig_syntax_warn(env, "character class has duplicated range");
2897 #endif
2898  }
2899 }
2900 
2901 static void
2902 UNKNOWN_ESC_WARN(ScanEnv *env, int c)
2903 {
2904  if (onig_warn == onig_null_warn || !RTEST(ruby_verbose)) return ;
2905  onig_syntax_warn(env, "Unknown escape \\%c is ignored", c);
2906 }
2907 
2908 static UChar*
2909 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
2910  UChar **next, OnigEncoding enc)
2911 {
2912  int i;
2913  OnigCodePoint x;
2914  UChar *q;
2915  UChar *p = from;
2916 
2917  while (p < to) {
2918  x = ONIGENC_MBC_TO_CODE(enc, p, to);
2919  q = p + enclen(enc, p, to);
2920  if (x == s[0]) {
2921  for (i = 1; i < n && q < to; i++) {
2922  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2923  if (x != s[i]) break;
2924  q += enclen(enc, q, to);
2925  }
2926  if (i >= n) {
2927  if (IS_NOT_NULL(next))
2928  *next = q;
2929  return p;
2930  }
2931  }
2932  p = q;
2933  }
2934  return NULL_UCHARP;
2935 }
2936 
2937 static int
2938 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
2939  OnigCodePoint bad, OnigEncoding enc, const OnigSyntaxType* syn)
2940 {
2941  int i, in_esc;
2942  OnigCodePoint x;
2943  UChar *q;
2944  UChar *p = from;
2945 
2946  in_esc = 0;
2947  while (p < to) {
2948  if (in_esc) {
2949  in_esc = 0;
2950  p += enclen(enc, p, to);
2951  }
2952  else {
2953  x = ONIGENC_MBC_TO_CODE(enc, p, to);
2954  q = p + enclen(enc, p, to);
2955  if (x == s[0]) {
2956  for (i = 1; i < n && q < to; i++) {
2957  x = ONIGENC_MBC_TO_CODE(enc, q, to);
2958  if (x != s[i]) break;
2959  q += enclen(enc, q, to);
2960  }
2961  if (i >= n) return 1;
2962  p += enclen(enc, p, to);
2963  }
2964  else {
2965  x = ONIGENC_MBC_TO_CODE(enc, p, to);
2966  if (x == bad) return 0;
2967  else if (x == MC_ESC(syn)) in_esc = 1;
2968  p = q;
2969  }
2970  }
2971  }
2972  return 0;
2973 }
2974 
2975 static int
2976 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
2977 {
2978  int num;
2979  OnigCodePoint c, c2;
2980  const OnigSyntaxType* syn = env->syntax;
2981  OnigEncoding enc = env->enc;
2982  UChar* prev;
2983  UChar* p = *src;
2984  PFETCH_READY;
2985 
2986  if (PEND) {
2987  tok->type = TK_EOT;
2988  return tok->type;
2989  }
2990 
2991  PFETCH(c);
2992  tok->type = TK_CHAR;
2993  tok->base = 0;
2994  tok->u.c = c;
2995  tok->escaped = 0;
2996 
2997  if (c == ']') {
2998  tok->type = TK_CC_CLOSE;
2999  }
3000  else if (c == '-') {
3001  tok->type = TK_CC_RANGE;
3002  }
3003  else if (c == MC_ESC(syn)) {
3004  if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
3005  goto end;
3006 
3007  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3008 
3009  PFETCH(c);
3010  tok->escaped = 1;
3011  tok->u.c = c;
3012  switch (c) {
3013  case 'w':
3014  tok->type = TK_CHAR_TYPE;
3015  tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3016  tok->u.prop.not = 0;
3017  break;
3018  case 'W':
3019  tok->type = TK_CHAR_TYPE;
3020  tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3021  tok->u.prop.not = 1;
3022  break;
3023  case 'd':
3024  tok->type = TK_CHAR_TYPE;
3025  tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3026  tok->u.prop.not = 0;
3027  break;
3028  case 'D':
3029  tok->type = TK_CHAR_TYPE;
3030  tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3031  tok->u.prop.not = 1;
3032  break;
3033  case 's':
3034  tok->type = TK_CHAR_TYPE;
3035  tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3036  tok->u.prop.not = 0;
3037  break;
3038  case 'S':
3039  tok->type = TK_CHAR_TYPE;
3040  tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3041  tok->u.prop.not = 1;
3042  break;
3043  case 'h':
3044  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3045  tok->type = TK_CHAR_TYPE;
3046  tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3047  tok->u.prop.not = 0;
3048  break;
3049  case 'H':
3050  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3051  tok->type = TK_CHAR_TYPE;
3052  tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3053  tok->u.prop.not = 1;
3054  break;
3055 
3056  case 'p':
3057  case 'P':
3058  if (PEND) break;
3059 
3060  c2 = PPEEK;
3061  if (c2 == '{' &&
3062  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3063  PINC;
3064  tok->type = TK_CHAR_PROPERTY;
3065  tok->u.prop.not = (c == 'P' ? 1 : 0);
3066 
3067  if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3068  PFETCH(c2);
3069  if (c2 == '^') {
3070  tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3071  }
3072  else
3073  PUNFETCH;
3074  }
3075  }
3076  else {
3077  onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
3078  }
3079  break;
3080 
3081  case 'x':
3082  if (PEND) break;
3083 
3084  prev = p;
3085  if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3086  PINC;
3087  num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3088  if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3089  if (!PEND) {
3090  c2 = PPEEK;
3091  if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
3092  return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3093  }
3094 
3095  if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3096  PINC;
3097  tok->type = TK_CODE_POINT;
3098  tok->base = 16;
3099  tok->u.code = (OnigCodePoint )num;
3100  }
3101  else {
3102  /* can't read nothing or invalid format */
3103  p = prev;
3104  }
3105  }
3106  else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3107  num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3108  if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3109  if (p == prev) { /* can't read nothing. */
3110  num = 0; /* but, it's not error */
3111  }
3112  tok->type = TK_RAW_BYTE;
3113  tok->base = 16;
3114  tok->u.c = num;
3115  }
3116  break;
3117 
3118  case 'u':
3119  if (PEND) break;
3120 
3121  prev = p;
3122  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3123  num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3124  if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3125  else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3126  if (p == prev) { /* can't read nothing. */
3127  num = 0; /* but, it's not error */
3128  }
3129  tok->type = TK_CODE_POINT;
3130  tok->base = 16;
3131  tok->u.code = (OnigCodePoint )num;
3132  }
3133  break;
3134 
3135  case 'o':
3136  if (PEND) break;
3137 
3138  prev = p;
3139  if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3140  PINC;
3141  num = scan_unsigned_octal_number(&p, end, 11, enc);
3142  if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3143  if (!PEND) {
3144  c2 = PPEEK;
3145  if (ONIGENC_IS_CODE_DIGIT(enc, c2) && c2 < '8')
3146  return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3147  }
3148 
3149  if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) {
3150  PINC;
3151  tok->type = TK_CODE_POINT;
3152  tok->base = 8;
3153  tok->u.code = (OnigCodePoint )num;
3154  }
3155  else {
3156  /* can't read nothing or invalid format */
3157  p = prev;
3158  }
3159  }
3160  break;
3161 
3162  case '0':
3163  case '1': case '2': case '3': case '4': case '5': case '6': case '7':
3164  if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3165  PUNFETCH;
3166  prev = p;
3167  num = scan_unsigned_octal_number(&p, end, 3, enc);
3168  if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3169  if (p == prev) { /* can't read nothing. */
3170  num = 0; /* but, it's not error */
3171  }
3172  tok->type = TK_RAW_BYTE;
3173  tok->base = 8;
3174  tok->u.c = num;
3175  }
3176  break;
3177 
3178  default:
3179  PUNFETCH;
3180  num = fetch_escaped_value(&p, end, env, &c2);
3181  if (num < 0) return num;
3182  if ((OnigCodePoint )tok->u.c != c2) {
3183  tok->u.code = (OnigCodePoint )c2;
3184  tok->type = TK_CODE_POINT;
3185  }
3186  break;
3187  }
3188  }
3189  else if (c == '[') {
3190  if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
3191  OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
3192  tok->backp = p; /* point at '[' is read */
3193  PINC;
3194  if (str_exist_check_with_esc(send, 2, p, end,
3195  (OnigCodePoint )']', enc, syn)) {
3196  tok->type = TK_POSIX_BRACKET_OPEN;
3197  }
3198  else {
3199  PUNFETCH;
3200  goto cc_in_cc;
3201  }
3202  }
3203  else {
3204  cc_in_cc:
3205  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) {
3206  tok->type = TK_CC_CC_OPEN;
3207  }
3208  else {
3209  CC_ESC_WARN(env, (UChar* )"[");
3210  }
3211  }
3212  }
3213  else if (c == '&') {
3214  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
3215  !PEND && (PPEEK_IS('&'))) {
3216  PINC;
3217  tok->type = TK_CC_AND;
3218  }
3219  }
3220 
3221  end:
3222  *src = p;
3223  return tok->type;
3224 }
3225 
3226 #ifdef USE_NAMED_GROUP
3227 static int
3228 fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src,
3229  UChar* end, ScanEnv* env)
3230 {
3231  int r, num;
3232  const OnigSyntaxType* syn = env->syntax;
3233  UChar* prev;
3234  UChar* p = *src;
3235  UChar* name_end;
3236  int* backs;
3237  int back_num;
3238 
3239  prev = p;
3240 
3241 # ifdef USE_BACKREF_WITH_LEVEL
3242  name_end = NULL_UCHARP; /* no need. escape gcc warning. */
3243  r = fetch_name_with_level(c, &p, end, &name_end,
3244  env, &back_num, &tok->u.backref.level);
3245  if (r == 1) tok->u.backref.exist_level = 1;
3246  else tok->u.backref.exist_level = 0;
3247 # else
3248  r = fetch_name(&p, end, &name_end, env, &back_num, 1);
3249 # endif
3250  if (r < 0) return r;
3251 
3252  if (back_num != 0) {
3253  if (back_num < 0) {
3254  back_num = BACKREF_REL_TO_ABS(back_num, env);
3255  if (back_num <= 0)
3256  return ONIGERR_INVALID_BACKREF;
3257  }
3258 
3259  if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3260  if (back_num > env->num_mem ||
3261  IS_NULL(SCANENV_MEM_NODES(env)[back_num]))
3262  return ONIGERR_INVALID_BACKREF;
3263  }
3264  tok->type = TK_BACKREF;
3265  tok->u.backref.by_name = 0;
3266  tok->u.backref.num = 1;
3267  tok->u.backref.ref1 = back_num;
3268  }
3269  else {
3270  num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs);
3271  if (num <= 0) {
3272  onig_scan_env_set_error_string(env,
3273  ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);
3274  return ONIGERR_UNDEFINED_NAME_REFERENCE;
3275  }
3276  if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3277  int i;
3278  for (i = 0; i < num; i++) {
3279  if (backs[i] > env->num_mem ||
3280  IS_NULL(SCANENV_MEM_NODES(env)[backs[i]]))
3281  return ONIGERR_INVALID_BACKREF;
3282  }
3283  }
3284 
3285  tok->type = TK_BACKREF;
3286  tok->u.backref.by_name = 1;
3287  if (num == 1 || IS_SYNTAX_BV(syn, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) {
3288  tok->u.backref.num = 1;
3289  tok->u.backref.ref1 = backs[0];
3290  }
3291  else {
3292  tok->u.backref.num = num;
3293  tok->u.backref.refs = backs;
3294  }
3295  }
3296  *src = p;
3297  return 0;
3298 }
3299 #endif
3300 
3301 static int
3302 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
3303 {
3304  int r, num;
3305  OnigCodePoint c;
3306  OnigEncoding enc = env->enc;
3307  const OnigSyntaxType* syn = env->syntax;
3308  UChar* prev;
3309  UChar* p = *src;
3310  PFETCH_READY;
3311 
3312  start:
3313  if (PEND) {
3314  tok->type = TK_EOT;
3315  return tok->type;
3316  }
3317 
3318  tok->type = TK_STRING;
3319  tok->base = 0;
3320  tok->backp = p;
3321 
3322  PFETCH(c);
3323  if (IS_MC_ESC_CODE(c, syn)) {
3324  if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE;
3325 
3326  tok->backp = p;
3327  PFETCH(c);
3328 
3329  tok->u.c = c;
3330  tok->escaped = 1;
3331  switch (c) {
3332  case '*':
3333  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;
3334  tok->type = TK_OP_REPEAT;
3335  tok->u.repeat.lower = 0;
3336  tok->u.repeat.upper = REPEAT_INFINITE;
3337  goto greedy_check;
3338  break;
3339 
3340  case '+':
3341  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;
3342  tok->type = TK_OP_REPEAT;
3343  tok->u.repeat.lower = 1;
3344  tok->u.repeat.upper = REPEAT_INFINITE;
3345  goto greedy_check;
3346  break;
3347 
3348  case '?':
3349  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break;
3350  tok->type = TK_OP_REPEAT;
3351  tok->u.repeat.lower = 0;
3352  tok->u.repeat.upper = 1;
3353  greedy_check:
3354  if (!PEND && PPEEK_IS('?') &&
3355  IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
3356  PFETCH(c);
3357  tok->u.repeat.greedy = 0;
3358  tok->u.repeat.possessive = 0;
3359  }
3360  else {
3361  possessive_check:
3362  if (!PEND && PPEEK_IS('+') &&
3363  ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
3364  tok->type != TK_INTERVAL) ||
3365  (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
3366  tok->type == TK_INTERVAL))) {
3367  PFETCH(c);
3368  tok->u.repeat.greedy = 1;
3369  tok->u.repeat.possessive = 1;
3370  }
3371  else {
3372  tok->u.repeat.greedy = 1;
3373  tok->u.repeat.possessive = 0;
3374  }
3375  }
3376  break;
3377 
3378  case '{':
3379  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
3380  r = fetch_range_quantifier(&p, end, tok, env);
3381  if (r < 0) return r; /* error */
3382  if (r == 0) goto greedy_check;
3383  else if (r == 2) { /* {n} */
3384  if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3385  goto possessive_check;
3386 
3387  goto greedy_check;
3388  }
3389  /* r == 1 : normal char */
3390  break;
3391 
3392  case '|':
3393  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break;
3394  tok->type = TK_ALT;
3395  break;
3396 
3397  case '(':
3398  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3399  tok->type = TK_SUBEXP_OPEN;
3400  break;
3401 
3402  case ')':
3403  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break;
3404  tok->type = TK_SUBEXP_CLOSE;
3405  break;
3406 
3407  case 'w':
3408  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3409  tok->type = TK_CHAR_TYPE;
3410  tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3411  tok->u.prop.not = 0;
3412  break;
3413 
3414  case 'W':
3415  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
3416  tok->type = TK_CHAR_TYPE;
3417  tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
3418  tok->u.prop.not = 1;
3419  break;
3420 
3421  case 'b':
3422  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3423  tok->type = TK_ANCHOR;
3424  tok->u.anchor.subtype = ANCHOR_WORD_BOUND;
3425  tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3426  && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3427  break;
3428 
3429  case 'B':
3430  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break;
3431  tok->type = TK_ANCHOR;
3432  tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND;
3433  tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option)
3434  && ! IS_WORD_BOUND_ALL_RANGE(env->option);
3435  break;
3436 
3437 #ifdef USE_WORD_BEGIN_END
3438  case '<':
3439  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3440  tok->type = TK_ANCHOR;
3441  tok->u.anchor.subtype = ANCHOR_WORD_BEGIN;
3442  tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3443  break;
3444 
3445  case '>':
3446  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break;
3447  tok->type = TK_ANCHOR;
3448  tok->u.anchor.subtype = ANCHOR_WORD_END;
3449  tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option);
3450  break;
3451 #endif
3452 
3453  case 's':
3454  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3455  tok->type = TK_CHAR_TYPE;
3456  tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3457  tok->u.prop.not = 0;
3458  break;
3459 
3460  case 'S':
3461  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
3462  tok->type = TK_CHAR_TYPE;
3463  tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
3464  tok->u.prop.not = 1;
3465  break;
3466 
3467  case 'd':
3468  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3469  tok->type = TK_CHAR_TYPE;
3470  tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3471  tok->u.prop.not = 0;
3472  break;
3473 
3474  case 'D':
3475  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
3476  tok->type = TK_CHAR_TYPE;
3477  tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
3478  tok->u.prop.not = 1;
3479  break;
3480 
3481  case 'h':
3482  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3483  tok->type = TK_CHAR_TYPE;
3484  tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3485  tok->u.prop.not = 0;
3486  break;
3487 
3488  case 'H':
3489  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
3490  tok->type = TK_CHAR_TYPE;
3491  tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT;
3492  tok->u.prop.not = 1;
3493  break;
3494 
3495  case 'A':
3496  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3497  begin_buf:
3498  tok->type = TK_ANCHOR;
3499  tok->u.anchor.subtype = ANCHOR_BEGIN_BUF;
3500  break;
3501 
3502  case 'Z':
3503  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3504  tok->type = TK_ANCHOR;
3505  tok->u.anchor.subtype = ANCHOR_SEMI_END_BUF;
3506  break;
3507 
3508  case 'z':
3509  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
3510  end_buf:
3511  tok->type = TK_ANCHOR;
3512  tok->u.anchor.subtype = ANCHOR_END_BUF;
3513  break;
3514 
3515  case 'G':
3516  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break;
3517  tok->type = TK_ANCHOR;
3518  tok->u.anchor.subtype = ANCHOR_BEGIN_POSITION;
3519  break;
3520 
3521  case '`':
3522  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3523  goto begin_buf;
3524  break;
3525 
3526  case '\'':
3527  if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break;
3528  goto end_buf;
3529  break;
3530 
3531  case 'x':
3532  if (PEND) break;
3533 
3534  prev = p;
3535  if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
3536  PINC;
3537  num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc);
3538  if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3539  if (!PEND) {
3540  if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
3541  return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3542  }
3543 
3544  if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
3545  PINC;
3546  tok->type = TK_CODE_POINT;
3547  tok->u.code = (OnigCodePoint )num;
3548  }
3549  else {
3550  /* can't read nothing or invalid format */
3551  p = prev;
3552  }
3553  }
3554  else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
3555  num = scan_unsigned_hexadecimal_number(&p, end, 0, 2, enc);
3556  if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3557  if (p == prev) { /* can't read nothing. */
3558  num = 0; /* but, it's not error */
3559  }
3560  tok->type = TK_RAW_BYTE;
3561  tok->base = 16;
3562  tok->u.c = num;
3563  }
3564  break;
3565 
3566  case 'u':
3567  if (PEND) break;
3568 
3569  prev = p;
3570  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
3571  num = scan_unsigned_hexadecimal_number(&p, end, 4, 4, enc);
3572  if (num < -1) return ONIGERR_TOO_SHORT_DIGITS;
3573  else if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
3574  if (p == prev) { /* can't read nothing. */
3575  num = 0; /* but, it's not error */
3576  }
3577  tok->type = TK_CODE_POINT;
3578  tok->base = 16;
3579  tok->u.code = (OnigCodePoint )num;
3580  }
3581  break;
3582 
3583  case 'o':
3584  if (PEND) break;
3585 
3586  prev = p;
3587  if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) {
3588  PINC;
3589  num = scan_unsigned_octal_number(&p, end, 11, enc);
3590  if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
3591  if (!PEND) {
3592  OnigCodePoint c = PPEEK;
3593  if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8')
3594  return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
3595  }
3596 
3597  if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) {
3598  PINC;
3599  tok->type = TK_CODE_POINT;
3600  tok->u.code = (OnigCodePoint )num;
3601  }
3602  else {
3603  /* can't read nothing or invalid format */
3604  p = prev;
3605  }
3606  }
3607  break;
3608 
3609  case '1': case '2': case '3': case '4':
3610  case '5': case '6': case '7': case '8': case '9':
3611  PUNFETCH;
3612  prev = p;
3613  num = onig_scan_unsigned_number(&p, end, enc);
3614  if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
3615  goto skip_backref;
3616  }
3617 
3618  if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
3619  (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
3620  if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {
3621  if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num]))
3622  return ONIGERR_INVALID_BACKREF;
3623  }
3624 
3625  tok->type = TK_BACKREF;
3626  tok->u.backref.num = 1;
3627  tok->u.backref.ref1 = num;
3628  tok->u.backref.by_name = 0;
3629 #ifdef USE_BACKREF_WITH_LEVEL
3630  tok->u.backref.exist_level = 0;
3631 #endif
3632  break;
3633  }
3634 
3635  skip_backref:
3636  if (c == '8' || c == '9') {
3637  /* normal char */
3638  p = prev; PINC;
3639  break;
3640  }
3641 
3642  p = prev;
3643  /* fall through */
3644  case '0':
3645  if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
3646  prev = p;
3647  num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
3648  if (num < 0 || 0xff < num) return ONIGERR_TOO_BIG_NUMBER;
3649  if (p == prev) { /* can't read nothing. */
3650  num = 0; /* but, it's not error */
3651  }
3652  tok->type = TK_RAW_BYTE;
3653  tok->base = 8;
3654  tok->u.c = num;
3655  }
3656  else if (c != '0') {
3657  PINC;
3658  }
3659  break;
3660 
3661 #ifdef USE_NAMED_GROUP
3662  case 'k':
3663  if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
3664  PFETCH(c);
3665  if (c == '<' || c == '\'') {
3666  r = fetch_named_backref_token(c, tok, &p, end, env);
3667  if (r < 0) return r;
3668  }
3669  else {
3670  PUNFETCH;
3671  onig_syntax_warn(env, "invalid back reference");
3672  }
3673  }
3674  break;
3675 #endif
3676 
3677 #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP)
3678  case 'g':
3679 # ifdef USE_NAMED_GROUP
3680  if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) {
3681  PFETCH(c);
3682  if (c == '{') {
3683  r = fetch_named_backref_token(c, tok, &p, end, env);
3684  if (r < 0) return r;
3685  }
3686  else
3687  PUNFETCH;
3688  }
3689 # endif
3690 # ifdef USE_SUBEXP_CALL
3691  if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
3692  PFETCH(c);
3693  if (c == '<' || c == '\'') {
3694  int gnum = -1, rel = 0;
3695  UChar* name_end;
3696  OnigCodePoint cnext;
3697 
3698  cnext = PPEEK;
3699  if (cnext == '0') {
3700  PINC;
3701  if (PPEEK_IS(get_name_end_code_point(c))) { /* \g<0>, \g'0' */
3702  PINC;
3703  name_end = p;
3704  gnum = 0;
3705  }
3706  }
3707  else if (cnext == '+') {
3708  PINC;
3709  rel = 1;
3710  }
3711  prev = p;
3712  if (gnum < 0) {
3713  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1);
3714  if (r < 0) return r;
3715  }
3716 
3717  tok->type = TK_CALL;
3718  tok->u.call.name = prev;
3719  tok->u.call.name_end = name_end;
3720  tok->u.call.gnum = gnum;
3721  tok->u.call.rel = rel;
3722  }
3723  else {
3724  onig_syntax_warn(env, "invalid subexp call");
3725  PUNFETCH;
3726  }
3727  }
3728 # endif
3729  break;
3730 #endif
3731 
3732  case 'Q':
3733  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) {
3734  tok->type = TK_QUOTE_OPEN;
3735  }
3736  break;
3737 
3738  case 'p':
3739  case 'P':
3740  if (PPEEK_IS('{') &&
3741  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
3742  PINC;
3743  tok->type = TK_CHAR_PROPERTY;
3744  tok->u.prop.not = (c == 'P' ? 1 : 0);
3745 
3746  if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
3747  PFETCH(c);
3748  if (c == '^') {
3749  tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
3750  }
3751  else
3752  PUNFETCH;
3753  }
3754  }
3755  else {
3756  onig_syntax_warn(env, "invalid Unicode Property \\%c", c);
3757  }
3758  break;
3759 
3760  case 'R':
3761  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK)) {
3762  tok->type = TK_LINEBREAK;
3763  }
3764  break;
3765 
3766  case 'X':
3767  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER)) {
3768  tok->type = TK_EXTENDED_GRAPHEME_CLUSTER;
3769  }
3770  break;
3771 
3772  case 'K':
3773  if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP)) {
3774  tok->type = TK_KEEP;
3775  }
3776  break;
3777 
3778  default:
3779  {
3780  OnigCodePoint c2;
3781 
3782  PUNFETCH;
3783  num = fetch_escaped_value(&p, end, env, &c2);
3784  if (num < 0) return num;
3785  /* set_raw: */
3786  if ((OnigCodePoint )tok->u.c != c2) {
3787  tok->type = TK_CODE_POINT;
3788  tok->u.code = (OnigCodePoint )c2;
3789  }
3790  else { /* string */
3791  p = tok->backp + enclen(enc, tok->backp, end);
3792  }
3793  }
3794  break;
3795  }
3796  }
3797  else {
3798  tok->u.c = c;
3799  tok->escaped = 0;
3800 
3801 #ifdef USE_VARIABLE_META_CHARS
3802  if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
3803  IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
3804  if (c == MC_ANYCHAR(syn))
3805  goto any_char;
3806  else if (c == MC_ANYTIME(syn))
3807  goto anytime;
3808  else if (c == MC_ZERO_OR_ONE_TIME(syn))
3809  goto zero_or_one_time;
3810  else if (c == MC_ONE_OR_MORE_TIME(syn))
3811  goto one_or_more_time;
3812  else if (c == MC_ANYCHAR_ANYTIME(syn)) {
3813  tok->type = TK_ANYCHAR_ANYTIME;
3814  goto out;
3815  }
3816  }
3817 #endif
3818 
3819  switch (c) {
3820  case '.':
3821  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break;
3822 #ifdef USE_VARIABLE_META_CHARS
3823  any_char:
3824 #endif
3825  tok->type = TK_ANYCHAR;
3826  break;
3827 
3828  case '*':
3829  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break;
3830 #ifdef USE_VARIABLE_META_CHARS
3831  anytime:
3832 #endif
3833  tok->type = TK_OP_REPEAT;
3834  tok->u.repeat.lower = 0;
3835  tok->u.repeat.upper = REPEAT_INFINITE;
3836  goto greedy_check;
3837  break;
3838 
3839  case '+':
3840  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break;
3841 #ifdef USE_VARIABLE_META_CHARS
3842  one_or_more_time:
3843 #endif
3844  tok->type = TK_OP_REPEAT;
3845  tok->u.repeat.lower = 1;
3846  tok->u.repeat.upper = REPEAT_INFINITE;
3847  goto greedy_check;
3848  break;
3849 
3850  case '?':
3851  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break;
3852 #ifdef USE_VARIABLE_META_CHARS
3853  zero_or_one_time:
3854 #endif
3855  tok->type = TK_OP_REPEAT;
3856  tok->u.repeat.lower = 0;
3857  tok->u.repeat.upper = 1;
3858  goto greedy_check;
3859  break;
3860 
3861  case '{':
3862  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
3863  r = fetch_range_quantifier(&p, end, tok, env);
3864  if (r < 0) return r; /* error */
3865  if (r == 0) goto greedy_check;
3866  else if (r == 2) { /* {n} */
3867  if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
3868  goto possessive_check;
3869 
3870  goto greedy_check;
3871  }
3872  /* r == 1 : normal char */
3873  break;
3874 
3875  case '|':
3876  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break;
3877  tok->type = TK_ALT;
3878  break;
3879 
3880  case '(':
3881  if (PPEEK_IS('?') &&
3882  IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
3883  PINC;
3884  if (PPEEK_IS('#')) {
3885  PFETCH(c);
3886  while (1) {
3887  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
3888  PFETCH(c);
3889  if (c == MC_ESC(syn)) {
3890  if (!PEND) PFETCH(c);
3891  }
3892  else {
3893  if (c == ')') break;
3894  }
3895  }
3896  goto start;
3897  }
3898 #ifdef USE_PERL_SUBEXP_CALL
3899  /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */
3900  c = PPEEK;
3901  if ((c == '&' || c == 'R' || ONIGENC_IS_CODE_DIGIT(enc, c)) &&
3902  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3903  /* (?&name), (?n), (?R), (?0) */
3904  int gnum;
3905  UChar *name;
3906  UChar *name_end;
3907 
3908  if (c == 'R' || c == '0') {
3909  PINC; /* skip 'R' / '0' */
3910  if (!PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME;
3911  PINC; /* skip ')' */
3912  name_end = name = p;
3913  gnum = 0;
3914  }
3915  else {
3916  int numref = 1;
3917  if (c == '&') { /* (?&name) */
3918  PINC;
3919  numref = 0; /* don't allow number name */
3920  }
3921  name = p;
3922  r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, numref);
3923  if (r < 0) return r;
3924  }
3925 
3926  tok->type = TK_CALL;
3927  tok->u.call.name = name;
3928  tok->u.call.name_end = name_end;
3929  tok->u.call.gnum = gnum;
3930  tok->u.call.rel = 0;
3931  break;
3932  }
3933  else if ((c == '-' || c == '+') &&
3934  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_SUBEXP_CALL)) {
3935  /* (?+n), (?-n) */
3936  int gnum;
3937  UChar *name;
3938  UChar *name_end;
3939  OnigCodePoint cnext;
3940  PFETCH_READY;
3941 
3942  PINC; /* skip '-' / '+' */
3943  cnext = PPEEK;
3944  if (ONIGENC_IS_CODE_DIGIT(enc, cnext)) {
3945  if (c == '-') PUNFETCH;
3946  name = p;
3947  r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 1);
3948  if (r < 0) return r;
3949 
3950  tok->type = TK_CALL;
3951  tok->u.call.name = name;
3952  tok->u.call.name_end = name_end;
3953  tok->u.call.gnum = gnum;
3954  tok->u.call.rel = 1;
3955  break;
3956  }
3957  }
3958 #endif /* USE_PERL_SUBEXP_CALL */
3959 #ifdef USE_CAPITAL_P_NAMED_GROUP
3960  if (PPEEK_IS('P') &&
3961  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
3962  int gnum;
3963  UChar *name;
3964  UChar *name_end;
3965  PFETCH_READY;
3966 
3967  PINC; /* skip 'P' */
3968  if (PEND) return ONIGERR_UNDEFINED_GROUP_OPTION;
3969  PFETCH(c);
3970  if (c == '=') { /* (?P=name): backref */
3971  r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env);
3972  if (r < 0) return r;
3973  break;
3974  }
3975  else if (c == '>') { /* (?P>name): subexp call */
3976  name = p;
3977  r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &gnum, 0);
3978  if (r < 0) return r;
3979 
3980  tok->type = TK_CALL;
3981  tok->u.call.name = name;
3982  tok->u.call.name_end = name_end;
3983  tok->u.call.gnum = gnum;
3984  tok->u.call.rel = 0;
3985  break;
3986  }
3987  }
3988 #endif /* USE_CAPITAL_P_NAMED_GROUP */
3989  PUNFETCH;
3990  }
3991 
3992  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3993  tok->type = TK_SUBEXP_OPEN;
3994  break;
3995 
3996  case ')':
3997  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;
3998  tok->type = TK_SUBEXP_CLOSE;
3999  break;
4000 
4001  case '^':
4002  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
4003  tok->type = TK_ANCHOR;
4004  tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
4005  ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE);
4006  break;
4007 
4008  case '$':
4009  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break;
4010  tok->type = TK_ANCHOR;
4011  tok->u.anchor.subtype = (IS_SINGLELINE(env->option)
4012  ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE);
4013  break;
4014 
4015  case '[':
4016  if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break;
4017  tok->type = TK_CC_OPEN;
4018  break;
4019 
4020  case ']':
4021  if (*src > env->pattern) /* /].../ is allowed. */
4022  CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
4023  break;
4024 
4025  case '#':
4026  if (IS_EXTEND(env->option)) {
4027  while (!PEND) {
4028  PFETCH(c);
4029  if (ONIGENC_IS_CODE_NEWLINE(enc, c))
4030  break;
4031  }
4032  goto start;
4033  break;
4034  }
4035  break;
4036 
4037  case ' ': case '\t': case '\n': case '\r': case '\f':
4038  if (IS_EXTEND(env->option))
4039  goto start;
4040  break;
4041 
4042  default:
4043  /* string */
4044  break;
4045  }
4046  }
4047 
4048 #ifdef USE_VARIABLE_META_CHARS
4049  out:
4050 #endif
4051  *src = p;
4052  return tok->type;
4053 }
4054 
4055 static int
4056 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not,
4057  ScanEnv* env,
4058  OnigCodePoint sb_out, const OnigCodePoint mbr[])
4059 {
4060  int i, r;
4061  OnigCodePoint j;
4062 
4063  int n = ONIGENC_CODE_RANGE_NUM(mbr);
4064 
4065  if (not == 0) {
4066  for (i = 0; i < n; i++) {
4067  for (j = ONIGENC_CODE_RANGE_FROM(mbr, i);
4068  j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) {
4069  if (j >= sb_out) {
4070  if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4071  r = add_code_range_to_buf(&(cc->mbuf), env, j,
4072  ONIGENC_CODE_RANGE_TO(mbr, i));
4073  if (r != 0) return r;
4074  i++;
4075  }
4076 
4077  goto sb_end;
4078  }
4079  BITSET_SET_BIT_CHKDUP(cc->bs, j);
4080  }
4081  }
4082 
4083  sb_end:
4084  for ( ; i < n; i++) {
4085  r = add_code_range_to_buf(&(cc->mbuf), env,
4086  ONIGENC_CODE_RANGE_FROM(mbr, i),
4087  ONIGENC_CODE_RANGE_TO(mbr, i));
4088  if (r != 0) return r;
4089  }
4090  }
4091  else {
4092  OnigCodePoint prev = 0;
4093 
4094  for (i = 0; i < n; i++) {
4095  for (j = prev;
4096  j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) {
4097  if (j >= sb_out) {
4098  goto sb_end2;
4099  }
4100  BITSET_SET_BIT_CHKDUP(cc->bs, j);
4101  }
4102  prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4103  }
4104  for (j = prev; j < sb_out; j++) {
4105  BITSET_SET_BIT_CHKDUP(cc->bs, j);
4106  }
4107 
4108  sb_end2:
4109  prev = sb_out;
4110 
4111  for (i = 0; i < n; i++) {
4112  if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) {
4113  r = add_code_range_to_buf(&(cc->mbuf), env, prev,
4114  ONIGENC_CODE_RANGE_FROM(mbr, i) - 1);
4115  if (r != 0) return r;
4116  }
4117  prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1;
4118  }
4119  if (prev < 0x7fffffff) {
4120  r = add_code_range_to_buf(&(cc->mbuf), env, prev, 0x7fffffff);
4121  if (r != 0) return r;
4122  }
4123  }
4124 
4125  return 0;
4126 }
4127 
4128 static int
4129 add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* env)
4130 {
4131  int maxcode;
4132  int c, r;
4133  const OnigCodePoint *ranges;
4134  OnigCodePoint sb_out;
4135  OnigEncoding enc = env->enc;
4136 
4137  r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
4138  if (r == 0) {
4139  if (ascii_range) {
4140  CClassNode ccwork;
4141  initialize_cclass(&ccwork);
4142  r = add_ctype_to_cc_by_range(&ccwork, ctype, not, env, sb_out,
4143  ranges);
4144  if (r == 0) {
4145  if (not) {
4146  r = add_code_range_to_buf0(&(ccwork.mbuf), env, 0x80, ONIG_LAST_CODE_POINT, FALSE);
4147  }
4148  else {
4149  CClassNode ccascii;
4150  initialize_cclass(&ccascii);
4151  if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
4152  r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F);
4153  }
4154  else {
4155  bitset_set_range(env, ccascii.bs, 0x00, 0x7F);
4156  r = 0;
4157  }
4158  if (r == 0) {
4159  r = and_cclass(&ccwork, &ccascii, env);
4160  }
4161  if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf);
4162  }
4163  if (r == 0) {
4164  r = or_cclass(cc, &ccwork, env);
4165  }
4166  if (IS_NOT_NULL(ccwork.mbuf)) bbuf_free(ccwork.mbuf);
4167  }
4168  }
4169  else {
4170  r = add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
4171  }
4172  return r;
4173  }
4174  else if (r != ONIG_NO_SUPPORT_CONFIG) {
4175  return r;
4176  }
4177 
4178  maxcode = ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
4179  r = 0;
4180  switch (ctype) {
4181  case ONIGENC_CTYPE_ALPHA:
4182  case ONIGENC_CTYPE_BLANK:
4183  case ONIGENC_CTYPE_CNTRL:
4184  case ONIGENC_CTYPE_DIGIT:
4185  case ONIGENC_CTYPE_LOWER:
4186  case ONIGENC_CTYPE_PUNCT:
4187  case ONIGENC_CTYPE_SPACE:
4188  case ONIGENC_CTYPE_UPPER:
4189  case ONIGENC_CTYPE_XDIGIT:
4190  case ONIGENC_CTYPE_ASCII:
4191  case ONIGENC_CTYPE_ALNUM:
4192  if (not != 0) {
4193  for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4194  if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4195  BITSET_SET_BIT_CHKDUP(cc->bs, c);
4196  }
4197  ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4198  }
4199  else {
4200  for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4201  if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4202  BITSET_SET_BIT_CHKDUP(cc->bs, c);
4203  }
4204  }
4205  break;
4206 
4207  case ONIGENC_CTYPE_GRAPH:
4208  case ONIGENC_CTYPE_PRINT:
4209  if (not != 0) {
4210  for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4211  if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)
4212  || c >= maxcode)
4213  BITSET_SET_BIT_CHKDUP(cc->bs, c);
4214  }
4215  if (ascii_range)
4216  ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4217  }
4218  else {
4219  for (c = 0; c < maxcode; c++) {
4220  if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype))
4221  BITSET_SET_BIT_CHKDUP(cc->bs, c);
4222  }
4223  if (! ascii_range)
4224  ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4225  }
4226  break;
4227 
4228  case ONIGENC_CTYPE_WORD:
4229  if (not == 0) {
4230  for (c = 0; c < maxcode; c++) {
4231  if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c);
4232  }
4233  if (! ascii_range)
4234  ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4235  }
4236  else {
4237  for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
4238  if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */
4239  && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode))
4240  BITSET_SET_BIT_CHKDUP(cc->bs, c);
4241  }
4242  if (ascii_range)
4243  ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
4244  }
4245  break;
4246 
4247  default:
4248  return ONIGERR_PARSER_BUG;
4249  break;
4250  }
4251 
4252  return r;
4253 }
4254 
4255 static int
4256 parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc,
4257  UChar** src, UChar* end, ScanEnv* env)
4258 {
4259 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20
4260 #define POSIX_BRACKET_NAME_MIN_LEN 4
4261 
4262  static const PosixBracketEntryType PBS[] = {
4263  POSIX_BRACKET_ENTRY_INIT("alnum", ONIGENC_CTYPE_ALNUM),
4264  POSIX_BRACKET_ENTRY_INIT("alpha", ONIGENC_CTYPE_ALPHA),
4265  POSIX_BRACKET_ENTRY_INIT("blank", ONIGENC_CTYPE_BLANK),
4266  POSIX_BRACKET_ENTRY_INIT("cntrl", ONIGENC_CTYPE_CNTRL),
4267  POSIX_BRACKET_ENTRY_INIT("digit", ONIGENC_CTYPE_DIGIT),
4268  POSIX_BRACKET_ENTRY_INIT("graph", ONIGENC_CTYPE_GRAPH),
4269  POSIX_BRACKET_ENTRY_INIT("lower", ONIGENC_CTYPE_LOWER),
4270  POSIX_BRACKET_ENTRY_INIT("print", ONIGENC_CTYPE_PRINT),
4271  POSIX_BRACKET_ENTRY_INIT("punct", ONIGENC_CTYPE_PUNCT),
4272  POSIX_BRACKET_ENTRY_INIT("space", ONIGENC_CTYPE_SPACE),
4273  POSIX_BRACKET_ENTRY_INIT("upper", ONIGENC_CTYPE_UPPER),
4274  POSIX_BRACKET_ENTRY_INIT("xdigit", ONIGENC_CTYPE_XDIGIT),
4275  POSIX_BRACKET_ENTRY_INIT("ascii", ONIGENC_CTYPE_ASCII),
4276  POSIX_BRACKET_ENTRY_INIT("word", ONIGENC_CTYPE_WORD),
4277  };
4278 
4279  const PosixBracketEntryType *pb;
4280  int not, i, r;
4281  int ascii_range;
4282  OnigCodePoint c;
4283  OnigEncoding enc = env->enc;
4284  UChar *p = *src;
4285 
4286  if (PPEEK_IS('^')) {
4287  PINC_S;
4288  not = 1;
4289  }
4290  else
4291  not = 0;
4292 
4293  if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3)
4294  goto not_posix_bracket;
4295 
4296  ascii_range = IS_ASCII_RANGE(env->option) &&
4297  ! IS_POSIX_BRACKET_ALL_RANGE(env->option);
4298  for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
4299  if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
4300  p = (UChar* )onigenc_step(enc, p, end, pb->len);
4301  if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0)
4302  return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4303 
4304  r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env);
4305  if (r != 0) return r;
4306 
4307  if (IS_NOT_NULL(asc_cc)) {
4308  if (pb->ctype != ONIGENC_CTYPE_WORD &&
4309  pb->ctype != ONIGENC_CTYPE_ASCII &&
4310  !ascii_range)
4311  r = add_ctype_to_cc(asc_cc, pb->ctype, not, ascii_range, env);
4312  if (r != 0) return r;
4313  }
4314 
4315  PINC_S; PINC_S;
4316  *src = p;
4317  return 0;
4318  }
4319  }
4320 
4321  not_posix_bracket:
4322  c = 0;
4323  i = 0;
4324  while (!PEND && ((c = PPEEK) != ':') && c != ']') {
4325  PINC_S;
4326  if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
4327  }
4328  if (c == ':' && ! PEND) {
4329  PINC_S;
4330  if (! PEND) {
4331  PFETCH_S(c);
4332  if (c == ']')
4333  return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
4334  }
4335  }
4336 
4337  return 1; /* 1: is not POSIX bracket, but no error. */
4338 }
4339 
4340 static int
4341 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
4342 {
4343  int r;
4344  OnigCodePoint c;
4345  OnigEncoding enc = env->enc;
4346  UChar *prev, *start, *p = *src;
4347 
4348  r = ONIGERR_INVALID_CHAR_PROPERTY_NAME;
4349  start = prev = p;
4350 
4351  while (!PEND) {
4352  prev = p;
4353  PFETCH_S(c);
4354  if (c == '}') {
4355  r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev);
4356  if (r < 0) break;
4357 
4358  *src = p;
4359  return r;
4360  }
4361  else if (c == '(' || c == ')' || c == '{' || c == '|') {
4362  break;
4363  }
4364  }
4365 
4366  onig_scan_env_set_error_string(env, r, *src, prev);
4367  return r;
4368 }
4369 
4370 static int cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env);
4371 
4372 static int
4373 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end,
4374  ScanEnv* env)
4375 {
4376  int r, ctype;
4377  CClassNode* cc;
4378 
4379  ctype = fetch_char_property_to_ctype(src, end, env);
4380  if (ctype < 0) return ctype;
4381 
4382  *np = node_new_cclass();
4383  CHECK_NULL_RETURN_MEMERR(*np);
4384  cc = NCCLASS(*np);
4385  r = add_ctype_to_cc(cc, ctype, 0, 0, env);
4386  if (r != 0) return r;
4387  if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
4388 
4389  if (IS_IGNORECASE(env->option)) {
4390  if (ctype != ONIGENC_CTYPE_ASCII)
4391  r = cclass_case_fold(np, cc, cc, env);
4392  }
4393  return r;
4394 }
4395 
4396 
/* Progress of the character class parser between two tokens. */
enum CCSTATE {
  CCS_VALUE    = 0,  /* a value or a class was read; a value is still pending */
  CCS_RANGE    = 1,  /* a range operator followed a value; upper bound expected */
  CCS_COMPLETE = 2,  /* a range has just been finished */
  CCS_START    = 3   /* nothing read yet (start of the class or after "&&") */
};
4403 
/* Kind of the item most recently read inside a character class. */
enum CCVALTYPE {
  CCV_SB         = 0,  /* single-byte value, recorded in the bitset */
  CCV_CODE_POINT = 1,  /* code point, recorded in the code range buffer */
  CCV_CLASS      = 2   /* a whole class (ctype, property, POSIX bracket) */
};
4409 
4410 static int
4411 next_state_class(CClassNode* cc, CClassNode* asc_cc,
4412  OnigCodePoint* vs, enum CCVALTYPE* type,
4413  enum CCSTATE* state, ScanEnv* env)
4414 {
4415  int r;
4416 
4417  if (*state == CCS_RANGE)
4418  return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE;
4419 
4420  if (*state == CCS_VALUE && *type != CCV_CLASS) {
4421  if (*type == CCV_SB) {
4422  BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*vs));
4423  if (IS_NOT_NULL(asc_cc))
4424  BITSET_SET_BIT(asc_cc->bs, (int )(*vs));
4425  }
4426  else if (*type == CCV_CODE_POINT) {
4427  r = add_code_range(&(cc->mbuf), env, *vs, *vs);
4428  if (r < 0) return r;
4429  if (IS_NOT_NULL(asc_cc)) {
4430  r = add_code_range0(&(asc_cc->mbuf), env, *vs, *vs, 0);
4431  if (r < 0) return r;
4432  }
4433  }
4434  }
4435 
4436  *state = CCS_VALUE;
4437  *type = CCV_CLASS;
4438  return 0;
4439 }
4440 
4441 static int
4442 next_state_val(CClassNode* cc, CClassNode* asc_cc,
4443  OnigCodePoint *from, OnigCodePoint to,
4444  int* from_israw, int to_israw,
4445  enum CCVALTYPE intype, enum CCVALTYPE* type,
4446  enum CCSTATE* state, ScanEnv* env)
4447 {
4448  int r;
4449 
4450  switch (*state) {
4451  case CCS_VALUE:
4452  if (*type == CCV_SB) {
4453  BITSET_SET_BIT_CHKDUP(cc->bs, (int )(*from));
4454  if (IS_NOT_NULL(asc_cc))
4455  BITSET_SET_BIT(asc_cc->bs, (int )(*from));
4456  }
4457  else if (*type == CCV_CODE_POINT) {
4458  r = add_code_range(&(cc->mbuf), env, *from, *from);
4459  if (r < 0) return r;
4460  if (IS_NOT_NULL(asc_cc)) {
4461  r = add_code_range0(&(asc_cc->mbuf), env, *from, *from, 0);
4462  if (r < 0) return r;
4463  }
4464  }
4465  break;
4466 
4467  case CCS_RANGE:
4468  if (intype == *type) {
4469  if (intype == CCV_SB) {
4470  if (*from > 0xff || to > 0xff)
4471  return ONIGERR_INVALID_CODE_POINT_VALUE;
4472 
4473  if (*from > to) {
4474  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4475  goto ccs_range_end;
4476  else
4477  return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4478  }
4479  bitset_set_range(env, cc->bs, (int )*from, (int )to);
4480  if (IS_NOT_NULL(asc_cc))
4481  bitset_set_range(env, asc_cc->bs, (int )*from, (int )to);
4482  }
4483  else {
4484  r = add_code_range(&(cc->mbuf), env, *from, to);
4485  if (r < 0) return r;
4486  if (IS_NOT_NULL(asc_cc)) {
4487  r = add_code_range0(&(asc_cc->mbuf), env, *from, to, 0);
4488  if (r < 0) return r;
4489  }
4490  }
4491  }
4492  else {
4493  if (*from > to) {
4494  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
4495  goto ccs_range_end;
4496  else
4497  return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
4498  }
4499  bitset_set_range(env, cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4500  r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*from, to);
4501  if (r < 0) return r;
4502  if (IS_NOT_NULL(asc_cc)) {
4503  bitset_set_range(env, asc_cc->bs, (int )*from, (int )(to < 0xff ? to : 0xff));
4504  r = add_code_range0(&(asc_cc->mbuf), env, (OnigCodePoint )*from, to, 0);
4505  if (r < 0) return r;
4506  }
4507  }
4508  ccs_range_end:
4509  *state = CCS_COMPLETE;
4510  break;
4511 
4512  case CCS_COMPLETE:
4513  case CCS_START:
4514  *state = CCS_VALUE;
4515  break;
4516 
4517  default:
4518  break;
4519  }
4520 
4521  *from_israw = to_israw;
4522  *from = to;
4523  *type = intype;
4524  return 0;
4525 }
4526 
4527 static int
4528 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
4529  ScanEnv* env)
4530 {
4531  int in_esc;
4532  OnigCodePoint code;
4533  OnigEncoding enc = env->enc;
4534  UChar* p = from;
4535 
4536  in_esc = 0;
4537  while (! PEND) {
4538  if (ignore_escaped && in_esc) {
4539  in_esc = 0;
4540  }
4541  else {
4542  PFETCH_S(code);
4543  if (code == c) return 1;
4544  if (code == MC_ESC(env->syntax)) in_esc = 1;
4545  }
4546  }
4547  return 0;
4548 }
4549 
4550 static int
4551 parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* end,
4552  ScanEnv* env)
4553 {
4554  int r, neg, len, fetched, and_start;
4555  OnigCodePoint v, vs;
4556  UChar *p;
4557  Node* node;
4558  Node* asc_node;
4559  CClassNode *cc, *prev_cc;
4560  CClassNode *asc_cc, *asc_prev_cc;
4561  CClassNode work_cc, asc_work_cc;
4562 
4563  enum CCSTATE state;
4564  enum CCVALTYPE val_type, in_type;
4565  int val_israw, in_israw;
4566 
4567  *np = *asc_np = NULL_NODE;
4568  env->parse_depth++;
4569  if (env->parse_depth > ParseDepthLimit)
4570  return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
4571  prev_cc = asc_prev_cc = (CClassNode* )NULL;
4572  r = fetch_token_in_cc(tok, src, end, env);
4573  if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
4574  neg = 1;
4575  r = fetch_token_in_cc(tok, src, end, env);
4576  }
4577  else {
4578  neg = 0;
4579  }
4580 
4581  if (r < 0) return r;
4582  if (r == TK_CC_CLOSE) {
4583  if (! code_exist_check((OnigCodePoint )']',
4584  *src, env->pattern_end, 1, env))
4585  return ONIGERR_EMPTY_CHAR_CLASS;
4586 
4587  CC_ESC_WARN(env, (UChar* )"]");
4588  r = tok->type = TK_CHAR; /* allow []...] */
4589  }
4590 
4591  *np = node = node_new_cclass();
4592  CHECK_NULL_RETURN_MEMERR(node);
4593  cc = NCCLASS(node);
4594 
4595  if (IS_IGNORECASE(env->option)) {
4596  *asc_np = asc_node = node_new_cclass();
4597  CHECK_NULL_RETURN_MEMERR(asc_node);
4598  asc_cc = NCCLASS(asc_node);
4599  }
4600  else {
4601  asc_node = NULL_NODE;
4602  asc_cc = NULL;
4603  }
4604 
4605  and_start = 0;
4606  state = CCS_START;
4607  p = *src;
4608  while (r != TK_CC_CLOSE) {
4609  fetched = 0;
4610  switch (r) {
4611  case TK_CHAR:
4612  if ((tok->u.code >= SINGLE_BYTE_SIZE) ||
4613  (len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c)) > 1) {
4614  in_type = CCV_CODE_POINT;
4615  }
4616  else if (len < 0) {
4617  r = len;
4618  goto err;
4619  }
4620  else {
4621  sb_char:
4622  in_type = CCV_SB;
4623  }
4624  v = (OnigCodePoint )tok->u.c;
4625  in_israw = 0;
4626  goto val_entry2;
4627  break;
4628 
4629  case TK_RAW_BYTE:
4630  /* tok->base != 0 : octal or hexadec. */
4631  if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
4632  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4633  UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
4634  UChar* psave = p;
4635  int i, base = tok->base;
4636 
4637  buf[0] = (UChar )tok->u.c;
4638  for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
4639  r = fetch_token_in_cc(tok, &p, end, env);
4640  if (r < 0) goto err;
4641  if (r != TK_RAW_BYTE || tok->base != base) {
4642  fetched = 1;
4643  break;
4644  }
4645  buf[i] = (UChar )tok->u.c;
4646  }
4647 
4648  if (i < ONIGENC_MBC_MINLEN(env->enc)) {
4649  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4650  goto err;
4651  }
4652 
4653  len = enclen(env->enc, buf, buf + i);
4654  if (i < len) {
4655  r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
4656  goto err;
4657  }
4658  else if (i > len) { /* fetch back */
4659  p = psave;
4660  for (i = 1; i < len; i++) {
4661  (void)fetch_token_in_cc(tok, &p, end, env);
4662  /* no need to check the return value (already checked above) */
4663  }
4664  fetched = 0;
4665  }
4666 
4667  if (i == 1) {
4668  v = (OnigCodePoint )buf[0];
4669  goto raw_single;
4670  }
4671  else {
4672  v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
4673  in_type = CCV_CODE_POINT;
4674  }
4675  }
4676  else {
4677  v = (OnigCodePoint )tok->u.c;
4678  raw_single:
4679  in_type = CCV_SB;
4680  }
4681  in_israw = 1;
4682  goto val_entry2;
4683  break;
4684 
4685  case TK_CODE_POINT:
4686  v = tok->u.code;
4687  in_israw = 1;
4688  val_entry:
4689  len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);
4690  if (len < 0) {
4691  r = len;
4692  goto err;
4693  }
4694  in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);
4695  val_entry2:
4696  r = next_state_val(cc, asc_cc, &vs, v, &val_israw, in_israw, in_type, &val_type,
4697  &state, env);
4698  if (r != 0) goto err;
4699  break;
4700 
4701  case TK_POSIX_BRACKET_OPEN:
4702  r = parse_posix_bracket(cc, asc_cc, &p, end, env);
4703  if (r < 0) goto err;
4704  if (r == 1) { /* is not POSIX bracket */
4705  CC_ESC_WARN(env, (UChar* )"[");
4706  p = tok->backp;
4707  v = (OnigCodePoint )tok->u.c;
4708  in_israw = 0;
4709  goto val_entry;
4710  }
4711  goto next_class;
4712  break;
4713 
4714  case TK_CHAR_TYPE:
4715  r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not,
4716  IS_ASCII_RANGE(env->option), env);
4717  if (r != 0) return r;
4718  if (IS_NOT_NULL(asc_cc)) {
4719  if (tok->u.prop.ctype != ONIGENC_CTYPE_WORD)
4720  r = add_ctype_to_cc(asc_cc, tok->u.prop.ctype, tok->u.prop.not,
4721  IS_ASCII_RANGE(env->option), env);
4722  if (r != 0) return r;
4723  }
4724 
4725  next_class:
4726  r = next_state_class(cc, asc_cc, &vs, &val_type, &state, env);
4727  if (r != 0) goto err;
4728  break;
4729 
4730  case TK_CHAR_PROPERTY:
4731  {
4732  int ctype;
4733 
4734  ctype = fetch_char_property_to_ctype(&p, end, env);
4735  if (ctype < 0) return ctype;
4736  r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, 0, env);
4737  if (r != 0) return r;
4738  if (IS_NOT_NULL(asc_cc)) {
4739  if (ctype != ONIGENC_CTYPE_ASCII)
4740  r = add_ctype_to_cc(asc_cc, ctype, tok->u.prop.not, 0, env);
4741  if (r != 0) return r;
4742  }
4743  goto next_class;
4744  }
4745  break;
4746 
4747  case TK_CC_RANGE:
4748  if (state == CCS_VALUE) {
4749  r = fetch_token_in_cc(tok, &p, end, env);
4750  if (r < 0) goto err;
4751  fetched = 1;
4752  if (r == TK_CC_CLOSE) { /* allow [x-] */
4753  range_end_val:
4754  v = (OnigCodePoint )'-';
4755  in_israw = 0;
4756  goto val_entry;
4757  }
4758  else if (r == TK_CC_AND) {
4759  CC_ESC_WARN(env, (UChar* )"-");
4760  goto range_end_val;
4761  }
4762 
4763  if (val_type == CCV_CLASS) {
4764  r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4765  goto err;
4766  }
4767 
4768  state = CCS_RANGE;
4769  }
4770  else if (state == CCS_START) {
4771  /* [-xa] is allowed */
4772  v = (OnigCodePoint )tok->u.c;
4773  in_israw = 0;
4774 
4775  r = fetch_token_in_cc(tok, &p, end, env);
4776  if (r < 0) goto err;
4777  fetched = 1;
4778  /* [--x] or [a&&-x] is warned. */
4779  if (r == TK_CC_RANGE || and_start != 0)
4780  CC_ESC_WARN(env, (UChar* )"-");
4781 
4782  goto val_entry;
4783  }
4784  else if (state == CCS_RANGE) {
4785  CC_ESC_WARN(env, (UChar* )"-");
4786  goto sb_char; /* [!--x] is allowed */
4787  }
4788  else { /* CCS_COMPLETE */
4789  r = fetch_token_in_cc(tok, &p, end, env);
4790  if (r < 0) goto err;
4791  fetched = 1;
4792  if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */
4793  else if (r == TK_CC_AND) {
4794  CC_ESC_WARN(env, (UChar* )"-");
4795  goto range_end_val;
4796  }
4797 
4798  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
4799  CC_ESC_WARN(env, (UChar* )"-");
4800  goto range_end_val; /* [0-9-a] is allowed as [0-9\-a] */
4801  }
4802  r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
4803  goto err;
4804  }
4805  break;
4806 
4807  case TK_CC_CC_OPEN: /* [ */
4808  {
4809  Node *anode, *aasc_node;
4810  CClassNode* acc;
4811 
4812  r = parse_char_class(&anode, &aasc_node, tok, &p, end, env);
4813  if (r == 0) {
4814  acc = NCCLASS(anode);
4815  r = or_cclass(cc, acc, env);
4816  }
4817  if (r == 0 && IS_NOT_NULL(aasc_node)) {
4818  acc = NCCLASS(aasc_node);
4819  r = or_cclass(asc_cc, acc, env);
4820  }
4821  onig_node_free(anode);
4822  onig_node_free(aasc_node);
4823  if (r != 0) goto err;
4824  }
4825  break;
4826 
4827  case TK_CC_AND: /* && */
4828  {
4829  if (state == CCS_VALUE) {
4830  r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4831  &val_type, &state, env);
4832  if (r != 0) goto err;
4833  }
4834  /* initialize local variables */
4835  and_start = 1;
4836  state = CCS_START;
4837 
4838  if (IS_NOT_NULL(prev_cc)) {
4839  r = and_cclass(prev_cc, cc, env);
4840  if (r != 0) goto err;
4841  bbuf_free(cc->mbuf);
4842  if (IS_NOT_NULL(asc_cc)) {
4843  r = and_cclass(asc_prev_cc, asc_cc, env);
4844  if (r != 0) goto err;
4845  bbuf_free(asc_cc->mbuf);
4846  }
4847  }
4848  else {
4849  prev_cc = cc;
4850  cc = &work_cc;
4851  if (IS_NOT_NULL(asc_cc)) {
4852  asc_prev_cc = asc_cc;
4853  asc_cc = &asc_work_cc;
4854  }
4855  }
4856  initialize_cclass(cc);
4857  if (IS_NOT_NULL(asc_cc))
4858  initialize_cclass(asc_cc);
4859  }
4860  break;
4861 
4862  case TK_EOT:
4863  r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS;
4864  goto err;
4865  break;
4866  default:
4867  r = ONIGERR_PARSER_BUG;
4868  goto err;
4869  break;
4870  }
4871 
4872  if (fetched)
4873  r = tok->type;
4874  else {
4875  r = fetch_token_in_cc(tok, &p, end, env);
4876  if (r < 0) goto err;
4877  }
4878  }
4879 
4880  if (state == CCS_VALUE) {
4881  r = next_state_val(cc, asc_cc, &vs, 0, &val_israw, 0, val_type,
4882  &val_type, &state, env);
4883  if (r != 0) goto err;
4884  }
4885 
4886  if (IS_NOT_NULL(prev_cc)) {
4887  r = and_cclass(prev_cc, cc, env);
4888  if (r != 0) goto err;
4889  bbuf_free(cc->mbuf);
4890  cc = prev_cc;
4891  if (IS_NOT_NULL(asc_cc)) {
4892  r = and_cclass(asc_prev_cc, asc_cc, env);
4893  if (r != 0) goto err;
4894  bbuf_free(asc_cc->mbuf);
4895  asc_cc = asc_prev_cc;
4896  }
4897  }
4898 
4899  if (neg != 0) {
4900  NCCLASS_SET_NOT(cc);
4901  if (IS_NOT_NULL(asc_cc))
4902  NCCLASS_SET_NOT(asc_cc);
4903  }
4904  else {
4905  NCCLASS_CLEAR_NOT(cc);
4906  if (IS_NOT_NULL(asc_cc))
4907  NCCLASS_CLEAR_NOT(asc_cc);
4908  }
4909  if (IS_NCCLASS_NOT(cc) &&
4910  IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) {
4911  int is_empty;
4912 
4913  is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
4914  if (is_empty != 0)
4915  BITSET_IS_EMPTY(cc->bs, is_empty);
4916 
4917  if (is_empty == 0) {
4918 #define NEWLINE_CODE 0x0a
4919 
4920  if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
4921  if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
4922  BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE);
4923  else {
4924  r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
4925  if (r < 0) goto err;
4926  }
4927  }
4928  }
4929  }
4930  *src = p;
4931  env->parse_depth--;
4932  return 0;
4933 
4934  err:
4935  if (cc != NCCLASS(*np))
4936  bbuf_free(cc->mbuf);
4937  if (IS_NOT_NULL(asc_cc) && (asc_cc != NCCLASS(*asc_np)))
4938  bbuf_free(asc_cc->mbuf);
4939  return r;
4940 }
4941 
4942 static int parse_subexp(Node** top, OnigToken* tok, int term,
4943  UChar** src, UChar* end, ScanEnv* env);
4944 
4945 static int
4946 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
4947  ScanEnv* env)
4948 {
4949  int r = 0, num;
4950  Node *target, *work1 = NULL, *work2 = NULL;
4951  OnigOptionType option;
4952  OnigCodePoint c;
4953  OnigEncoding enc = env->enc;
4954 
4955 #ifdef USE_NAMED_GROUP
4956  int list_capture;
4957 #endif
4958 
4959  UChar* p = *src;
4960  PFETCH_READY;
4961 
4962  *np = NULL;
4963  if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
4964 
4965  option = env->option;
4966  if (PPEEK_IS('?') &&
4967  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
4968  PINC;
4969  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
4970 
4971  PFETCH(c);
4972  switch (c) {
4973  case ':': /* (?:...) grouping only */
4974  group:
4975  r = fetch_token(tok, &p, end, env);
4976  if (r < 0) return r;
4977  r = parse_subexp(np, tok, term, &p, end, env);
4978  if (r < 0) return r;
4979  *src = p;
4980  return 1; /* group */
4981  break;
4982 
4983  case '=':
4984  *np = onig_node_new_anchor(ANCHOR_PREC_READ);
4985  break;
4986  case '!': /* preceding read */
4987  *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT);
4988  break;
4989  case '>': /* (?>...) stop backtrack */
4990  *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
4991  break;
4992  case '~': /* (?~...) absent operator */
4993  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_TILDE_ABSENT)) {
4994  *np = node_new_enclose(ENCLOSE_ABSENT);
4995  }
4996  else {
4997  return ONIGERR_UNDEFINED_GROUP_OPTION;
4998  }
4999  break;
5000 
5001 #ifdef USE_NAMED_GROUP
5002  case '\'':
5003  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5004  goto named_group1;
5005  }
5006  else
5007  return ONIGERR_UNDEFINED_GROUP_OPTION;
5008  break;
5009 
5010 # ifdef USE_CAPITAL_P_NAMED_GROUP
5011  case 'P': /* (?P<name>...) */
5012  if (!PEND &&
5013  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) {
5014  PFETCH(c);
5015  if (c == '<') goto named_group1;
5016  }
5017  return ONIGERR_UNDEFINED_GROUP_OPTION;
5018  break;
5019 # endif
5020 #endif
5021 
5022  case '<': /* look behind (?<=...), (?<!...) */
5023  if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
5024  PFETCH(c);
5025  if (c == '=')
5026  *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
5027  else if (c == '!')
5028  *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT);
5029 #ifdef USE_NAMED_GROUP
5030  else { /* (?<name>...) */
5031  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5032  UChar *name;
5033  UChar *name_end;
5034 
5035  PUNFETCH;
5036  c = '<';
5037 
5038  named_group1:
5039  list_capture = 0;
5040 
5041 # ifdef USE_CAPTURE_HISTORY
5042  named_group2:
5043 # endif
5044  name = p;
5045  r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0);
5046  if (r < 0) return r;
5047 
5048  num = scan_env_add_mem_entry(env);
5049  if (num < 0) return num;
5050  if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM)
5051  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
5052 
5053  r = name_add(env->reg, name, name_end, num, env);
5054  if (r != 0) return r;
5055  *np = node_new_enclose_memory(env->option, 1);
5056  CHECK_NULL_RETURN_MEMERR(*np);
5057  NENCLOSE(*np)->regnum = num;
5058  if (list_capture != 0)
5059  BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
5060  env->num_named++;
5061  }
5062  else {
5063  return ONIGERR_UNDEFINED_GROUP_OPTION;
5064  }
5065  }
5066 #else
5067  else {
5068  return ONIGERR_UNDEFINED_GROUP_OPTION;
5069  }
5070 #endif
5071  break;
5072 
5073 #ifdef USE_CAPTURE_HISTORY
5074  case '@':
5075  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {
5076 # ifdef USE_NAMED_GROUP
5077  if (!PEND &&
5078  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) {
5079  PFETCH(c);
5080  if (c == '<' || c == '\'') {
5081  list_capture = 1;
5082  goto named_group2; /* (?@<name>...) */
5083  }
5084  PUNFETCH;
5085  }
5086 # endif
5087  *np = node_new_enclose_memory(env->option, 0);
5088  CHECK_NULL_RETURN_MEMERR(*np);
5089  num = scan_env_add_mem_entry(env);
5090  if (num < 0) return num;
5091  if (num >= (int )BIT_STATUS_BITS_NUM)
5092  return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
5093 
5094  NENCLOSE(*np)->regnum = num;
5095  BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num);
5096  }
5097  else {
5098  return ONIGERR_UNDEFINED_GROUP_OPTION;
5099  }
5100  break;
5101 #endif /* USE_CAPTURE_HISTORY */
5102 
5103  case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
5104  if (!PEND &&
5105  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) {
5106  UChar *name = NULL;
5107  UChar *name_end;
5108  PFETCH(c);
5109  if (ONIGENC_IS_CODE_DIGIT(enc, c)) { /* (n) */
5110  PUNFETCH;
5111  r = fetch_name((OnigCodePoint )'(', &p, end, &name_end, env, &num, 1);
5112  if (r < 0) return r;
5113 #if 0
5114  /* Relative number is not currently supported. (same as Perl) */
5115  if (num < 0) {
5116  num = BACKREF_REL_TO_ABS(num, env);
5117  if (num <= 0)
5118  return ONIGERR_INVALID_BACKREF;
5119  }
5120 #endif
5121  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {
5122  if (num > env->num_mem ||
5123  IS_NULL(SCANENV_MEM_NODES(env)[num]))
5124  return ONIGERR_INVALID_BACKREF;
5125  }
5126  }
5127 #ifdef USE_NAMED_GROUP
5128  else if (c == '<' || c == '\'') { /* (<name>), ('name') */
5129  name = p;
5130  r = fetch_named_backref_token(c, tok, &p, end, env);
5131  if (r < 0) return r;
5132  if (!PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;
5133  PINC;
5134 
5135  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) {
5136  num = tok->u.backref.ref1;
5137  }
5138  else {
5139  /* FIXME:
5140  * Use left most named group for now. This is the same as Perl.
5141  * However this should use the same strategy as normal back-
5142  * references on Ruby syntax; search right to left. */
5143  int len = tok->u.backref.num;
5144  num = len > 1 ? tok->u.backref.refs[0] : tok->u.backref.ref1;
5145  }
5146  }
5147 #endif
5148  else
5149  return ONIGERR_INVALID_CONDITION_PATTERN;
5150  *np = node_new_enclose(ENCLOSE_CONDITION);
5151  CHECK_NULL_RETURN_MEMERR(*np);
5152  NENCLOSE(*np)->regnum = num;
5153  if (IS_NOT_NULL(name)) NENCLOSE(*np)->state |= NST_NAME_REF;
5154  }
5155  else
5156  return ONIGERR_UNDEFINED_GROUP_OPTION;
5157  break;
5158 
5159 #if 0
5160  case '|': /* branch reset: (?|...) */
5161  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET)) {
5162  /* TODO */
5163  }
5164  else
5165  return ONIGERR_UNDEFINED_GROUP_OPTION;
5166  break;
5167 #endif
5168 
5169  case '^': /* loads default options */
5170  if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5171  /* d-imsx */
5172  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5173  ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5174  ONOFF(option, ONIG_OPTION_SINGLELINE, 0);
5175  ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5176  ONOFF(option, ONIG_OPTION_EXTEND, 1);
5177  PFETCH(c);
5178  }
5179 #if 0
5180  else if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5181  /* d-imx */
5182  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5183  ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5184  ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5185  ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
5186  ONOFF(option, ONIG_OPTION_MULTILINE, 1);
5187  ONOFF(option, ONIG_OPTION_EXTEND, 1);
5188  PFETCH(c);
5189  }
5190 #endif
5191  else {
5192  return ONIGERR_UNDEFINED_GROUP_OPTION;
5193  }
5194  /* fall through */
5195 #ifdef USE_POSIXLINE_OPTION
5196  case 'p':
5197 #endif
5198  case '-': case 'i': case 'm': case 's': case 'x':
5199  case 'a': case 'd': case 'l': case 'u':
5200  {
5201  int neg = 0;
5202 
5203  while (1) {
5204  switch (c) {
5205  case ':':
5206  case ')':
5207  break;
5208 
5209  case '-': neg = 1; break;
5210  case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break;
5211  case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break;
5212  case 's':
5213  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5214  ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5215  }
5216  else
5217  return ONIGERR_UNDEFINED_GROUP_OPTION;
5218  break;
5219 
5220  case 'm':
5221  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) {
5222  ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0));
5223  }
5224  else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) {
5225  ONOFF(option, ONIG_OPTION_MULTILINE, neg);
5226  }
5227  else
5228  return ONIGERR_UNDEFINED_GROUP_OPTION;
5229  break;
5230 #ifdef USE_POSIXLINE_OPTION
5231  case 'p':
5232  ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);
5233  break;
5234 #endif
5235 
5236  case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */
5237  if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5238  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5239  (neg == 0)) {
5240  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5241  ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5242  ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5243  }
5244  else
5245  return ONIGERR_UNDEFINED_GROUP_OPTION;
5246  break;
5247 
5248  case 'u':
5249  if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) ||
5250  IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) &&
5251  (neg == 0)) {
5252  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5253  ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1);
5254  ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1);
5255  }
5256  else
5257  return ONIGERR_UNDEFINED_GROUP_OPTION;
5258  break;
5259 
5260  case 'd':
5261  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) &&
5262  (neg == 0)) {
5263  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5264  }
5265  else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) &&
5266  (neg == 0)) {
5267  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0);
5268  ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0);
5269  ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0);
5270  }
5271  else
5272  return ONIGERR_UNDEFINED_GROUP_OPTION;
5273  break;
5274 
5275  case 'l':
5276  if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) {
5277  ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1);
5278  }
5279  else
5280  return ONIGERR_UNDEFINED_GROUP_OPTION;
5281  break;
5282 
5283  default:
5284  return ONIGERR_UNDEFINED_GROUP_OPTION;
5285  }
5286 
5287  if (c == ')') {
5288  *np = node_new_option(option);
5289  CHECK_NULL_RETURN_MEMERR(*np);
5290  *src = p;
5291  return 2; /* option only */
5292  }
5293  else if (c == ':') {
5294  OnigOptionType prev = env->option;
5295 
5296  env->option = option;
5297  r = fetch_token(tok, &p, end, env);
5298  if (r < 0) {
5299  env->option = prev;
5300  return r;
5301  }
5302  r = parse_subexp(&target, tok, term, &p, end, env);
5303  env->option = prev;
5304  if (r < 0) return r;
5305  *np = node_new_option(option);
5306  CHECK_NULL_RETURN_MEMERR(*np);
5307  NENCLOSE(*np)->target = target;
5308  *src = p;
5309  return 0;
5310  }
5311 
5312  if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
5313  PFETCH(c);
5314  }
5315  }
5316  break;
5317 
5318  default:
5319  return ONIGERR_UNDEFINED_GROUP_OPTION;
5320  }
5321  }
5322  else {
5323  if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP))
5324  goto group;
5325 
5326  *np = node_new_enclose_memory(env->option, 0);
5327  CHECK_NULL_RETURN_MEMERR(*np);
5328  num = scan_env_add_mem_entry(env);
5329  if (num < 0) return num;
5330  NENCLOSE(*np)->regnum = num;
5331  }
5332 
5333  CHECK_NULL_RETURN_MEMERR(*np);
5334  r = fetch_token(tok, &p, end, env);
5335  if (r < 0) return r;
5336  r = parse_subexp(&target, tok, term, &p, end, env);
5337  if (r < 0) {
5338  onig_node_free(target);
5339  return r;
5340  }
5341 
5342  if (NTYPE(*np) == NT_ANCHOR)
5343  NANCHOR(*np)->target = target;
5344  else {
5345  NENCLOSE(*np)->target = target;
5346  if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) {
5347  /* Don't move this to previous of parse_subexp() */
5348  r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np);
5349  if (r != 0) return r;
5350  }
5351  else if (NENCLOSE(*np)->type == ENCLOSE_CONDITION) {
5352  if (NTYPE(target) != NT_ALT) {
5353  /* convert (?(cond)yes) to (?(cond)yes|empty) */
5354  work1 = node_new_empty();
5355  if (IS_NULL(work1)) goto err;
5356  work2 = onig_node_new_alt(work1, NULL_NODE);
5357  if (IS_NULL(work2)) goto err;
5358  work1 = onig_node_new_alt(target, work2);
5359  if (IS_NULL(work1)) goto err;
5360  NENCLOSE(*np)->target = work1;
5361  }
5362  }
5363  }
5364 
5365  *src = p;
5366  return 0;
5367 
5368  err:
5369  onig_node_free(work1);
5370  onig_node_free(work2);
5371  onig_node_free(*np);
5372  *np = NULL;
5373  return ONIGERR_MEMORY;
5374 }
5375 
5376 static const char* const PopularQStr[] = {
5377  "?", "*", "+", "??", "*?", "+?"
5378 };
5379 
5380 static const char* const ReduceQStr[] = {
5381  "", "", "*", "*?", "??", "+ and ??", "+? and ?"
5382 };
5383 
5384 static int
5385 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)
5386 {
5387  QtfrNode* qn;
5388 
5389  qn = NQTFR(qnode);
5390  if (qn->lower == 1 && qn->upper == 1) {
5391  return 1;
5392  }
5393 
5394  switch (NTYPE(target)) {
5395  case NT_STR:
5396  if (! group) {
5397  StrNode* sn = NSTR(target);
5398  if (str_node_can_be_split(sn, env->enc)) {
5399  Node* n = str_node_split_last_char(sn, env->enc);
5400  if (IS_NOT_NULL(n)) {
5401  qn->target = n;
5402  return 2;
5403  }
5404  }
5405  }
5406  break;
5407 
5408  case NT_QTFR:
5409  { /* check redundant double repeat. */
5410  /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */
5411  QtfrNode* qnt = NQTFR(target);
5412  int nestq_num = popular_quantifier_num(qn);
5413  int targetq_num = popular_quantifier_num(qnt);
5414 
5415 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
5416  if (nestq_num >= 0 && targetq_num >= 0 &&
5417  IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) {
5418  switch (ReduceTypeTable[targetq_num][nestq_num]) {
5419  case RQ_ASIS:
5420  break;
5421 
5422  case RQ_DEL:
5423  if (onig_warn != onig_null_warn) {
5424  onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'",
5425  PopularQStr[targetq_num]);
5426  }
5427  goto warn_exit;
5428  break;
5429 
5430  default:
5431  if (onig_warn != onig_null_warn) {
5432  onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression",
5433  PopularQStr[targetq_num], PopularQStr[nestq_num],
5434  ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
5435  }
5436  goto warn_exit;
5437  break;
5438  }
5439  }
5440 
5441  warn_exit:
5442 #endif
5443  if (targetq_num >= 0) {
5444  if (nestq_num >= 0) {
5445  onig_reduce_nested_quantifier(qnode, target);
5446  goto q_exit;
5447  }
5448  else if (targetq_num == 1 || targetq_num == 2) { /* * or + */
5449  /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */
5450  if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) {
5451  qn->upper = (qn->lower == 0 ? 1 : qn->lower);
5452  }
5453  }
5454  }
5455  }
5456  break;
5457 
5458  default:
5459  break;
5460  }
5461 
5462  qn->target = target;
5463  q_exit:
5464  return 0;
5465 }
5466 
5467 
5468 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5469 static int
5470 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
5471 {
5472  BBuf *tbuf;
5473  int r;
5474 
5475  if (IS_NCCLASS_NOT(cc)) {
5476  bitset_invert(cc->bs);
5477 
5478  if (! ONIGENC_IS_SINGLEBYTE(enc)) {
5479  r = not_code_range_buf(enc, cc->mbuf, &tbuf);
5480  if (r != 0) return r;
5481 
5482  bbuf_free(cc->mbuf);
5483  cc->mbuf = tbuf;
5484  }
5485 
5486  NCCLASS_CLEAR_NOT(cc);
5487  }
5488 
5489  return 0;
5490 }
5491 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5492 
5493 typedef struct {
5494  ScanEnv* env;
5495  CClassNode* cc;
5496  CClassNode* asc_cc;
5497  Node* alt_root;
5498  Node** ptail;
5500 
5501 static int
5502 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
5503  int to_len, void* arg)
5504 {
5505  IApplyCaseFoldArg* iarg;
5506  ScanEnv* env;
5507  CClassNode* cc;
5508  CClassNode* asc_cc;
5509  BitSetRef bs;
5510  int add_flag, r;
5511 
5512  iarg = (IApplyCaseFoldArg* )arg;
5513  env = iarg->env;
5514  cc = iarg->cc;
5515  asc_cc = iarg->asc_cc;
5516  bs = cc->bs;
5517 
5518  if (IS_NULL(asc_cc)) {
5519  add_flag = 0;
5520  }
5521  else if (ONIGENC_IS_ASCII_CODE(from) == ONIGENC_IS_ASCII_CODE(*to)) {
5522  add_flag = 1;
5523  }
5524  else {
5525  add_flag = onig_is_code_in_cc(env->enc, from, asc_cc);
5526  if (IS_NCCLASS_NOT(asc_cc))
5527  add_flag = !add_flag;
5528  }
5529 
5530  if (to_len == 1) {
5531  int is_in = onig_is_code_in_cc(env->enc, from, cc);
5532 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5533  if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
5534  (is_in == 0 && IS_NCCLASS_NOT(cc))) {
5535  if (add_flag) {
5536  if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5537  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
5538  if (r < 0) return r;
5539  }
5540  else {
5541  BITSET_SET_BIT(bs, *to);
5542  }
5543  }
5544  }
5545 #else
5546  if (is_in != 0) {
5547  if (add_flag) {
5548  if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
5549  if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
5550  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
5551  if (r < 0) return r;
5552  }
5553  else {
5554  if (IS_NCCLASS_NOT(cc)) {
5555  BITSET_CLEAR_BIT(bs, *to);
5556  }
5557  else {
5558  BITSET_SET_BIT(bs, *to);
5559  }
5560  }
5561  }
5562  }
5563 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
5564  }
5565  else {
5566  int r, i, len;
5567  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
5568  Node *snode = NULL_NODE;
5569 
5570  if (onig_is_code_in_cc(env->enc, from, cc)
5571 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
5572  && !IS_NCCLASS_NOT(cc)
5573 #endif
5574  ) {
5575  for (i = 0; i < to_len; i++) {
5576  len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf);
5577  if (i == 0) {
5578  snode = onig_node_new_str(buf, buf + len);
5579  CHECK_NULL_RETURN_MEMERR(snode);
5580 
5581  /* char-class expanded multi-char only
5582  compare with string folded at match time. */
5583  NSTRING_SET_AMBIG(snode);
5584  }
5585  else {
5586  r = onig_node_str_cat(snode, buf, buf + len);
5587  if (r < 0) {
5588  onig_node_free(snode);
5589  return r;
5590  }
5591  }
5592  }
5593 
5594  *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE);
5595  CHECK_NULL_RETURN_MEMERR(*(iarg->ptail));
5596  iarg->ptail = &(NCDR((*(iarg->ptail))));
5597  }
5598  }
5599 
5600  return 0;
5601 }
5602 
5603 static int
5604 cclass_case_fold(Node** np, CClassNode* cc, CClassNode* asc_cc, ScanEnv* env)
5605 {
5606  int r;
5607  IApplyCaseFoldArg iarg;
5608 
5609  iarg.env = env;
5610  iarg.cc = cc;
5611  iarg.asc_cc = asc_cc;
5612  iarg.alt_root = NULL_NODE;
5613  iarg.ptail = &(iarg.alt_root);
5614 
5615  r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag,
5616  i_apply_case_fold, &iarg);
5617  if (r != 0) {
5618  onig_node_free(iarg.alt_root);
5619  return r;
5620  }
5621  if (IS_NOT_NULL(iarg.alt_root)) {
5622  Node* work = onig_node_new_alt(*np, iarg.alt_root);
5623  if (IS_NULL(work)) {
5624  onig_node_free(iarg.alt_root);
5625  return ONIGERR_MEMORY;
5626  }
5627  *np = work;
5628  }
5629  return r;
5630 }
5631 
5632 static int
5633 node_linebreak(Node** np, ScanEnv* env)
5634 {
5635  /* same as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */
5636  Node* left = NULL;
5637  Node* right = NULL;
5638  Node* target1 = NULL;
5639  Node* target2 = NULL;
5640  CClassNode* cc;
5641  int num1, num2, r;
5642  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
5643 
5644  /* \x0D\x0A */
5645  num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
5646  if (num1 < 0) return num1;
5647  num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
5648  if (num2 < 0) return num2;
5649  left = node_new_str_raw(buf, buf + num1 + num2);
5650  if (IS_NULL(left)) goto err;
5651 
5652  /* [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}] */
5653  right = node_new_cclass();
5654  if (IS_NULL(right)) goto err;
5655  cc = NCCLASS(right);
5656  if (ONIGENC_MBC_MINLEN(env->enc) > 1) {
5657  r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D);
5658  if (r != 0) goto err;
5659  }
5660  else {
5661  bitset_set_range(env, cc->bs, 0x0A, 0x0D);
5662  }
5663 
5664  /* TODO: move this block to enc/unicode.c */
5665  if (ONIGENC_IS_UNICODE(env->enc)) {
5666  /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5667  r = add_code_range(&(cc->mbuf), env, 0x85, 0x85);
5668  if (r != 0) goto err;
5669  r = add_code_range(&(cc->mbuf), env, 0x2028, 0x2029);
5670  if (r != 0) goto err;
5671  }
5672 
5673  /* ...|... */
5674  target1 = onig_node_new_alt(right, NULL_NODE);
5675  if (IS_NULL(target1)) goto err;
5676  right = NULL;
5677  target2 = onig_node_new_alt(left, target1);
5678  if (IS_NULL(target2)) goto err;
5679  left = NULL;
5680  target1 = NULL;
5681 
5682  /* (?>...) */
5683  *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
5684  if (IS_NULL(*np)) goto err;
5685  NENCLOSE(*np)->target = target2;
5686  return ONIG_NORMAL;
5687 
5688  err:
5689  onig_node_free(left);
5690  onig_node_free(right);
5691  onig_node_free(target1);
5692  onig_node_free(target2);
5693  return ONIGERR_MEMORY;
5694 }
5695 
5696 static int
5697 propname2ctype(ScanEnv* env, const char* propname)
5698 {
5699  UChar* name = (UChar* )propname;
5700  UChar* name_end = name + strlen(propname);
5701  int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII,
5702  name, name_end);
5703  if (ctype < 0) {
5704  onig_scan_env_set_error_string(env, ctype, name, name_end);
5705  }
5706  return ctype;
5707 }
5708 
5709 static int
5710 add_property_to_cc(CClassNode* cc, const char* propname, int not, ScanEnv* env)
5711 {
5712  int ctype = propname2ctype(env, propname);
5713  if (ctype < 0) return ctype;
5714  return add_ctype_to_cc(cc, ctype, not, 0, env);
5715 }
5716 
5717 /*
5718  * helper methods for node_extended_grapheme_cluster (/\X/)
5719  */
5720 static int
5721 create_property_node(Node **np, ScanEnv* env, const char* propname)
5722 {
5723  int r;
5724  CClassNode* cc;
5725 
5726  *np = node_new_cclass();
5727  if (IS_NULL(*np)) return ONIGERR_MEMORY;
5728  cc = NCCLASS(*np);
5729  r = add_property_to_cc(cc, propname, 0, env);
5730  if (r != 0)
5731  onig_node_free(*np);
5732  return r;
5733 }
5734 
5735 static int
5736 quantify_node(Node **np, int lower, int upper)
5737 {
5738  Node* tmp = node_new_quantifier(lower, upper, 0);
5739  if (IS_NULL(tmp)) return ONIGERR_MEMORY;
5740  NQTFR(tmp)->target = *np;
5741  *np = tmp;
5742  return 0;
5743 }
5744 
5745 static int
5746 quantify_property_node(Node **np, ScanEnv* env, const char* propname, char repetitions)
5747 {
5748  int r;
5749  int lower = 0;
5750  int upper = REPEAT_INFINITE;
5751 
5752  r = create_property_node(np, env, propname);
5753  if (r != 0) return r;
5754  switch (repetitions) {
5755  case '?': upper = 1; break;
5756  case '+': lower = 1; break;
5757  case '*': break;
5758  case '2': lower = upper = 2; break;
5759  default : return ONIGERR_PARSER_BUG;
5760  }
5761  return quantify_node(np, lower, upper);
5762 }
5763 
5764 #define LIST 0
5765 #define ALT 1
5766 
5767 /* IMPORTANT: Make sure node_array ends with NULL_NODE */
5768 static int
5769 create_node_from_array(int kind, Node **np, Node **node_array)
5770 {
5771  Node* tmp = NULL_NODE;
5772  int i = 0;
5773 
5774  while (node_array[i] != NULL_NODE) i++;
5775  while (--i >= 0) {
5776  *np = kind==LIST ? node_new_list(node_array[i], tmp)
5777  : onig_node_new_alt(node_array[i], tmp);
5778  if (IS_NULL(*np)) {
5779  while (i >= 0) {
5780  onig_node_free(node_array[i]);
5781  node_array[i--] = NULL_NODE;
5782  }
5783  onig_node_free(tmp);
5784  return ONIGERR_MEMORY;
5785  }
5786  else
5787  node_array[i] = NULL_NODE;
5788  tmp = *np;
5789  }
5790  return 0;
5791 }
5792 
5793 #define R_ERR(call) r=(call);if(r!=0)goto err
5794 
5795 /* Memory layout for common node array:
5796  * The main purpose is to be able to easily free all leftover nodes
5797  * after an error. As a side effect, we share some memory.
5798  *
5799  * The layout is as shown below (each line corresponds to one call of
5800  * create_node_from_array()). Because create_node_from_array sets all
5801  * nodes of the source to NULL_NODE, we can overlap the target array
5802  * as long as we do not override the actual target location.
5803  *
5804  * Target Array name Index
5805  *
5806  * node_array 0 1 2 3 4 5 6 7 8 9 A B C D E F
5807  * top_alts alts[5] 0 1 2 3 4*
5808  * alts+1 list[4] 0 1 2 3*
5809  * list+1 core_alts[7] 0 1 2 3 4 5 6*
5810  * core_alts+0 H_list[4] 0 1 2 3*
5811  * H_list+1 H_alt2[4] 0 1 2 3*
5812  * h_alt2+1 H_list2[3] 0 1 2*
5813  * core_alts+4 XP_list[4] 0 1 2 3*
5814  * XP_list+1 Ex_list[4] 0 1 2 3*
5815  */
5816 #define NODE_COMMON_SIZE 16
5817 
5818 static int
5819 node_extended_grapheme_cluster(Node** np, ScanEnv* env)
5820 {
5821  Node* tmp = NULL;
5822  Node* np1 = NULL;
5823  Node* top_alt = NULL;
5824  int r = 0;
5825  int num1;
5826  int i;
5827  int any_target_position;
5828  UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2];
5829  OnigOptionType option;
5830  /* node_common is function-global so that we can free all nodes
5831  * in case of error. Unused slots are set to NULL_NODE at all times. */
5832  Node *node_common[NODE_COMMON_SIZE];
5833  Node **alts = node_common+0; /* size: 5 */
5834 
5835  for (i=0; i<NODE_COMMON_SIZE; i++)
5836  node_common[i] = NULL_NODE;
5837 
5838  /* CRLF, common for both Unicode and non-Unicode */
5839  /* \x0D\x0A */
5840  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
5841  if (r < 0) goto err;
5842  num1 = r;
5843  r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1);
5844  if (r < 0) goto err;
5845  alts[0] = node_new_str_raw(buf, buf + num1 + r);
5846  if (IS_NULL(alts[0])) goto err;
5847 
5848 #ifdef USE_UNICODE_PROPERTIES
5849  if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
5850  CClassNode* cc;
5851 
5852  if (propname2ctype(env, "Grapheme_Cluster_Break=Extend") < 0) goto err;
5853  /* Unicode 11.0.0
5854  * CRLF (already done)
5855  * | [Control CR LF]
5856  * | precore* core postcore*
5857  * | . (to catch invalid stuff, because this seems to be spec for String#grapheme_clusters) */
5858 
5859  /* [Control CR LF] (CR and LF are not in the spec, but this is a conformed fix) */
5860  alts[1] = node_new_cclass();
5861  if (IS_NULL(alts[1])) goto err;
5862  cc = NCCLASS(alts[1]);
5863  R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
5864  if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
5865  R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
5866  R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
5867  }
5868  else {
5869  BITSET_SET_BIT(cc->bs, 0x0a);
5870  BITSET_SET_BIT(cc->bs, 0x0d);
5871  }
5872 
5873  /* precore* core postcore* */
5874  {
5875  Node **list = alts + 3; /* size: 4 */
5876 
5877  /* precore*; precore := Prepend */
5878  R_ERR(quantify_property_node(list+0, env, "Grapheme_Cluster_Break=Prepend", '*'));
5879 
5880  /* core := hangul-syllable
5881  * | ri-sequence
5882  * | xpicto-sequence
5883  * | [^Control CR LF] */
5884  {
5885  Node **core_alts = list + 2; /* size: 7 */
5886 
5887  /* hangul-syllable :=
5888  * L* (V+ | LV V* | LVT) T*
5889  * | L+
5890  * | T+ */
5891  /* hangul-syllable is an alternative (would be called H_alt)
5892  * inside an alternative, but we flatten it into core_alts */
5893 
5894  /* L* (V+ | LV V* | LVT) T* */
5895  {
5896  Node **H_list = core_alts + 1; /* size: 4 */
5897  R_ERR(quantify_property_node(H_list+0, env, "Grapheme_Cluster_Break=L", '*'));
5898 
5899  /* V+ | LV V* | LVT */
5900  {
5901  Node **H_alt2 = H_list + 2; /* size: 4 */
5902  R_ERR(quantify_property_node(H_alt2+0, env, "Grapheme_Cluster_Break=V", '+'));
5903 
5904  /* LV V* */
5905  {
5906  Node **H_list2 = H_alt2 + 2; /* size: 3 */
5907 
5908  R_ERR(create_property_node(H_list2+0, env, "Grapheme_Cluster_Break=LV"));
5909  R_ERR(quantify_property_node(H_list2+1, env, "Grapheme_Cluster_Break=V", '*'));
5910  R_ERR(create_node_from_array(LIST, H_alt2+1, H_list2));
5911  }
5912 
5913  R_ERR(create_property_node(H_alt2+2, env, "Grapheme_Cluster_Break=LVT"));
5914  R_ERR(create_node_from_array(ALT, H_list+1, H_alt2));
5915  }
5916 
5917  R_ERR(quantify_property_node(H_list+2, env, "Grapheme_Cluster_Break=T", '*'));
5918  R_ERR(create_node_from_array(LIST, core_alts+0, H_list));
5919  }
5920 
5921  R_ERR(quantify_property_node(core_alts+1, env, "Grapheme_Cluster_Break=L", '+'));
5922  R_ERR(quantify_property_node(core_alts+2, env, "Grapheme_Cluster_Break=T", '+'));
5923  /* end of hangul-syllable */
5924 
5925  /* ri-sequence := RI RI */
5926  R_ERR(quantify_property_node(core_alts+3, env, "Regional_Indicator", '2'));
5927 
5928  /* xpicto-sequence := \p{Extended_Pictographic} (Extend* ZWJ \p{Extended_Pictographic})* */
5929  {
5930  Node **XP_list = core_alts + 5; /* size: 3 */
5931  R_ERR(create_property_node(XP_list+0, env, "Extended_Pictographic"));
5932 
5933  /* (Extend* ZWJ \p{Extended_Pictographic})* */
5934  {
5935  Node **Ex_list = XP_list + 2; /* size: 4 */
5936  /* assert(Ex_list+4 == node_common+NODE_COMMON_SIZE); */
5937  R_ERR(quantify_property_node(Ex_list+0, env, "Grapheme_Cluster_Break=Extend", '*'));
5938 
5939  /* ZWJ (ZERO WIDTH JOINER) */
5940  r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf);
5941  if (r < 0) goto err;
5942  Ex_list[1] = node_new_str_raw(buf, buf + r);
5943  if (IS_NULL(Ex_list[1])) goto err;
5944 
5945  R_ERR(create_property_node(Ex_list+2, env, "Extended_Pictographic"));
5946  R_ERR(create_node_from_array(LIST, XP_list+1, Ex_list));
5947  }
5948  R_ERR(quantify_node(XP_list+1, 0, REPEAT_INFINITE)); /* TODO: Check about node freeing */
5949 
5950  R_ERR(create_node_from_array(LIST, core_alts+4, XP_list));
5951  }
5952 
5953  /* [^Control CR LF] */
5954  core_alts[5] = node_new_cclass();
5955  if (IS_NULL(core_alts[5])) goto err;
5956  cc = NCCLASS(core_alts[5]);
5957  if (ONIGENC_MBC_MINLEN(env->enc) > 1) { /* UTF-16/UTF-32 */
5958  BBuf *inverted_buf = NULL;
5959 
5960  /* TODO: fix false warning */
5961  const int dup_not_warned = env->warnings_flag | ~ONIG_SYN_WARN_CC_DUP;
5962  env->warnings_flag |= ONIG_SYN_WARN_CC_DUP;
5963 
5964  /* Start with a positive buffer and invert at the end.
5965  * Otherwise, adding single-character ranges work the wrong way. */
5966  R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 0, env));
5967  R_ERR(add_code_range(&(cc->mbuf), env, 0x000A, 0x000A)); /* CR */
5968  R_ERR(add_code_range(&(cc->mbuf), env, 0x000D, 0x000D)); /* LF */
5969  R_ERR(not_code_range_buf(env->enc, cc->mbuf, &inverted_buf, env));
5970  cc->mbuf = inverted_buf; /* TODO: check what to do with buffer before inversion */
5971 
5972  env->warnings_flag &= dup_not_warned; /* TODO: fix false warning */
5973  }
5974  else {
5975  R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
5976  BITSET_CLEAR_BIT(cc->bs, 0x0a);
5977  BITSET_CLEAR_BIT(cc->bs, 0x0d);
5978  }
5979 
5980  R_ERR(create_node_from_array(ALT, list+1, core_alts));
5981  }
5982 
5983  /* postcore*; postcore = [Extend ZWJ SpacingMark] */
5984  R_ERR(create_property_node(list+2, env, "Grapheme_Cluster_Break=Extend"));
5985  cc = NCCLASS(list[2]);
5986  R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
5987  R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
5988  R_ERR(quantify_node(list+2, 0, REPEAT_INFINITE));
5989 
5990  R_ERR(create_node_from_array(LIST, alts+2, list));
5991  }
5992 
5993  any_target_position = 3;
5994  }
5995  else
5996 #endif /* USE_UNICODE_PROPERTIES */
5997  {
5998  any_target_position = 1;
5999  }
6000 
6001  /* PerlSyntax: (?s:.), RubySyntax: (?m:.), common for both Unicode and non-Unicode */
6002  /* Not in Unicode spec (UAX #29), but added to catch invalid stuff,
6003  * because this is Ruby spec for String#grapheme_clusters. */
6004  np1 = node_new_anychar();
6005  if (IS_NULL(np1)) goto err;
6006 
6007  option = env->option;
6008  ONOFF(option, ONIG_OPTION_MULTILINE, 0);
6009  tmp = node_new_option(option);
6010  if (IS_NULL(tmp)) goto err;
6011  NENCLOSE(tmp)->target = np1;
6012  alts[any_target_position] = tmp;
6013  np1 = NULL;
6014 
6015  R_ERR(create_node_from_array(ALT, &top_alt, alts));
6016 
6017  /* (?>): For efficiency, because there is no text piece
6018  * that is not in a grapheme cluster, and there is only one way
6019  * to split a string into grapheme clusters. */
6020  tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
6021  if (IS_NULL(tmp)) goto err;
6022  NENCLOSE(tmp)->target = top_alt;
6023  np1 = tmp;
6024 
6025 #ifdef USE_UNICODE_PROPERTIES
6026  if (ONIGENC_IS_UNICODE(env->enc)) {
6027  /* Don't ignore case. */
6028  option = env->option;
6029  ONOFF(option, ONIG_OPTION_IGNORECASE, 1);
6030  *np = node_new_option(option);
6031  if (IS_NULL(*np)) goto err;
6032  NENCLOSE(*np)->target = np1;
6033  }
6034  else
6035 #endif
6036  {
6037  *np = np1;
6038  }
6039  return ONIG_NORMAL;
6040 
6041  err:
6042  onig_node_free(np1);
6043  for (i=0; i<NODE_COMMON_SIZE; i++)
6044  onig_node_free(node_common[i]);
6045  return (r == 0) ? ONIGERR_MEMORY : r;
6046 }
6047 #undef R_ERR
6048 
/*
 * Return the number of set bits in `bits`.
 *
 * Uses the classic parallel reduction: adjacent 1-bit fields are summed
 * into 2-bit fields, those into 4-bit fields, and so on up to a single
 * 32-bit total.  The masks only cover the low 32 bits of the argument.
 */
static int
countbits(unsigned int bits)
{
  /* field_mask[k] selects the low half of every 2^(k+1)-bit field. */
  static const unsigned int field_mask[5] = {
    0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff
  };
  int stage;

  for (stage = 0; stage < 5; stage++) {
    const unsigned int m = field_mask[stage];

    /* Add each field's upper half to its lower half. */
    bits = (bits & m) + ((bits >> (1 << stage)) & m);
  }
  return (int )bits;
}
6058 
6059 static int
6060 is_onechar_cclass(CClassNode* cc, OnigCodePoint* code)
6061 {
6062  const OnigCodePoint not_found = ONIG_LAST_CODE_POINT;
6063  OnigCodePoint c = not_found;
6064  int i;
6065  BBuf *bbuf = cc->mbuf;
6066 
6067  if (IS_NCCLASS_NOT(cc)) return 0;
6068 
6069  /* check bbuf */
6070  if (IS_NOT_NULL(bbuf)) {
6071  OnigCodePoint n, *data;
6072  GET_CODE_POINT(n, bbuf->p);
6073  data = (OnigCodePoint* )(bbuf->p) + 1;
6074  if ((n == 1) && (data[0] == data[1])) {
6075  /* only one char found in the bbuf, save the code point. */
6076  c = data[0];
6077  if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) {
6078  /* skip if c is included in the bitset */
6079  c = not_found;
6080  }
6081  }
6082  else {
6083  return 0; /* the bbuf contains multiple chars */
6084  }
6085  }
6086 
6087  /* check bitset */
6088  for (i = 0; i < BITSET_SIZE; i++) {
6089  Bits b1 = cc->bs[i];
6090  if (b1 != 0) {
6091  if (((b1 & (b1 - 1)) == 0) && (c == not_found)) {
6092  c = BITS_IN_ROOM * i + countbits(b1 - 1);
6093  } else {
6094  return 0; /* the character class contains multiple chars */
6095  }
6096  }
6097  }
6098 
6099  if (c != not_found) {
6100  *code = c;
6101  return 1;
6102  }
6103 
6104  /* the character class contains no char. */
6105  return 0;
6106 }
6107 
6108 
6109 static int
6110 parse_exp(Node** np, OnigToken* tok, int term,
6111  UChar** src, UChar* end, ScanEnv* env)
6112 {
6113  int r, len, group = 0;
6114  Node* qn;
6115  Node** targetp;
6116 
6117  *np = NULL;
6118  if (tok->type == (enum TokenSyms )term)
6119  goto end_of_token;
6120 
6121  switch (tok->type) {
6122  case TK_ALT:
6123  case TK_EOT:
6124  end_of_token:
6125  *np = node_new_empty();
6126  return tok->type;
6127  break;
6128 
6129  case TK_SUBEXP_OPEN:
6130  r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env);
6131  if (r < 0) return r;
6132  if (r == 1) group = 1;
6133  else if (r == 2) { /* option only */
6134  Node* target;
6135  OnigOptionType prev = env->option;
6136 
6137  env->option = NENCLOSE(*np)->option;
6138  r = fetch_token(tok, src, end, env);
6139  if (r < 0) {
6140  env->option = prev;
6141  return r;
6142  }
6143  r = parse_subexp(&target, tok, term, src, end, env);
6144  env->option = prev;
6145  if (r < 0) {
6146  onig_node_free(target);
6147  return r;
6148  }
6149  NENCLOSE(*np)->target = target;
6150  return tok->type;
6151  }
6152  break;
6153 
6154  case TK_SUBEXP_CLOSE:
6155  if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP))
6156  return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS;
6157 
6158  if (tok->escaped) goto tk_raw_byte;
6159  else goto tk_byte;
6160  break;
6161 
6162  case TK_LINEBREAK:
6163  r = node_linebreak(np, env);
6164  if (r < 0) return r;
6165  break;
6166 
6167  case TK_EXTENDED_GRAPHEME_CLUSTER:
6168  r = node_extended_grapheme_cluster(np, env);
6169  if (r < 0) return r;
6170  break;
6171 
6172  case TK_KEEP:
6173  *np = onig_node_new_anchor(ANCHOR_KEEP);
6174  CHECK_NULL_RETURN_MEMERR(*np);
6175  break;
6176 
6177  case TK_STRING:
6178  tk_byte:
6179  {
6180  *np = node_new_str(tok->backp, *src);
6181  CHECK_NULL_RETURN_MEMERR(*np);
6182 
6183  string_loop:
6184  while (1) {
6185  r = fetch_token(tok, src, end, env);
6186  if (r < 0) return r;
6187  if (r == TK_STRING) {
6188  r = onig_node_str_cat(*np, tok->backp, *src);
6189  }
6190 #ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
6191  else if (r == TK_CODE_POINT) {
6192  r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
6193  }
6194 #endif
6195  else {
6196  break;
6197  }
6198  if (r < 0) return r;
6199  }
6200 
6201  string_end:
6202  targetp = np;
6203  goto repeat;
6204  }
6205  break;
6206 
6207  case TK_RAW_BYTE:
6208  tk_raw_byte:
6209  {
6210  *np = node_new_str_raw_char((UChar )tok->u.c);
6211  CHECK_NULL_RETURN_MEMERR(*np);
6212  len = 1;
6213  while (1) {
6214  if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
6215  if (len == enclen(env->enc, NSTR(*np)->s, NSTR(*np)->end)) {
6216  r = fetch_token(tok, src, end, env);
6217  NSTRING_CLEAR_RAW(*np);
6218  goto string_end;
6219  }
6220  }
6221 
6222  r = fetch_token(tok, src, end, env);
6223  if (r < 0) return r;
6224  if (r != TK_RAW_BYTE) {
6225  /* Don't use this, it is wrong for little endian encodings. */
6226 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR
6227  int rem;
6228  if (len < ONIGENC_MBC_MINLEN(env->enc)) {
6229  rem = ONIGENC_MBC_MINLEN(env->enc) - len;
6230  (void )node_str_head_pad(NSTR(*np), rem, (UChar )0);
6231  if (len + rem == enclen(env->enc, NSTR(*np)->s)) {
6232  NSTRING_CLEAR_RAW(*np);
6233  goto string_end;
6234  }
6235  }
6236 #endif
6237  return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
6238  }
6239 
6240  r = node_str_cat_char(*np, (UChar )tok->u.c);
6241  if (r < 0) return r;
6242 
6243  len++;
6244  }
6245  }
6246  break;
6247 
6248  case TK_CODE_POINT:
6249  {
6250  *np = node_new_empty();
6251  CHECK_NULL_RETURN_MEMERR(*np);
6252  r = node_str_cat_codepoint(*np, env->enc, tok->u.code);
6253  if (r != 0) return r;
6254 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
6255  NSTRING_SET_RAW(*np);
6256 #else
6257  goto string_loop;
6258 #endif
6259  }
6260  break;
6261 
6262  case TK_QUOTE_OPEN:
6263  {
6264  OnigCodePoint end_op[2];
6265  UChar *qstart, *qend, *nextp;
6266 
6267  end_op[0] = (OnigCodePoint )MC_ESC(env->syntax);
6268  end_op[1] = (OnigCodePoint )'E';
6269  qstart = *src;
6270  qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
6271  if (IS_NULL(qend)) {
6272  nextp = qend = end;
6273  }
6274  *np = node_new_str(qstart, qend);
6275  CHECK_NULL_RETURN_MEMERR(*np);
6276  *src = nextp;
6277  }
6278  break;
6279 
6280  case TK_CHAR_TYPE:
6281  {
6282  switch (tok->u.prop.ctype) {
6283  case ONIGENC_CTYPE_WORD:
6284  *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not,
6285  IS_ASCII_RANGE(env->option));
6286  CHECK_NULL_RETURN_MEMERR(*np);
6287  break;
6288 
6289  case ONIGENC_CTYPE_SPACE:
6290  case ONIGENC_CTYPE_DIGIT:
6291  case ONIGENC_CTYPE_XDIGIT:
6292  {
6293  CClassNode* cc;
6294 
6295  *np = node_new_cclass();
6296  CHECK_NULL_RETURN_MEMERR(*np);
6297  cc = NCCLASS(*np);
6298  r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0,
6299  IS_ASCII_RANGE(env->option), env);
6300  if (r != 0) return r;
6301  if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
6302  }
6303  break;
6304 
6305  default:
6306  return ONIGERR_PARSER_BUG;
6307  break;
6308  }
6309  }
6310  break;
6311 
6312  case TK_CHAR_PROPERTY:
6313  r = parse_char_property(np, tok, src, end, env);
6314  if (r != 0) return r;
6315  break;
6316 
6317  case TK_CC_OPEN:
6318  {
6319  Node *asc_node;
6320  CClassNode* cc;
6321  OnigCodePoint code;
6322 
6323  r = parse_char_class(np, &asc_node, tok, src, end, env);
6324  if (r != 0) {
6325  onig_node_free(asc_node);
6326  return r;
6327  }
6328 
6329  cc = NCCLASS(*np);
6330  if (is_onechar_cclass(cc, &code)) {
6331  onig_node_free(*np);
6332  onig_node_free(asc_node);
6333  *np = node_new_empty();
6334  CHECK_NULL_RETURN_MEMERR(*np);
6335  r = node_str_cat_codepoint(*np, env->enc, code);
6336  if (r != 0) return r;
6337  goto string_loop;
6338  }
6339  if (IS_IGNORECASE(env->option)) {
6340  r = cclass_case_fold(np, cc, NCCLASS(asc_node), env);
6341  if (r != 0) {
6342  onig_node_free(asc_node);
6343  return r;
6344  }
6345  }
6346  onig_node_free(asc_node);
6347  }
6348  break;
6349 
6350  case TK_ANYCHAR:
6351  *np = node_new_anychar();
6352  CHECK_NULL_RETURN_MEMERR(*np);
6353  break;
6354 
6355  case TK_ANYCHAR_ANYTIME:
6356  *np = node_new_anychar();
6357  CHECK_NULL_RETURN_MEMERR(*np);
6358  qn = node_new_quantifier(0, REPEAT_INFINITE, 0);
6359  CHECK_NULL_RETURN_MEMERR(qn);
6360  NQTFR(qn)->target = *np;
6361  *np = qn;
6362  break;
6363 
6364  case TK_BACKREF:
6365  len = tok->u.backref.num;
6366  *np = node_new_backref(len,
6367  (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)),
6368  tok->u.backref.by_name,
6369 #ifdef USE_BACKREF_WITH_LEVEL
6370  tok->u.backref.exist_level,
6371  tok->u.backref.level,
6372 #endif
6373  env);
6374  CHECK_NULL_RETURN_MEMERR(*np);
6375  break;
6376 
6377 #ifdef USE_SUBEXP_CALL
6378  case TK_CALL:
6379  {
6380  int gnum = tok->u.call.gnum;
6381 
6382  if (gnum < 0 || tok->u.call.rel != 0) {
6383  if (gnum > 0) gnum--;
6384  gnum = BACKREF_REL_TO_ABS(gnum, env);
6385  if (gnum <= 0)
6386  return ONIGERR_INVALID_BACKREF;
6387  }
6388  *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum);
6389  CHECK_NULL_RETURN_MEMERR(*np);
6390  env->num_call++;
6391  }
6392  break;
6393 #endif
6394 
6395  case TK_ANCHOR:
6396  *np = onig_node_new_anchor(tok->u.anchor.subtype);
6397  CHECK_NULL_RETURN_MEMERR(*np);
6398  NANCHOR(*np)->ascii_range = tok->u.anchor.ascii_range;
6399  break;
6400 
6401  case TK_OP_REPEAT:
6402  case TK_INTERVAL:
6403  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) {
6404  if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS))
6405  return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED;
6406  else
6407  *np = node_new_empty();
6408  }
6409  else {
6410  goto tk_byte;
6411  }
6412  break;
6413 
6414  default:
6415  return ONIGERR_PARSER_BUG;
6416  break;
6417  }
6418 
6419  {
6420  targetp = np;
6421 
6422  re_entry:
6423  r = fetch_token(tok, src, end, env);
6424  if (r < 0) return r;
6425 
6426  repeat:
6427  if (r == TK_OP_REPEAT || r == TK_INTERVAL) {
6428  if (is_invalid_quantifier_target(*targetp))
6429  return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID;
6430 
6431  qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,
6432  (r == TK_INTERVAL ? 1 : 0));
6433  CHECK_NULL_RETURN_MEMERR(qn);
6434  NQTFR(qn)->greedy = tok->u.repeat.greedy;
6435  r = set_quantifier(qn, *targetp, group, env);
6436  if (r < 0) {
6437  onig_node_free(qn);
6438  return r;
6439  }
6440 
6441  if (tok->u.repeat.possessive != 0) {
6442  Node* en;
6443  en = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
6444  if (IS_NULL(en)) {
6445  onig_node_free(qn);
6446  return ONIGERR_MEMORY;
6447  }
6448  NENCLOSE(en)->target = qn;
6449  qn = en;
6450  }
6451 
6452  if (r == 0) {
6453  *targetp = qn;
6454  }
6455  else if (r == 1) {
6456  onig_node_free(qn);
6457  }
6458  else if (r == 2) { /* split case: /abc+/ */
6459  Node *tmp;
6460 
6461  *targetp = node_new_list(*targetp, NULL);
6462  if (IS_NULL(*targetp)) {
6463  onig_node_free(qn);
6464  return ONIGERR_MEMORY;
6465  }
6466  tmp = NCDR(*targetp) = node_new_list(qn, NULL);
6467  if (IS_NULL(tmp)) {
6468  onig_node_free(qn);
6469  return ONIGERR_MEMORY;
6470  }
6471  targetp = &(NCAR(tmp));
6472  }
6473  goto re_entry;
6474  }
6475  }
6476 
6477  return r;
6478 }
6479 
6480 static int
6481 parse_branch(Node** top, OnigToken* tok, int term,
6482  UChar** src, UChar* end, ScanEnv* env)
6483 {
6484  int r;
6485  Node *node, **headp;
6486 
6487  *top = NULL;
6488  r = parse_exp(&node, tok, term, src, end, env);
6489  if (r < 0) {
6490  onig_node_free(node);
6491  return r;
6492  }
6493 
6494  if (r == TK_EOT || r == term || r == TK_ALT) {
6495  *top = node;
6496  }
6497  else {
6498  *top = node_new_list(node, NULL);
6499  headp = &(NCDR(*top));
6500  while (r != TK_EOT && r != term && r != TK_ALT) {
6501  r = parse_exp(&node, tok, term, src, end, env);
6502  if (r < 0) {
6503  onig_node_free(node);
6504  return r;
6505  }
6506 
6507  if (NTYPE(node) == NT_LIST) {
6508  *headp = node;
6509  while (IS_NOT_NULL(NCDR(node))) node = NCDR(node);
6510  headp = &(NCDR(node));
6511  }
6512  else {
6513  *headp = node_new_list(node, NULL);
6514  headp = &(NCDR(*headp));
6515  }
6516  }
6517  }
6518 
6519  return r;
6520 }
6521 
6522 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */
6523 static int
6524 parse_subexp(Node** top, OnigToken* tok, int term,
6525  UChar** src, UChar* end, ScanEnv* env)
6526 {
6527  int r;
6528  Node *node, **headp;
6529 
6530  *top = NULL;
6531  env->parse_depth++;
6532  if (env->parse_depth > ParseDepthLimit)
6533  return ONIGERR_PARSE_DEPTH_LIMIT_OVER;
6534  r = parse_branch(&node, tok, term, src, end, env);
6535  if (r < 0) {
6536  onig_node_free(node);
6537  return r;
6538  }
6539 
6540  if (r == term) {
6541  *top = node;
6542  }
6543  else if (r == TK_ALT) {
6544  *top = onig_node_new_alt(node, NULL);
6545  headp = &(NCDR(*top));
6546  while (r == TK_ALT) {
6547  r = fetch_token(tok, src, end, env);
6548  if (r < 0) return r;
6549  r = parse_branch(&node, tok, term, src, end, env);
6550  if (r < 0) {
6551  onig_node_free(node);
6552  return r;
6553  }
6554 
6555  *headp = onig_node_new_alt(node, NULL);
6556  headp = &(NCDR(*headp));
6557  }
6558 
6559  if (tok->type != (enum TokenSyms )term)
6560  goto err;
6561  }
6562  else {
6563  onig_node_free(node);
6564  err:
6565  if (term == TK_SUBEXP_CLOSE)
6566  return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
6567  else
6568  return ONIGERR_PARSER_BUG;
6569  }
6570 
6571  env->parse_depth--;
6572  return r;
6573 }
6574 
6575 static int
6576 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)
6577 {
6578  int r;
6579  OnigToken tok;
6580 
6581  r = fetch_token(&tok, src, end, env);
6582  if (r < 0) return r;
6583  r = parse_subexp(top, &tok, TK_EOT, src, end, env);
6584  if (r < 0) return r;
6585 
6586 #ifdef USE_SUBEXP_CALL
6587  if (env->num_call > 0) {
6588  /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
6589  const int num = 0;
6590  Node* np;
6591  np = node_new_enclose_memory(env->option, 0);
6592  CHECK_NULL_RETURN_MEMERR(np);
6593  NENCLOSE(np)->regnum = num;
6594  NENCLOSE(np)->target = *top;
6595  r = scan_env_set_mem_node(env, num, np);
6596  if (r != 0) {
6597  onig_node_free(np);
6598  return r;
6599  }
6600  *top = np;
6601  }
6602 #endif
6603  return 0;
6604 }
6605 
6606 extern int
6607 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
6608  regex_t* reg, ScanEnv* env)
6609 {
6610  int r;
6611  UChar* p;
6612 
6613 #ifdef USE_NAMED_GROUP
6614  names_clear(reg);
6615 #endif
6616 
6617  scan_env_clear(env);
6618  env->option = reg->options;
6619  env->case_fold_flag = reg->case_fold_flag;
6620  env->enc = reg->enc;
6621  env->syntax = reg->syntax;
6622  env->pattern = (UChar* )pattern;
6623  env->pattern_end = (UChar* )end;
6624  env->reg = reg;
6625 
6626  *root = NULL;
6627  p = (UChar* )pattern;
6628  r = parse_regexp(root, &p, (UChar* )end, env);
6629  reg->num_mem = env->num_mem;
6630  return r;
6631 }
6632 
6633 extern void
6634 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED,
6635  UChar* arg, UChar* arg_end)
6636 {
6637  env->error = arg;
6638  env->error_end = arg_end;
6639 }
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define xrealloc
Old name of ruby_xrealloc.
Definition: xmalloc.h:56
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
void rb_compile_warn(const char *file, int line, const char *fmt,...)
Identical to rb_compile_warning(), except it reports always regardless of runtime -W flag.
Definition: error.c:360
#define ruby_verbose
This variable controls whether the interpreter is in debug mode.
Definition: error.h:459
void rb_warn(const char *fmt,...)
Identical to rb_warning(), except it reports always regardless of runtime -W flag.
Definition: error.c:418
VALUE type(ANYARGS)
ANYARGS-ed function type.
Definition: cxxanyargs.hpp:56
#define RTEST
This is an old name of RB_TEST.
Defines old _.
Definition: regint.h:441
Definition: regparse.c:453
Definition: regenc.h:118
Definition: st.h:79