Ruby  3.1.4p223 (2023-03-30 revision HEAD)
string.c
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/internal/config.h"
15 
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
19 
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
23 
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "gc.h"
27 #include "id.h"
28 #include "internal.h"
29 #include "internal/array.h"
30 #include "internal/compar.h"
31 #include "internal/compilers.h"
32 #include "internal/encoding.h"
33 #include "internal/error.h"
34 #include "internal/gc.h"
35 #include "internal/numeric.h"
36 #include "internal/object.h"
37 #include "internal/proc.h"
38 #include "internal/re.h"
39 #include "internal/sanitizers.h"
40 #include "internal/string.h"
41 #include "internal/transcode.h"
42 #include "probes.h"
43 #include "ruby/encoding.h"
44 #include "ruby/re.h"
45 #include "ruby/util.h"
46 #include "ruby_assert.h"
47 #include "vm_sync.h"
48 
49 #if defined HAVE_CRYPT_R
50 # if defined HAVE_CRYPT_H
51 # include <crypt.h>
52 # endif
53 #elif !defined HAVE_CRYPT
54 # include "missing/crypt.h"
55 # define HAVE_CRYPT_R 1
56 #endif
57 
/* Shorthand for indexing the `beg` / `end` arrays of a variable named `regs`,
 * which must be in scope wherever these macros are expanded. */
58 #define BEG(no) (regs->beg[(no)])
59 #define END(no) (regs->end[(no)])
60 
61 #undef rb_str_new
62 #undef rb_usascii_str_new
63 #undef rb_utf8_str_new
64 #undef rb_enc_str_new
65 #undef rb_str_new_cstr
66 #undef rb_tainted_str_new_cstr
67 #undef rb_usascii_str_new_cstr
68 #undef rb_utf8_str_new_cstr
69 #undef rb_enc_str_new_cstr
70 #undef rb_external_str_new_cstr
71 #undef rb_locale_str_new_cstr
72 #undef rb_str_dup_frozen
73 #undef rb_str_buf_new_cstr
74 #undef rb_str_buf_cat
75 #undef rb_str_buf_cat2
76 #undef rb_str_cat2
77 #undef rb_str_cat_cstr
78 #undef rb_fstring_cstr
79 
82 
83 /* FLAGS of RString
84  *
85  * 1: RSTRING_NOEMBED
86  * 2: STR_SHARED (== ELTS_SHARED)
87  * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
88  * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
89  * other strings that rely on this string's buffer)
90  * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
91  * early, specific to rb_str_tmp_frozen_{acquire,release})
92  * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
93  * such as read(2). Any modification and realloc is prohibited)
94  *
95  * 8-9: ENC_CODERANGE (2 bits)
96  * 10-16: ENCODING (7 bits == 128)
97  * 17: RSTRING_FSTR
98  * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
99  * used for a string object based on C string literal)
100  * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
101  * object header is temporarily allocated on C stack)
102  */
103 
104 #define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits, each an alias for one of the generic FL_USER<n>
 * bits; the bit numbers match the "FLAGS of RString" table above. */
105 #define STR_SHARED_ROOT FL_USER5
106 #define STR_BORROWED FL_USER6
107 #define STR_TMPLOCK FL_USER7
108 #define STR_NOFREE FL_USER18
109 #define STR_FAKESTR FL_USER19
110 
/* STR_SET_NOEMBED(str): mark `str` as heap-backed by setting STR_NOEMBED.
 * With USE_RVARGC the STR_SHARED, STR_SHARED_ROOT and STR_BORROWED bits are
 * cleared explicitly; otherwise the embedded length is reset to 0 (per the
 * flags table above, the embed-length bits occupy the same positions as
 * those flags in that configuration). */
111 #define STR_SET_NOEMBED(str) do {\
112  FL_SET((str), STR_NOEMBED);\
113  if (USE_RVARGC) {\
114  FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
115  }\
116  else {\
117  STR_SET_EMBED_LEN((str), 0);\
118  }\
119 } while (0)
/* STR_SET_EMBED(str): mark `str` as embedded by clearing STR_NOEMBED and
 * STR_NOFREE. */
120 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/* STR_SET_EMBED_LEN(str, n): record `n` as the length of an embedded string.
 * With USE_RVARGC the length lives in as.embed.len and `n` is asserted to be
 * strictly below str_embed_capa(str); note that `n` is expanded twice there.
 * Otherwise the length is stored in the RSTRING_EMBED_LEN_MASK bits of the
 * object's flags word. */
121 #if USE_RVARGC
122 # define STR_SET_EMBED_LEN(str, n) do { \
123  assert(str_embed_capa(str) > (n));\
124  RSTRING(str)->as.embed.len = (n);\
125 } while (0)
126 #else
127 # define STR_SET_EMBED_LEN(str, n) do { \
128  long tmp_n = (n);\
129  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
130  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
131 } while (0)
132 #endif
133 
/* STR_SET_LEN(str, n): store `n` as the length of `str`, using the embedded
 * representation when STR_EMBED_P(str) holds and as.heap.len otherwise.
 * Only the length field is touched; no buffer is resized or terminated. */
134 #define STR_SET_LEN(str, n) do { \
135  if (STR_EMBED_P(str)) {\
136  STR_SET_EMBED_LEN((str), (n));\
137  }\
138  else {\
139  RSTRING(str)->as.heap.len = (n);\
140  }\
141 } while (0)
142 
/* STR_DEC_LEN(str): decrement the stored length of `str` by one, for either
 * representation.  No check against a zero length is made here. */
143 #define STR_DEC_LEN(str) do {\
144  if (STR_EMBED_P(str)) {\
145  long n = RSTRING_LEN(str);\
146  n--;\
147  STR_SET_EMBED_LEN((str), n);\
148  }\
149  else {\
150  RSTRING(str)->as.heap.len--;\
151  }\
152 } while (0)
153 
/* TERM_LEN(str): terminator width for `str`, taken as the minimum character
 * length of its encoding. */
154 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/* TERM_FILL(ptr, termlen): write a terminator at `ptr`.  The first byte is
 * always zeroed; when `termlen` exceeds 1 the whole `termlen`-byte span is
 * zeroed with memset.  Both arguments are evaluated exactly once. */
155 #define TERM_FILL(ptr, termlen) do {\
156  char *const term_fill_ptr = (ptr);\
157  const int term_fill_len = (termlen);\
158  *term_fill_ptr = '\0';\
159  if (UNLIKELY(term_fill_len > 1))\
160  memset(term_fill_ptr, 0, term_fill_len);\
161 } while (0)
162 
/* RESIZE_CAPA(str, capacity): RESIZE_CAPA_TERM with the terminator width
 * derived from the string's own encoding via TERM_LEN(str). */
163 #define RESIZE_CAPA(str,capacity) do {\
164  const int termlen = TERM_LEN(str);\
165  RESIZE_CAPA_TERM(str,capacity,termlen);\
166 } while (0)
/*
 * RESIZE_CAPA_TERM(str, capacity, termlen): make room in `str` for `capacity`
 * content bytes plus a `termlen`-byte terminator.
 *
 * - Embedded string: nothing happens while the embedded buffer already holds
 *   capacity + termlen bytes; otherwise a heap buffer of that size is
 *   allocated, the current contents are copied into it and the string is
 *   switched to the heap representation with aux.capa = capacity.
 * - Heap string: must not be shared (asserted); the buffer is reallocated to
 *   capacity + termlen bytes and aux.capa is updated.
 *
 * The stored length is never changed and no terminator is written.
 * `capacity` and `termlen` are parenthesized at every use so that expression
 * arguments (e.g. `a ? b : c` or `n << 1`) are sized the same way in the
 * comparison as in the allocation; they may still be evaluated more than once.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
186 
/* STR_SET_SHARED(str, shared_str): record `shared_str` as the owner of the
 * buffer `str` points into.  Does nothing when `str` carries STR_FAKESTR.
 * Otherwise it asserts that str's pointer lies within
 * [ptr, ptr + len] of `shared_str`, stores the reference through
 * RB_OBJ_WRITE, sets STR_SHARED on `str` and STR_SHARED_ROOT on
 * `shared_str`, and additionally sets STR_BORROWED on `shared_str` when its
 * class is 0. */
187 #define STR_SET_SHARED(str, shared_str) do { \
188  if (!FL_TEST(str, STR_FAKESTR)) { \
189  assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
190  assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
191  RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
192  FL_SET((str), STR_SHARED); \
193  FL_SET((shared_str), STR_SHARED_ROOT); \
194  if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
195  FL_SET_RAW((shared_str), STR_BORROWED); \
196  } \
197 } while (0)
198 
/* Direct access to the heap buffer pointer of `str`. */
199 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap allocation size of `str`: the recorded capacity plus the terminator
 * width, because aux.capa itself excludes the terminator. */
200 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
201 /* TODO: include the terminator size in capa. */
202 
203 #define STR_ENC_GET(str) get_encoding(str)
204 
/* SHARABLE_SUBSTRING_P(beg, len, end): unless SHARABLE_MIDDLE_SUBSTRING is
 * enabled, true only when the range starting at `beg` with length `len`
 * finishes exactly at `end`; with it enabled, always true. */
205 #if !defined SHARABLE_MIDDLE_SUBSTRING
206 # define SHARABLE_MIDDLE_SUBSTRING 0
207 #endif
208 #if !SHARABLE_MIDDLE_SUBSTRING
209 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
210 #else
211 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
212 #endif
213 
214 
215 static inline long
216 str_embed_capa(VALUE str)
217 {
218 #if USE_RVARGC
219  return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
220 #else
221  return RSTRING_EMBED_LEN_MAX + 1;
222 #endif
223 }
224 
225 static inline size_t
226 str_embed_size(long capa)
227 {
228  return offsetof(struct RString, as.embed.ary) + capa;
229 }
230 
231 static inline bool
232 STR_EMBEDDABLE_P(long len, long termlen)
233 {
234 #if USE_RVARGC
235  return rb_gc_size_allocatable_p(str_embed_size(len + termlen));
236 #else
237  return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
238 #endif
239 }
240 
241 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
242 static VALUE str_new_frozen(VALUE klass, VALUE orig);
243 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
244 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
245 static VALUE str_new(VALUE klass, const char *ptr, long len);
246 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
247 static inline void str_modifiable(VALUE str);
248 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
249 
250 static inline void
251 str_make_independent(VALUE str)
252 {
253  long len = RSTRING_LEN(str);
254  int termlen = TERM_LEN(str);
255  str_make_independent_expand((str), len, 0L, termlen);
256 }
257 
258 static inline int str_dependent_p(VALUE str);
259 
260 void
261 rb_str_make_independent(VALUE str)
262 {
263  if (str_dependent_p(str)) {
264  str_make_independent(str);
265  }
266 }
267 
/*
 * Debugging aid: report on stderr that +func+ is about to return a NULL
 * string pointer.
 *
 * func - name of the reporting function, printed at the start of the message.
 *
 * The message is terminated with a newline so that it stays on its own line
 * and is not glued to whatever is written to stderr next.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately. "
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
277 
278 /* symbols for [up|down|swap]case/capitalize options */
279 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
280 
281 static rb_encoding *
282 get_actual_encoding(const int encidx, VALUE str)
283 {
284  const unsigned char *q;
285 
286  switch (encidx) {
287  case ENCINDEX_UTF_16:
288  if (RSTRING_LEN(str) < 2) break;
289  q = (const unsigned char *)RSTRING_PTR(str);
290  if (q[0] == 0xFE && q[1] == 0xFF) {
291  return rb_enc_get_from_index(ENCINDEX_UTF_16BE);
292  }
293  if (q[0] == 0xFF && q[1] == 0xFE) {
294  return rb_enc_get_from_index(ENCINDEX_UTF_16LE);
295  }
296  return rb_ascii8bit_encoding();
297  case ENCINDEX_UTF_32:
298  if (RSTRING_LEN(str) < 4) break;
299  q = (const unsigned char *)RSTRING_PTR(str);
300  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
301  return rb_enc_get_from_index(ENCINDEX_UTF_32BE);
302  }
303  if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
304  return rb_enc_get_from_index(ENCINDEX_UTF_32LE);
305  }
306  return rb_ascii8bit_encoding();
307  }
308  return rb_enc_from_index(encidx);
309 }
310 
311 static rb_encoding *
312 get_encoding(VALUE str)
313 {
314  return get_actual_encoding(ENCODING_GET(str), str);
315 }
316 
317 static void
318 mustnot_broken(VALUE str)
319 {
320  if (is_broken_string(str)) {
321  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
322  }
323 }
324 
325 static void
326 mustnot_wchar(VALUE str)
327 {
328  rb_encoding *enc = STR_ENC_GET(str);
329  if (rb_enc_mbminlen(enc) > 1) {
330  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
331  }
332 }
333 
334 static int fstring_cmp(VALUE a, VALUE b);
335 
336 static VALUE register_fstring(VALUE str, bool copy);
337 
/* Hash-type descriptor pairing fstring_cmp() with rb_str_hash().
 * NOTE(review): presumably installed on the table returned by
 * rb_vm_fstring_table(); the installation site is not visible here. */
338 const struct st_hash_type rb_fstring_hash_type = {
339  fstring_cmp,
340  rb_str_hash,
341 };
342 
/* True when `str` has no FL_EXIVAR flag and its class is exactly rb_cString. */
343 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
344 
/* NOTE(review): the opening line of this aggregate (original line 345) is
 * missing from this listing.  fstr_update_callback() casts its `data`
 * argument to `struct fstr_update_arg *` and uses exactly these two members,
 * so they presumably belong to that struct -- confirm against the full file. */
/* In/out slot: register_fstring() stores the candidate string here before
 * each st_update() call; the callback overwrites it with the resulting string
 * or with Qundef to request another attempt. */
346  VALUE fstr;
/* When true, a fake string is materialized with str_new() (bytes copied)
 * rather than str_new_static() (pointer reused). */
347  bool copy;
348 };
349 
350 static int
351 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
352 {
353 
354  struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
355  VALUE str = (VALUE)*key;
356 
357  if (existing) {
358  /* because of lazy sweep, str may be unmarked already and swept
359  * at next time */
360 
361  if (rb_objspace_garbage_object_p(str)) {
362  arg->fstr = Qundef;
363  return ST_DELETE;
364  }
365 
366  arg->fstr = str;
367  return ST_STOP;
368  }
369  else {
370  if (FL_TEST_RAW(str, STR_FAKESTR)) {
371  if (arg->copy) {
372  VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
373  rb_enc_copy(new_str, str);
374  str = new_str;
375  }
376  else {
377  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
378  RSTRING(str)->as.heap.len,
379  ENCODING_GET(str));
380  }
381  OBJ_FREEZE_RAW(str);
382  }
383  else {
384  if (!OBJ_FROZEN(str))
385  str = str_new_frozen(rb_cString, str);
386  if (STR_SHARED_P(str)) { /* str should not be shared */
387  /* shared substring */
388  str_make_independent(str);
389  assert(OBJ_FROZEN(str));
390  }
391  if (!BARE_STRING_P(str)) {
392  str = str_new_frozen(rb_cString, str);
393  }
394  }
395  RBASIC(str)->flags |= RSTRING_FSTR;
396 
397  *key = *value = arg->fstr = str;
398  return ST_CONTINUE;
399  }
400 }
401 
402 RUBY_FUNC_EXPORTED
403 VALUE
404 rb_fstring(VALUE str)
405 {
406  VALUE fstr;
407  int bare;
408 
409  Check_Type(str, T_STRING);
410 
411  if (FL_TEST(str, RSTRING_FSTR))
412  return str;
413 
414  bare = BARE_STRING_P(str);
415  if (!bare) {
416  if (STR_EMBED_P(str)) {
417  OBJ_FREEZE_RAW(str);
418  return str;
419  }
420  if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
421  assert(OBJ_FROZEN(str));
422  return str;
423  }
424  }
425 
426  if (!OBJ_FROZEN(str))
427  rb_str_resize(str, RSTRING_LEN(str));
428 
429  fstr = register_fstring(str, FALSE);
430 
431  if (!bare) {
432  str_replace_shared_without_enc(str, fstr);
433  OBJ_FREEZE_RAW(str);
434  return str;
435  }
436  return fstr;
437 }
438 
439 static VALUE
440 register_fstring(VALUE str, bool copy)
441 {
442  struct fstr_update_arg args;
443  args.copy = copy;
444 
445  RB_VM_LOCK_ENTER();
446  {
447  st_table *frozen_strings = rb_vm_fstring_table();
448  do {
449  args.fstr = str;
450  st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
451  } while (args.fstr == Qundef);
452  }
453  RB_VM_LOCK_LEAVE();
454 
455  assert(OBJ_FROZEN(args.fstr));
456  assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
457  assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
458  assert(RBASIC_CLASS(args.fstr) == rb_cString);
459  return args.fstr;
460 }
461 
462 static VALUE
463 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
464 {
465  fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
466  /* SHARED to be allocated by the callback */
467 
468  if (!name) {
469  RUBY_ASSERT_ALWAYS(len == 0);
470  name = "";
471  }
472 
473  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
474 
475  RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
476  fake_str->as.heap.len = len;
477  fake_str->as.heap.ptr = (char *)name;
478  fake_str->as.heap.aux.capa = len;
479  return (VALUE)fake_str;
480 }
481 
482 /*
483  * set up a fake string which refers a static string literal.
484  */
485 VALUE
486 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
487 {
488  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
489 }
490 
491 /*
492  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
493  * shared string which refers a static string literal. `ptr` must
494  * point a constant string.
495  */
496 MJIT_FUNC_EXPORTED VALUE
497 rb_fstring_new(const char *ptr, long len)
498 {
499  struct RString fake_str;
500  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
501 }
502 
503 VALUE
504 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
505 {
506  struct RString fake_str;
507  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
508 }
509 
510 VALUE
511 rb_fstring_cstr(const char *ptr)
512 {
513  return rb_fstring_new(ptr, strlen(ptr));
514 }
515 
516 static int
517 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
518 {
519  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
520  return ST_CONTINUE;
521 }
522 
523 static int
524 fstring_cmp(VALUE a, VALUE b)
525 {
526  long alen, blen;
527  const char *aptr, *bptr;
528  RSTRING_GETMEM(a, aptr, alen);
529  RSTRING_GETMEM(b, bptr, blen);
530  return (alen != blen ||
531  ENCODING_GET(a) != ENCODING_GET(b) ||
532  memcmp(aptr, bptr, alen) != 0);
533 }
534 
535 static inline int
536 single_byte_optimizable(VALUE str)
537 {
538  rb_encoding *enc;
539 
540  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
541  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
542  return 1;
543 
544  enc = STR_ENC_GET(str);
545  if (rb_enc_mbmaxlen(enc) == 1)
546  return 1;
547 
548  /* Conservative. Possibly single byte.
549  * "\xa1" in Shift_JIS for example. */
550  return 0;
551 }
552 
554 
/*
 * Return a pointer to the first byte in [p, e) that has its high bit (0x80)
 * set, or NULL when there is no such byte.
 *
 * The scan has three phases:
 *  1. unless UNALIGNED_WORD_ACCESS, bytes are checked one by one until `p`
 *     is aligned to SIZEOF_VOIDP;
 *  2. whole uintptr_t words are tested against NONASCII_MASK, which has 0x80
 *     in every byte;
 *  3. the remaining tail of fewer than SIZEOF_VOIDP bytes is checked one by
 *     one.
 * Phases 1 and 2 are entered only when UNALIGNED_WORD_ACCESS is set or at
 * least SIZEOF_VOIDP bytes are available.
 */
555 static inline const char *
556 search_nonascii(const char *p, const char *e)
557 {
558  const uintptr_t *s, *t;
559 
560 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
561 # if SIZEOF_UINTPTR_T == 8
562 # define NONASCII_MASK UINT64_C(0x8080808080808080)
563 # elif SIZEOF_UINTPTR_T == 4
564 # define NONASCII_MASK UINT32_C(0x80808080)
565 # else
566 # error "don't know what to do."
567 # endif
568 #else
569 # if SIZEOF_UINTPTR_T == 8
570 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
571 # elif SIZEOF_UINTPTR_T == 4
572 # define NONASCII_MASK 0x80808080UL /* or...? */
573 # else
574 # error "don't know what to do."
575 # endif
576 #endif
577 
578  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
579 #if !UNALIGNED_WORD_ACCESS
580  if ((uintptr_t)p % SIZEOF_VOIDP) {
/* `l` is the number of bytes up to the next aligned address.  `p` is
 * advanced first; the cases then deliberately fall through, testing those
 * `l` bytes in ascending address order via negative offsets. */
581  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
582  p += l;
583  switch (l) {
584  default: UNREACHABLE;
585 #if SIZEOF_VOIDP > 4
586  case 7: if (p[-7]&0x80) return p-7;
587  case 6: if (p[-6]&0x80) return p-6;
588  case 5: if (p[-5]&0x80) return p-5;
589  case 4: if (p[-4]&0x80) return p-4;
590 #endif
591  case 3: if (p[-3]&0x80) return p-3;
592  case 2: if (p[-2]&0x80) return p-2;
593  case 1: if (p[-1]&0x80) return p-1;
594  case 0: break;
595  }
596  }
597 #endif
598 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
599 #define aligned_ptr(value) \
600  __builtin_assume_aligned((value), sizeof(uintptr_t))
601 #else
602 #define aligned_ptr(value) (uintptr_t *)(value)
603 #endif
604  s = aligned_ptr(p);
/* `t` bounds the word loop so that every word read lies entirely below `e`. */
605  t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
606 #undef aligned_ptr
607  for (;s < t; s++) {
608  if (*s & NONASCII_MASK) {
/* Turn the position of the first flagged bit (counted from the most
 * significant end on big-endian, from the least significant end otherwise)
 * into a byte offset within the word. */
609 #ifdef WORDS_BIGENDIAN
610  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
611 #else
612  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
613 #endif
614  }
615  }
616  p = (const char *)s;
617  }
618 
/* Tail: fewer than SIZEOF_VOIDP bytes remain; the cases deliberately fall
 * through so the bytes are tested in ascending address order relative to `e`. */
619  switch (e - p) {
620  default: UNREACHABLE;
621 #if SIZEOF_VOIDP > 4
622  case 7: if (e[-7]&0x80) return e-7;
623  case 6: if (e[-6]&0x80) return e-6;
624  case 5: if (e[-5]&0x80) return e-5;
625  case 4: if (e[-4]&0x80) return e-4;
626 #endif
627  case 3: if (e[-3]&0x80) return e-3;
628  case 2: if (e[-2]&0x80) return e-2;
629  case 1: if (e[-1]&0x80) return e-1;
630  case 0: return NULL;
631  }
632 }
633 
634 static int
635 coderange_scan(const char *p, long len, rb_encoding *enc)
636 {
637  const char *e = p + len;
638 
639  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
640  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
641  p = search_nonascii(p, e);
643  }
644 
645  if (rb_enc_asciicompat(enc)) {
646  p = search_nonascii(p, e);
647  if (!p) return ENC_CODERANGE_7BIT;
648  for (;;) {
649  int ret = rb_enc_precise_mbclen(p, e, enc);
650  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
651  p += MBCLEN_CHARFOUND_LEN(ret);
652  if (p == e) break;
653  p = search_nonascii(p, e);
654  if (!p) break;
655  }
656  }
657  else {
658  while (p < e) {
659  int ret = rb_enc_precise_mbclen(p, e, enc);
660  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
661  p += MBCLEN_CHARFOUND_LEN(ret);
662  }
663  }
664  return ENC_CODERANGE_VALID;
665 }
666 
/*
 * Scan the bytes in [s, e) under +enc+ and return how many bytes were
 * consumed: e - s when the scan reaches the end, or the offset of the
 * position where rb_enc_precise_mbclen() failed to find a character.
 * +cr+ is an in/out code range: a value of ENC_CODERANGE_BROKEN on entry
 * makes the function return e - s immediately.
 *
 * NOTE(review): this listing skips original lines 679, 691 and 704 (marked
 * below), so some statements -- presumably updates of *cr -- are not
 * visible; the description above covers only what is shown.
 */
667 long
668 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
669 {
670  const char *p = s;
671 
672  if (*cr == ENC_CODERANGE_BROKEN)
673  return e - s;
674 
675  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
676  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
677  if (*cr == ENC_CODERANGE_VALID) return e - s;
678  p = search_nonascii(p, e);
/* NOTE(review): original line 679 is missing from this listing. */
680  return e - s;
681  }
682  else if (rb_enc_asciicompat(enc)) {
683  p = search_nonascii(p, e);
684  if (!p) {
685  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
686  return e - s;
687  }
688  for (;;) {
689  int ret = rb_enc_precise_mbclen(p, e, enc);
690  if (!MBCLEN_CHARFOUND_P(ret)) {
/* NOTE(review): original line 691 is missing from this listing. */
692  return p - s;
693  }
694  p += MBCLEN_CHARFOUND_LEN(ret);
695  if (p == e) break;
696  p = search_nonascii(p, e);
697  if (!p) break;
698  }
699  }
700  else {
701  while (p < e) {
702  int ret = rb_enc_precise_mbclen(p, e, enc);
703  if (!MBCLEN_CHARFOUND_P(ret)) {
/* NOTE(review): original line 704 is missing from this listing. */
705  return p - s;
706  }
707  p += MBCLEN_CHARFOUND_LEN(ret);
708  }
709  }
710  *cr = ENC_CODERANGE_VALID;
711  return e - s;
712 }
713 
714 static inline void
715 str_enc_copy(VALUE str1, VALUE str2)
716 {
717  rb_enc_set_index(str1, ENCODING_GET(str2));
718 }
719 
/*
 * Copy the encoding of +src+ to +dest+, where +dest+ was made from a part
 * of +src+, and then decide the code range of +dest+ from that of +src+.
 *
 * NOTE(review): this listing skips original lines 729, 731, 736, 741 and
 * 743 (marked below).  The statements that actually assign the code range
 * are therefore not visible, and the block is not syntactically complete
 * as shown.
 */
720 static void
721 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
722 {
723  /* this function is designed for copying encoding and coderange
724  * from src to new string "dest" which is made from the part of src.
725  */
726  str_enc_copy(dest, src);
727  if (RSTRING_LEN(dest) == 0) {
728  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
/* NOTE(review): original line 729 is missing from this listing. */
730  else
/* NOTE(review): original line 731 is missing from this listing. */
732  return;
733  }
734  switch (ENC_CODERANGE(src)) {
735  case ENC_CODERANGE_7BIT:
/* NOTE(review): original line 736 is missing from this listing. */
737  break;
738  case ENC_CODERANGE_VALID:
739  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
740  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
/* NOTE(review): original line 741 is missing from this listing. */
742  else
/* NOTE(review): original line 743 is missing from this listing. */
744  break;
745  default:
746  break;
747  }
748 }
749 
750 static void
751 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
752 {
753  str_enc_copy(dest, src);
754  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
755 }
756 
757 static int
758 enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
759 {
760  if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
761  rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
762  return ENC_CODERANGE_BROKEN;
763  }
764  else {
765  return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
766  }
767 }
768 
769 int
770 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
771 {
772  return enc_coderange_scan(str, enc, rb_enc_to_index(enc));
773 }
774 
/*
 * Return the code range of `str`.  When the cached value is
 * ENC_CODERANGE_UNKNOWN, the range is computed with enc_coderange_scan()
 * using the string's own encoding index and stored back on the string.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 776) is missing from this listing; the name is presumably
 * rb_enc_str_coderange, which other code in this file calls with a single
 * string argument -- confirm against the full file.
 */
775 int
777 {
778  int cr = ENC_CODERANGE(str);
779 
780  if (cr == ENC_CODERANGE_UNKNOWN) {
781  int encidx = ENCODING_GET(str);
782  rb_encoding *enc = rb_enc_from_index(encidx);
783  cr = enc_coderange_scan(str, enc, encidx);
784  ENC_CODERANGE_SET(str, cr);
785  }
786  return cr;
787 }
788 
/*
 * Return TRUE when the encoding of `str` is ASCII-compatible and its code
 * range is ENC_CODERANGE_7BIT, FALSE otherwise.  The code range may be
 * computed and cached as a side effect of rb_enc_str_coderange().
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 790) is missing from this listing, so the signature cannot
 * be confirmed from here.
 */
789 int
791 {
792  rb_encoding *enc = STR_ENC_GET(str);
793 
794  if (!rb_enc_asciicompat(enc))
795  return FALSE;
796  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
797  return TRUE;
798  return FALSE;
799 }
800 
801 static inline void
802 str_mod_check(VALUE s, const char *p, long len)
803 {
804  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
805  rb_raise(rb_eRuntimeError, "string modified");
806  }
807 }
808 
809 static size_t
810 str_capacity(VALUE str, const int termlen)
811 {
812  if (STR_EMBED_P(str)) {
813 #if USE_RVARGC
814  return str_embed_capa(str) - termlen;
815 #else
816  return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
817 #endif
818  }
819  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
820  return RSTRING(str)->as.heap.len;
821  }
822  else {
823  return RSTRING(str)->as.heap.aux.capa;
824  }
825 }
826 
/*
 * Capacity of `str` as computed by str_capacity(), using the terminator
 * width of the string's own encoding.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 828) is missing from this listing; presumably this is
 * rb_str_capacity, which str_cat_conv_enc_opts() calls -- confirm.
 */
827 size_t
829 {
830  return str_capacity(str, TERM_LEN(str));
831 }
832 
833 static inline void
834 must_not_null(const char *ptr)
835 {
836  if (!ptr) {
837  rb_raise(rb_eArgError, "NULL pointer given");
838  }
839 }
840 
/*
 * Allocate a new string object of class `klass` through RVARGC_NEWOBJ_OF
 * and return it as a VALUE.  `size` must be positive (asserted).
 *
 * NOTE(review): original line 846, which carried the remaining arguments of
 * the RVARGC_NEWOBJ_OF invocation, is missing from this listing; how `size`
 * and the initial flags are passed is not visible here.
 */
841 static inline VALUE
842 str_alloc(VALUE klass, size_t size)
843 {
844  assert(size > 0);
845  RVARGC_NEWOBJ_OF(str, struct RString, klass,
847  return (VALUE)str;
848 }
849 
850 static inline VALUE
851 str_alloc_embed(VALUE klass, size_t capa)
852 {
853  size_t size = str_embed_size(capa);
854  assert(rb_gc_size_allocatable_p(size));
855 #if !USE_RVARGC
856  assert(size <= sizeof(struct RString));
857 #endif
858  return str_alloc(klass, size);
859 }
860 
861 static inline VALUE
862 str_alloc_heap(VALUE klass)
863 {
864  return str_alloc(klass, sizeof(struct RString));
865 }
866 
867 static inline VALUE
868 empty_str_alloc(VALUE klass)
869 {
870  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
871  VALUE str = str_alloc_embed(klass, 0);
872  memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
873  return str;
874 }
875 
/*
 * Create a string of class `klass` holding `len` bytes followed by a
 * `termlen`-byte zero terminator.
 *
 * ptr     - source bytes to copy, or NULL to leave the content bytes
 *           uncopied.
 * len     - content length; a negative value raises ArgumentError.
 * termlen - terminator width in bytes.
 *
 * The string is embedded when STR_EMBEDDABLE_P(len, termlen) holds;
 * otherwise a heap buffer of len + termlen bytes is allocated and the
 * capacity is recorded as `len`.
 *
 * NOTE(review): original line 890 (the body of the `len == 0` branch) is
 * missing from this listing.
 */
876 static VALUE
877 str_new0(VALUE klass, const char *ptr, long len, int termlen)
878 {
879  VALUE str;
880 
881  if (len < 0) {
882  rb_raise(rb_eArgError, "negative string size (or size too big)");
883  }
884 
885  RUBY_DTRACE_CREATE_HOOK(STRING, len);
886 
887  if (STR_EMBEDDABLE_P(len, termlen)) {
888  str = str_alloc_embed(klass, len + termlen);
889  if (len == 0) {
/* NOTE(review): original line 890 is missing from this listing. */
891  }
892  }
893  else {
894  str = str_alloc_heap(klass);
895  RSTRING(str)->as.heap.aux.capa = len;
896  /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
897  * integer overflow. If we can STATIC_ASSERT that, the following
898  * mul_add_mul can be reverted to a simple ALLOC_N. */
899  RSTRING(str)->as.heap.ptr =
900  rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
901  STR_SET_NOEMBED(str);
902  }
903  if (ptr) {
904  memcpy(RSTRING_PTR(str), ptr, len);
905  }
906  STR_SET_LEN(str, len);
907  TERM_FILL(RSTRING_PTR(str) + len, termlen);
908  return str;
909 }
910 
911 static VALUE
912 str_new(VALUE klass, const char *ptr, long len)
913 {
914  return str_new0(klass, ptr, len, 1);
915 }
916 
917 VALUE
918 rb_str_new(const char *ptr, long len)
919 {
920  return str_new(rb_cString, ptr, len);
921 }
922 
/*
 * Create a String from the `len` bytes at `ptr` via rb_str_new().
 *
 * NOTE(review): original line 927 is missing from this listing; judging by
 * the function name it presumably assigned the US-ASCII encoding to `str`
 * -- confirm against the full file.
 */
923 VALUE
924 rb_usascii_str_new(const char *ptr, long len)
925 {
926  VALUE str = rb_str_new(ptr, len);
928  return str;
929 }
930 
/*
 * Create a String from the `len` bytes at `ptr` via str_new().
 *
 * NOTE(review): original line 935 is missing from this listing; judging by
 * the function name it presumably assigned the UTF-8 encoding to `str`
 * -- confirm against the full file.
 */
931 VALUE
932 rb_utf8_str_new(const char *ptr, long len)
933 {
934  VALUE str = str_new(rb_cString, ptr, len);
936  return str;
937 }
938 
939 VALUE
940 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
941 {
942  VALUE str;
943 
944  if (!enc) return rb_str_new(ptr, len);
945 
946  str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
947  rb_enc_associate(str, enc);
948  return str;
949 }
950 
951 VALUE
952 rb_str_new_cstr(const char *ptr)
953 {
954  must_not_null(ptr);
955  /* rb_str_new_cstr() can take pointer from non-malloc-generated
956  * memory regions, and that cannot be detected by the MSAN. Just
957  * trust the programmer that the argument passed here is a sane C
958  * string. */
959  __msan_unpoison_string(ptr);
960  return rb_str_new(ptr, strlen(ptr));
961 }
962 
/*
 * NOTE(review): two function fragments follow.  For each, the line with the
 * function name and parameters (original lines 964 and 972) and one body
 * line (967 and 975) are missing from this listing.  What is visible: each
 * builds a String from `ptr` with rb_str_new_cstr() and returns it.  They
 * are presumably the US-ASCII and UTF-8 C-string constructors named in the
 * #undef list at the top of the file -- confirm against the full file.
 */
963 VALUE
965 {
966  VALUE str = rb_str_new_cstr(ptr);
968  return str;
969 }
970 
971 VALUE
973 {
974  VALUE str = rb_str_new_cstr(ptr);
976  return str;
977 }
978 
/*
 * Create a String in encoding `enc` from the NUL-terminated C string `ptr`.
 * Raises ArgumentError when `ptr` is NULL or when the minimum character
 * length of `enc` is not 1; the length is taken with strlen().
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 980) is missing from this listing, so the signature cannot
 * be confirmed from here.
 */
979 VALUE
981 {
982  must_not_null(ptr);
983  if (rb_enc_mbminlen(enc) != 1) {
984  rb_raise(rb_eArgError, "wchar encoding given");
985  }
986  return rb_enc_str_new(ptr, strlen(ptr), enc);
987 }
988 
989 static VALUE
990 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
991 {
992  VALUE str;
993 
994  if (len < 0) {
995  rb_raise(rb_eArgError, "negative string size (or size too big)");
996  }
997 
998  if (!ptr) {
999  rb_encoding *enc = rb_enc_get_from_index(encindex);
1000  str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1001  }
1002  else {
1003  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1004  str = str_alloc_heap(klass);
1005  RSTRING(str)->as.heap.len = len;
1006  RSTRING(str)->as.heap.ptr = (char *)ptr;
1007  RSTRING(str)->as.heap.aux.capa = len;
1008  STR_SET_NOEMBED(str);
1009  RBASIC(str)->flags |= STR_NOFREE;
1010  }
1011  rb_enc_associate_index(str, encindex);
1012  return str;
1013 }
1014 
1015 VALUE
1016 rb_str_new_static(const char *ptr, long len)
1017 {
1018  return str_new_static(rb_cString, ptr, len, 0);
1019 }
1020 
/*
 * str_new_static() for class rb_cString with the US-ASCII encoding index.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 1022) is missing from this listing, so the signature
 * cannot be confirmed from here.
 */
1021 VALUE
1023 {
1024  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1025 }
1026 
1027 VALUE
1028 rb_utf8_str_new_static(const char *ptr, long len)
1029 {
1030  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1031 }
1032 
1033 VALUE
1034 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1035 {
1036  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1037 }
1038 
1039 VALUE
1040 rb_tainted_str_new(const char *ptr, long len)
1041 {
1042  rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new", NULL);
1043  return rb_str_new(ptr, len);
1044 }
1045 
/*
 * Deprecated: emits a deprecation warning (removal announced for 3.2) and
 * then behaves exactly like rb_str_new_cstr().
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 1047) is missing from this listing; the warning text
 * suggests the name is rb_tainted_str_new_cstr.
 */
1046 VALUE
1048 {
1049  rb_warn_deprecated_to_remove_at(3.2, "rb_tainted_str_new_cstr", NULL);
1050  return rb_str_new_cstr(ptr);
1051 }
1052 
1053 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1054  rb_encoding *from, rb_encoding *to,
1055  int ecflags, VALUE ecopts);
1056 
1057 static inline bool
1058 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1059 {
1060  int encidx = rb_enc_to_index(enc);
1061  if (rb_enc_get_index(str) == encidx)
1062  return is_ascii_string(str);
1063  return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
1064 }
1065 
1066 VALUE
1067 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1068 {
1069  long len;
1070  const char *ptr;
1071  VALUE newstr;
1072 
1073  if (!to) return str;
1074  if (!from) from = rb_enc_get(str);
1075  if (from == to) return str;
1076  if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1077  to == rb_ascii8bit_encoding()) {
1078  if (STR_ENC_GET(str) != to) {
1079  str = rb_str_dup(str);
1080  rb_enc_associate(str, to);
1081  }
1082  return str;
1083  }
1084 
1085  RSTRING_GETMEM(str, ptr, len);
1086  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1087  from, to, ecflags, ecopts);
1088  if (NIL_P(newstr)) {
1089  /* some error, return original */
1090  return str;
1091  }
1092  return newstr;
1093 }
1094 
1095 VALUE
1096 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1097  rb_encoding *from, int ecflags, VALUE ecopts)
1098 {
1099  long olen;
1100 
1101  olen = RSTRING_LEN(newstr);
1102  if (ofs < -olen || olen < ofs)
1103  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1104  if (ofs < 0) ofs += olen;
1105  if (!from) {
1106  STR_SET_LEN(newstr, ofs);
1107  return rb_str_cat(newstr, ptr, len);
1108  }
1109 
1110  rb_str_modify(newstr);
1111  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1112  rb_enc_get(newstr),
1113  ecflags, ecopts);
1114 }
1115 
1116 VALUE
1117 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1118 {
1119  STR_SET_LEN(str, 0);
1120  rb_enc_associate(str, enc);
1121  rb_str_cat(str, ptr, len);
1122  return str;
1123 }
1124 
/*
 * Convert `len` bytes at `ptr` from encoding `from` to encoding `to`,
 * writing the output into `newstr` starting at byte offset `ofs`.
 *
 * Returns `newstr` (length set to the end of the converted output and
 * associated with `to`) when the converter finishes, and Qnil when the
 * converter cannot be opened or ends in any other state.
 *
 * The converter is stored in an EncodingConverter wrapper object while the
 * loop runs and detached from it before being closed.  Inside the loop the
 * destination is enlarged: the extra room is estimated from the
 * output/input ratio observed so far, falls back to the current capacity
 * when no ratio is available, and is at least 2 bytes.
 *
 * NOTE(review): original line 1151, the end of the `while` condition, is
 * missing from this listing; judging by the comment that follows it, the
 * loop presumably continues while the converter reports a short
 * destination buffer -- confirm against the full file.
 */
1125 static VALUE
1126 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1127  rb_encoding *from, rb_encoding *to,
1128  int ecflags, VALUE ecopts)
1129 {
1130  rb_econv_t *ec;
1131  rb_econv_result_t ret;
1132  long olen;
1133  VALUE econv_wrapper;
1134  const unsigned char *start, *sp;
1135  unsigned char *dest, *dp;
1136  size_t converted_output = (size_t)ofs;
1137 
1138  olen = rb_str_capacity(newstr);
1139 
1140  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1141  RBASIC_CLEAR_CLASS(econv_wrapper);
1142  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1143  if (!ec) return Qnil;
1144  DATA_PTR(econv_wrapper) = ec;
1145 
1146  sp = (unsigned char*)ptr;
1147  start = sp;
1148  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1149  (dp = dest + converted_output),
1150  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
/* NOTE(review): original line 1151 is missing from this listing. */
1152  /* destination buffer short */
1153  size_t converted_input = sp - start;
1154  size_t rest = len - converted_input;
1155  converted_output = dp - dest;
1156  rb_str_set_len(newstr, converted_output);
1157  if (converted_input && converted_output &&
1158  rest < (LONG_MAX / converted_output)) {
1159  rest = (rest * converted_output) / converted_input;
1160  }
1161  else {
1162  rest = olen;
1163  }
1164  olen += rest < 2 ? 2 : rest;
1165  rb_str_resize(newstr, olen);
1166  }
1167  DATA_PTR(econv_wrapper) = 0;
1168  rb_econv_close(ec);
1169  switch (ret) {
1170  case econv_finished:
1171  len = dp - (unsigned char*)RSTRING_PTR(newstr);
1172  rb_str_set_len(newstr, len);
1173  rb_enc_associate(newstr, to);
1174  return newstr;
1175 
1176  default:
1177  return Qnil;
1178  }
1179 }
1180 
/*
 * rb_str_conv_enc_opts() with no conversion flags (0) and no options (Qnil).
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 1182) is missing from this listing; presumably this is
 * rb_str_conv_enc, which other code in this file calls with
 * (str, from, to) -- confirm.
 */
1181 VALUE
1183 {
1184  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1185 }
1186 
/*
 * Build a String from the `len` bytes at `ptr`, which are in encoding
 * `eenc`, converting to the encoding held in `ienc` when one applies.
 *
 * Visible behaviour, in order:
 *  - NULL `ptr`: a string of `len` bytes in `eenc`;
 *  - `eenc` is ASCII-8BIT, or is US-ASCII while the bytes contain a
 *    non-ASCII byte: rb_str_new() without an explicit encoding;
 *  - `ienc` is NULL or equals `eenc`: a string in `eenc`;
 *  - `eenc` is US-ASCII, or is ASCII-compatible and the bytes are all
 *    ASCII: the same bytes tagged with `ienc`;
 *  - otherwise the bytes are converted into a new `ienc` string; if the
 *    conversion fails, the string is re-initialized with the raw bytes in
 *    `eenc`.
 *
 * NOTE(review): two lines are missing from this listing: the function name
 * and parameter list (original line 1188) and the statement that assigns
 * `ienc` (original line 1204; the neighbouring comments suggest it is the
 * default internal encoding).  Confirm against the full file.
 */
1187 VALUE
1189 {
1190  rb_encoding *ienc;
1191  VALUE str;
1192  const int eidx = rb_enc_to_index(eenc);
1193 
1194  if (!ptr) {
1195  return rb_enc_str_new(ptr, len, eenc);
1196  }
1197 
1198  /* ASCII-8BIT case, no conversion */
1199  if ((eidx == rb_ascii8bit_encindex()) ||
1200  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1201  return rb_str_new(ptr, len);
1202  }
1203  /* no default_internal or same encoding, no conversion */
/* NOTE(review): original line 1204 is missing from this listing. */
1205  if (!ienc || eenc == ienc) {
1206  return rb_enc_str_new(ptr, len, eenc);
1207  }
1208  /* ASCII compatible, and ASCII only string, no conversion in
1209  * default_internal */
1210  if ((eidx == rb_ascii8bit_encindex()) ||
1211  (eidx == rb_usascii_encindex()) ||
1212  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1213  return rb_enc_str_new(ptr, len, ienc);
1214  }
1215  /* convert from the given encoding to default_internal */
1216  str = rb_enc_str_new(NULL, 0, ienc);
1217  /* when the conversion failed for some reason, just ignore the
1218  * default_internal and result in the given encoding as-is. */
1219  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1220  rb_str_initialize(str, ptr, len, eenc);
1221  }
1222  return str;
1223 }
1224 
/*
 * Tags str with encoding eenc and returns it converted to the default
 * internal encoding via rb_str_conv_enc().
 *
 * NOTE(review): listing lines 1230-1231 are missing, i.e. the rest of the
 * `if` condition and the statement executed before `return str`.  The
 * visible part only shows that a US-ASCII eenc combined with some further
 * condition returns str early without conversion -- restore from upstream.
 */
1225 VALUE
1226 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1227 {
1228  int eidx = rb_enc_to_index(eenc);
1229  if (eidx == rb_usascii_encindex() &&
1232  return str;
1233  }
1234  rb_enc_associate_index(str, eidx);
1235  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1236 }
1237 
/*
 * NOTE(review): the body statement (listing line 1241) is missing from this
 * listing, so the function appears empty here although it is declared to
 * return VALUE.  Presumably it builds the string from ptr/len using the
 * default external encoding -- confirm against upstream.
 */
1238 VALUE
1239 rb_external_str_new(const char *ptr, long len)
1240 {
1242 }
1243 
/*
 * NOTE(review): both the declarator (listing line 1245) and the body
 * (listing line 1247) are missing from this listing.  By position, between
 * rb_external_str_new and rb_locale_str_new, this is presumably the
 * NUL-terminated-string variant rb_external_str_new_cstr -- confirm against
 * upstream.
 */
1244 VALUE
1246 {
1248 }
1249 
/*
 * NOTE(review): the body statement (listing line 1253) is missing from this
 * listing.  Presumably it builds the string from ptr/len using the locale
 * encoding -- confirm against upstream.
 */
1250 VALUE
1251 rb_locale_str_new(const char *ptr, long len)
1252 {
1254 }
1255 
/*
 * NOTE(review): both the declarator (listing line 1257) and the body
 * (listing line 1259) are missing from this listing.  By position this is
 * presumably rb_locale_str_new_cstr -- confirm against upstream.
 */
1256 VALUE
1258 {
1260 }
1261 
/*
 * NOTE(review): the body statement (listing line 1265) is missing from this
 * listing.  Presumably it builds the string from ptr/len using the
 * filesystem encoding -- confirm against upstream.
 */
1262 VALUE
1263 rb_filesystem_str_new(const char *ptr, long len)
1264 {
1266 }
1267 
/*
 * NOTE(review): both the declarator (listing line 1269) and the body
 * (listing line 1271) are missing from this listing.  By position this is
 * presumably rb_filesystem_str_new_cstr -- confirm against upstream.
 */
1268 VALUE
1270 {
1272 }
1273 
/*
 * NOTE(review): both the declarator (listing line 1275) and the body
 * (listing line 1277) are missing from this listing.  By position this is
 * presumably rb_str_export -- confirm against upstream.
 */
1274 VALUE
1276 {
1278 }
1279 
/*
 * NOTE(review): both the declarator (listing line 1281) and the body
 * (listing line 1283) are missing from this listing.  By position this is
 * presumably rb_str_export_locale -- confirm against upstream.
 */
1280 VALUE
1282 {
1284 }
1285 
/*
 * Converts str from its current encoding (STR_ENC_GET) to enc through
 * rb_str_conv_enc().
 *
 * NOTE(review): the declarator line (listing line 1287) is missing; the body
 * references str and enc, so this is presumably
 * rb_str_export_to_enc(VALUE str, rb_encoding *enc) -- confirm.
 */
1286 VALUE
1288 {
1289  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1290 }
1291 
/*
 * Makes str2 hold the same bytes as str without touching str2's encoding
 * or coderange.  Returns str2.
 *
 * If the bytes plus terminator fit into str2's embedded area they are
 * copied.  Otherwise str2 is pointed at the buffer of a frozen root string
 * and marked as sharing it.
 */
1292 static VALUE
1293 str_replace_shared_without_enc(VALUE str2, VALUE str)
1294 {
1295  const int termlen = TERM_LEN(str);
1296  char *ptr;
1297  long len;
1298 
1299  RSTRING_GETMEM(str, ptr, len);
1300  if (str_embed_capa(str2) >= len + termlen) {
/* Embed path: copy the bytes and write the terminator. */
1301  char *ptr2 = RSTRING(str2)->as.embed.ary;
1302  STR_SET_EMBED(str2);
1303  memcpy(ptr2, RSTRING_PTR(str), len);
1304  STR_SET_EMBED_LEN(str2, len);
1305  TERM_FILL(ptr2+len, termlen);
1306  }
1307  else {
1308  VALUE root;
/* Pick the root to share with: str's existing root when str is already
 * shared, otherwise a frozen string obtained from rb_str_new_frozen(str).
 * ptr/len are re-read so they describe the bytes that will be shared. */
1309  if (STR_SHARED_P(str)) {
1310  root = RSTRING(str)->as.heap.aux.shared;
1311  RSTRING_GETMEM(str, ptr, len);
1312  }
1313  else {
1314  root = rb_str_new_frozen(str);
1315  RSTRING_GETMEM(root, ptr, len);
1316  }
1317  assert(OBJ_FROZEN(root));
/* Free str2's own heap buffer when it has one (not embedded, not shared,
 * not NOFREE), unless it is the very buffer about to be installed. */
1318  if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1319  if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1320  rb_fatal("about to free a possible shared root");
1321  }
1322  char *ptr2 = STR_HEAP_PTR(str2);
1323  if (ptr2 != ptr) {
1324  ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1325  }
1326  }
1327  FL_SET(str2, STR_NOEMBED);
1328  RSTRING(str2)->as.heap.len = len;
1329  RSTRING(str2)->as.heap.ptr = ptr;
1330  STR_SET_SHARED(str2, root);
1331  }
1332  return str2;
1333 }
1334 
1335 static VALUE
1336 str_replace_shared(VALUE str2, VALUE str)
1337 {
1338  str_replace_shared_without_enc(str2, str);
1339  rb_enc_cr_str_exact_copy(str2, str);
1340  return str2;
1341 }
1342 
1343 static VALUE
1344 str_new_shared(VALUE klass, VALUE str)
1345 {
1346  return str_replace_shared(str_alloc_heap(klass), str);
1347 }
1348 
/*
 * Returns a new string of str's class built by str_new_shared().
 *
 * NOTE(review): the declarator line (listing line 1350) is missing from this
 * listing; the body only references str, so this is presumably
 * rb_str_new_shared(VALUE str) -- confirm against upstream.
 */
1349 VALUE
1351 {
1352  return str_new_shared(rb_obj_class(str), str);
1353 }
1354 
/*
 * Returns orig itself when it is already frozen, otherwise a frozen string
 * of orig's class produced by str_new_frozen().
 *
 * NOTE(review): the declarator line (listing line 1356) is missing from this
 * listing; the body only references orig, so this is presumably
 * rb_str_new_frozen(VALUE orig) -- confirm against upstream.
 */
1355 VALUE
1357 {
1358  if (OBJ_FROZEN(orig)) return orig;
1359  return str_new_frozen(rb_obj_class(orig), orig);
1360 }
1361 
1362 static VALUE
1363 rb_str_new_frozen_String(VALUE orig)
1364 {
1365  if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1366  return str_new_frozen(rb_cString, orig);
1367 }
1368 
1369 VALUE
1370 rb_str_tmp_frozen_acquire(VALUE orig)
1371 {
1372  if (OBJ_FROZEN_RAW(orig)) return orig;
1373  return str_new_frozen_buffer(0, orig, FALSE);
1374 }
1375 
/*
 * Releases tmp, a string previously obtained for orig (see
 * rb_str_tmp_frozen_acquire).
 *
 * Does nothing unless tmp is class-less.  When tmp is a heap string that
 * orig currently shares and tmp has not been marked STR_BORROWED, orig
 * takes the buffer back (it is unshared and inherits tmp's capacity and
 * STR_NOFREE flag) and tmp is reset to an empty embedded string.
 */
1376 void
1377 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1378 {
/* A tmp that has a class is left untouched. */
1379  if (RBASIC_CLASS(tmp) != 0)
1380  return;
1381 
1382  if (STR_EMBED_P(tmp)) {
1383  assert(OBJ_FROZEN_RAW(tmp));
1384  }
/* Only undo the sharing when orig is still shared and is neither
 * temporarily locked nor frozen. */
1385  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1386  !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1387  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1388 
1389  if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1390  assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1391  assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1392 
1393  /* Unshare orig since the root (tmp) only has this one child. */
1394  FL_UNSET_RAW(orig, STR_SHARED);
1395  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1396  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1397  assert(OBJ_FROZEN_RAW(tmp));
1398 
1399  /* Make tmp embedded and empty so it is safe for sweeping. */
1400  STR_SET_EMBED(tmp);
1401  STR_SET_EMBED_LEN(tmp, 0);
1402  }
1403  }
1404 }
1405 
1406 static VALUE
1407 str_new_frozen(VALUE klass, VALUE orig)
1408 {
1409  return str_new_frozen_buffer(klass, orig, TRUE);
1410 }
1411 
/*
 * Turns orig (which must be a heap string that is not already shared) into
 * a sharer of a newly allocated string of class klass.
 *
 * The new string takes over orig's pointer, length and capacity, and the
 * STR_NOFREE flag is moved from orig to it.  orig is then marked as sharing
 * the new string.  Returns the new string.
 */
1412 static VALUE
1413 heap_str_make_shared(VALUE klass, VALUE orig)
1414 {
1415  assert(!STR_EMBED_P(orig));
1416  assert(!STR_SHARED_P(orig));
1417 
1418  VALUE str = str_alloc_heap(klass);
1419  STR_SET_NOEMBED(str);
1420  RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1421  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1422  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
/* Move (not copy) the NOFREE marker to the new owner of the buffer. */
1423  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1424  RBASIC(orig)->flags &= ~STR_NOFREE;
1425  STR_SET_SHARED(orig, str);
/* A class-less root starts out without the STR_BORROWED mark. */
1426  if (klass == 0)
1427  FL_UNSET_RAW(str, STR_BORROWED);
1428  return str;
1429 }
1430 
/*
 * Returns a frozen string of class klass holding orig's current bytes.
 *
 *   klass         - class of the result; callers in this file pass 0 for a
 *                   class-less temporary
 *   orig          - source string
 *   copy_encoding - when non-zero, orig's encoding and coderange are copied
 *                   onto the result
 *
 * Embedded or embeddable contents are copied.  Otherwise the result shares
 * a buffer with orig: either orig's existing shared root (possibly returned
 * as-is) or a new root created by heap_str_make_shared().
 */
1431 static VALUE
1432 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1433 {
1434  VALUE str;
1435 
1436  long len = RSTRING_LEN(orig);
1437 
1438  if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, 1)) {
1439  str = str_new(klass, RSTRING_PTR(orig), len);
1440  assert(STR_EMBED_P(str));
1441  }
1442  else {
1443  if (FL_TEST_RAW(orig, STR_SHARED)) {
1444  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs/rest: how many bytes of the root lie before/after orig's window. */
1445  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1446  long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1447  assert(ofs >= 0);
1448  assert(rest >= 0);
1449  assert(ofs + rest <= RSTRING_LEN(shared));
1450 #if !USE_RVARGC
1451  assert(!STR_EMBED_P(shared));
1452 #endif
1453  assert(OBJ_FROZEN(shared));
1454 
/* The root can only be returned directly when orig covers it entirely and
 * class and encoding index match; otherwise make a new sharer of the root
 * and narrow it to orig's window. */
1455  if ((ofs > 0) || (rest > 0) ||
1456  (klass != RBASIC(shared)->klass) ||
1457  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1458  str = str_new_shared(klass, shared);
1459  assert(!STR_EMBED_P(str));
1460  RSTRING(str)->as.heap.ptr += ofs;
1461  RSTRING(str)->as.heap.len -= ofs + rest;
1462  }
1463  else {
/* A class-less root handed out again is marked STR_BORROWED;
 * rb_str_tmp_frozen_release() checks this flag before dismantling it. */
1464  if (RBASIC_CLASS(shared) == 0)
1465  FL_SET_RAW(shared, STR_BORROWED);
1466  return shared;
1467  }
1468  }
1469  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1470  str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1471  STR_SET_EMBED(str);
1472  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1473  STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1474  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1475  }
1476  else {
1477  str = heap_str_make_shared(klass, orig);
1478  }
1479  }
1480 
1481  if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1482  OBJ_FREEZE(str);
1483  return str;
1484 }
1485 
1486 VALUE
1487 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1488 {
1489  return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1490 }
1491 
1492 static VALUE
1493 str_new_empty_String(VALUE str)
1494 {
1495  VALUE v = rb_str_new(0, 0);
1496  rb_enc_copy(v, str);
1497  return v;
1498 }
1499 
1500 #define STR_BUF_MIN_SIZE 63
1501 #if !USE_RVARGC
1502 STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1503 #endif
1504 
/*
 * Returns a new empty String with room for capa bytes plus a one-byte
 * terminator.  An embedded string is used when that fits; otherwise a heap
 * buffer of capa + 1 bytes is allocated and NUL-terminated at offset 0.
 *
 * NOTE(review): the declarator line (listing line 1506) is missing from this
 * listing; the body only references capa, so this is presumably
 * rb_str_buf_new(long capa) -- confirm against upstream.
 */
1505 VALUE
1507 {
1508  if (STR_EMBEDDABLE_P(capa, 1)) {
1509  return str_alloc_embed(rb_cString, capa + 1);
1510  }
1511 
1512  VALUE str = str_alloc_heap(rb_cString);
1513 
/* Without RVARGC, heap buffers are never smaller than STR_BUF_MIN_SIZE. */
1514 #if !USE_RVARGC
1515  if (capa < STR_BUF_MIN_SIZE) {
1516  capa = STR_BUF_MIN_SIZE;
1517  }
1518 #endif
1519  FL_SET(str, STR_NOEMBED);
1520  RSTRING(str)->as.heap.aux.capa = capa;
1521  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1522  RSTRING(str)->as.heap.ptr[0] = '\0';
1523 
1524  return str;
1525 }
1526 
/*
 * Returns a new String buffer initialised with the NUL-terminated C string
 * ptr: the buffer is created with capacity strlen(ptr) and the bytes are
 * appended with rb_str_buf_cat().  ptr must not be NULL (it is passed to
 * strlen).
 *
 * NOTE(review): the declarator line (listing line 1528) is missing from this
 * listing; the body only references ptr, so this is presumably
 * rb_str_buf_new_cstr(const char *ptr) -- confirm against upstream.
 */
1527 VALUE
1529 {
1530  VALUE str;
1531  long len = strlen(ptr);
1532 
1533  str = rb_str_buf_new(len);
1534  rb_str_buf_cat(str, ptr, len);
1535 
1536  return str;
1537 }
1538 
/*
 * Returns a class-less (klass 0) string of length len created by str_new()
 * with a NULL source pointer.
 *
 * NOTE(review): the declarator line (listing line 1540) is missing from this
 * listing; the body only references len, so this is presumably
 * rb_str_tmp_new(long len) -- confirm against upstream.
 */
1539 VALUE
1541 {
1542  return str_new(0, 0, len);
1543 }
1544 
1545 void
1547 {
1548  if (FL_TEST(str, RSTRING_FSTR)) {
1549  st_data_t fstr = (st_data_t)str;
1550 
1551  RB_VM_LOCK_ENTER();
1552  {
1553  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1554  RB_DEBUG_COUNTER_INC(obj_str_fstr);
1555  }
1556  RB_VM_LOCK_LEAVE();
1557  }
1558 
1559  if (STR_EMBED_P(str)) {
1560  RB_DEBUG_COUNTER_INC(obj_str_embed);
1561  }
1562  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1563  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1564  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1565  }
1566  else {
1567  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1568  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1569  }
1570 }
1571 
1572 RUBY_FUNC_EXPORTED size_t
1573 rb_str_memsize(VALUE str)
1574 {
1575  if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1576  return STR_HEAP_SIZE(str);
1577  }
1578  else {
1579  return 0;
1580  }
1581 }
1582 
/*
 * Converts the value referred to as `str` to a String by calling
 * rb_convert_type_with_id() with the to_str method id.
 *
 * NOTE(review): the declarator line (listing line 1584) is missing from this
 * listing, so the name and parameter list cannot be read here (presumably
 * rb_string_value or a close relative) -- confirm against upstream.
 */
1583 VALUE
1585 {
1586  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1587 }
1588 
1589 static inline void str_discard(VALUE str);
1590 static void str_shared_replace(VALUE str, VALUE str2);
1591 
/*
 * Moves str2's contents into str via str_shared_replace(), doing nothing
 * when both arguments are the same object.
 *
 * NOTE(review): the declarator line (listing line 1593) is missing from this
 * listing; the body references str and str2, so this is presumably
 * rb_str_shared_replace(VALUE str, VALUE str2) -- confirm against upstream.
 */
1592 void
1594 {
1595  if (str != str2) str_shared_replace(str, str2);
1596 }
1597 
/*
 * Replaces str's contents with those of str2 (str2 != str), giving str the
 * encoding and coderange that str2 had on entry.
 *
 * If str2's bytes plus terminator fit into str's embedded area they are
 * copied and str2 is left unchanged.  Otherwise str takes over str2's
 * buffer (or its shared root) and str2 is reset to an empty embedded
 * string.
 */
1598 static void
1599 str_shared_replace(VALUE str, VALUE str2)
1600 {
1601  rb_encoding *enc;
1602  int cr;
1603  int termlen;
1604 
1605  RUBY_ASSERT(str2 != str);
/* Capture str2's encoding and coderange before anything is modified. */
1606  enc = STR_ENC_GET(str2);
1607  cr = ENC_CODERANGE(str2);
1608  str_discard(str);
1609  termlen = rb_enc_mbminlen(enc);
1610 
1611  if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1612  STR_SET_EMBED(str);
1613  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1614  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1615  rb_enc_associate(str, enc);
1616  ENC_CODERANGE_SET(str, cr);
1617  }
1618  else {
1619 #if USE_RVARGC
/* An embedded str2 has no heap buffer to hand over, so its bytes are first
 * moved into a freshly allocated buffer and str2 is switched to the heap
 * layout. */
1620  if (STR_EMBED_P(str2)) {
1621  assert(!FL_TEST(str2, STR_SHARED));
1622  long len = RSTRING(str2)->as.embed.len;
1623  assert(len + termlen <= str_embed_capa(str2));
1624 
1625  char *new_ptr = ALLOC_N(char, len + termlen);
1626  memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1627  RSTRING(str2)->as.heap.ptr = new_ptr;
1628  RSTRING(str2)->as.heap.len = len;
1629  RSTRING(str2)->as.heap.aux.capa = len;
1630  STR_SET_NOEMBED(str2);
1631  }
1632 #endif
1633 
1634  STR_SET_NOEMBED(str);
1635  FL_UNSET(str, STR_SHARED);
1636  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1637  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1638 
/* aux is either the shared root or the capacity, depending on STR_SHARED. */
1639  if (FL_TEST(str2, STR_SHARED)) {
1640  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1641  STR_SET_SHARED(str, shared);
1642  }
1643  else {
1644  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1645  }
1646 
1647  /* abandon str2 */
1648  STR_SET_EMBED(str2);
1649  RSTRING_PTR(str2)[0] = 0;
1650  STR_SET_EMBED_LEN(str2, 0);
1651  rb_enc_associate(str, enc);
1652  ENC_CODERANGE_SET(str, cr);
1653  }
1654 }
1655 
/*
 * Returns obj as a String: obj itself when it already is a T_STRING,
 * otherwise the result of calling its to_s method, passed through
 * rb_obj_as_string_result().
 *
 * NOTE(review): the declarator line (listing line 1657) is missing from this
 * listing; the body only references obj, so this is presumably
 * rb_obj_as_string(VALUE obj) -- confirm against upstream.
 */
1656 VALUE
1658 {
1659  VALUE str;
1660 
1661  if (RB_TYPE_P(obj, T_STRING)) {
1662  return obj;
1663  }
1664  str = rb_funcall(obj, idTo_s, 0);
1665  return rb_obj_as_string_result(str, obj);
1666 }
1667 
1668 MJIT_FUNC_EXPORTED VALUE
1669 rb_obj_as_string_result(VALUE str, VALUE obj)
1670 {
1671  if (!RB_TYPE_P(str, T_STRING))
1672  return rb_any_to_s(obj);
1673  return str;
1674 }
1675 
1676 static VALUE
1677 str_replace(VALUE str, VALUE str2)
1678 {
1679  long len;
1680 
1681  len = RSTRING_LEN(str2);
1682  if (STR_SHARED_P(str2)) {
1683  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1684  assert(OBJ_FROZEN(shared));
1685  STR_SET_NOEMBED(str);
1686  RSTRING(str)->as.heap.len = len;
1687  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1688  STR_SET_SHARED(str, shared);
1689  rb_enc_cr_str_exact_copy(str, str2);
1690  }
1691  else {
1692  str_replace_shared(str, str2);
1693  }
1694 
1695  return str;
1696 }
1697 
/*
 * Allocates a new RString object of class klass and slot size `size`
 * (which must be non-zero) using the given execution context, and returns
 * it as a VALUE.
 *
 * NOTE(review): listing line 1703, carrying the remaining arguments of the
 * RB_RVARGC_EC_NEWOBJ_OF() invocation and its closing parenthesis, is
 * missing from this listing -- restore from upstream.
 */
1698 static inline VALUE
1699 ec_str_alloc(struct rb_execution_context_struct *ec, VALUE klass, size_t size)
1700 {
1701  assert(size > 0);
1702  RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1704  return (VALUE)str;
1705 }
1706 
1707 static inline VALUE
1708 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1709 {
1710  size_t size = str_embed_size(capa);
1711  assert(rb_gc_size_allocatable_p(size));
1712 #if !USE_RVARGC
1713  assert(size <= sizeof(struct RString));
1714 #endif
1715  return ec_str_alloc(ec, klass, size);
1716 }
1717 
1718 static inline VALUE
1719 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1720 {
1721  return ec_str_alloc(ec, klass, sizeof(struct RString));
1722 }
1723 
/*
 * Fills the freshly allocated object dup so that it duplicates str, and
 * returns dup.  The copy is never frozen (FL_FREEZE is masked out when the
 * flags are applied).
 *
 * Embedded contents are copied byte-for-byte.  For heap strings dup shares
 * a frozen root: str's existing root, str itself when it is frozen, or a
 * frozen copy made with str_new_frozen(klass, str).
 *
 * NOTE(review): listing line 1731, one line of the flag_mask expression,
 * is missing from this listing.  Since the code below tests ENCODING_MASK
 * bits in `flags`, the missing line presumably adds the encoding-related
 * masks -- restore from upstream.
 */
1724 static inline VALUE
1725 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1726 {
1727  const VALUE flag_mask =
1728 #if !USE_RVARGC
1729  RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1730 #endif
1732  FL_FREEZE
1733  ;
1734  VALUE flags = FL_TEST_RAW(str, flag_mask);
1735  int encidx = 0;
1736  if (STR_EMBED_P(str)) {
1737  long len = RSTRING_EMBED_LEN(str);
1738 
1739  assert(str_embed_capa(dup) >= len + 1);
1740  STR_SET_EMBED_LEN(dup, len);
1741  MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1742  }
1743  else {
1744  VALUE root = str;
1745  if (FL_TEST_RAW(str, STR_SHARED)) {
1746  root = RSTRING(str)->as.heap.aux.shared;
1747  }
/* An unfrozen, unshared heap string cannot serve as a root: replace str by
 * a frozen copy and re-read the flags from it. */
1748  else if (UNLIKELY(!(flags & FL_FREEZE))) {
1749  root = str = str_new_frozen(klass, str);
1750  flags = FL_TEST_RAW(str, flag_mask);
1751  }
1752  assert(!STR_SHARED_P(root));
1753  assert(RB_OBJ_FROZEN_RAW(root));
1754 #if USE_RVARGC
1755  if (1) {
1756 #else
1757  if (STR_EMBED_P(root)) {
1758  MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1759  char, RSTRING_EMBED_LEN_MAX + 1);
1760  }
1761  else {
1762 #endif
1763  RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1764  RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1765  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1766  flags |= RSTRING_NOEMBED | STR_SHARED;
1767  }
1768  }
1769 
/* When the inline encoding field holds ENCODING_INLINE_MAX, the index is
 * fetched with rb_enc_get_index() and re-associated after the flags are
 * set, instead of being copied through the flag bits. */
1770  if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1771  encidx = rb_enc_get_index(str);
1772  flags &= ~ENCODING_MASK;
1773  }
1774  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1775  if (encidx) rb_enc_associate_index(dup, encidx);
1776  return dup;
1777 }
1778 
1779 static inline VALUE
1780 ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1781 {
1782  VALUE dup;
1783  if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1784  dup = ec_str_alloc_heap(ec, klass);
1785  }
1786  else {
1787  dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1788  }
1789 
1790  return str_duplicate_setup(klass, str, dup);
1791 }
1792 
1793 static inline VALUE
1794 str_duplicate(VALUE klass, VALUE str)
1795 {
1796  VALUE dup;
1797  if (!USE_RVARGC || FL_TEST(str, STR_NOEMBED)) {
1798  dup = str_alloc_heap(klass);
1799  }
1800  else {
1801  dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1802  }
1803 
1804  return str_duplicate_setup(klass, str, dup);
1805 }
1806 
/*
 * Returns a duplicate of str that has the same class as str.
 *
 * NOTE(review): the declarator line (listing line 1808) is missing from this
 * listing; the body only references str, so this is presumably
 * rb_str_dup(VALUE str) -- confirm against upstream.
 */
1807 VALUE
1809 {
1810  return str_duplicate(rb_obj_class(str), str);
1811 }
1812 
/*
 * Fires the string-creation DTrace hook with str's length and returns a
 * duplicate of str whose class is String.
 *
 * NOTE(review): the declarator line (listing line 1814) is missing from this
 * listing; by analogy with rb_ec_str_resurrect below this is presumably
 * rb_str_resurrect(VALUE str) -- confirm against upstream.
 */
1813 VALUE
1815 {
1816  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1817  return str_duplicate(rb_cString, str);
1818 }
1819 
1820 VALUE
1821 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1822 {
1823  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1824  return ec_str_duplicate(ec, rb_cString, str);
1825 }
1826 
1827 /*
1828  * call-seq:
1829  * String.new(string = '') -> new_string
1830  * String.new(string = '', encoding: encoding) -> new_string
1831  * String.new(string = '', capacity: size) -> new_string
1832  *
1833  * Returns a new \String that is a copy of +string+.
1834  *
1835  * With no arguments, returns the empty string with the Encoding <tt>ASCII-8BIT</tt>:
1836  * s = String.new
1837  * s # => ""
1838  * s.encoding # => #<Encoding:ASCII-8BIT>
1839  *
1840  * With the single \String argument +string+, returns a copy of +string+
1841  * with the same encoding as +string+:
1842  * s = String.new("Que veut dire \u{e7}a?")
1843  * s # => "Que veut dire \u{e7}a?"
1844  * s.encoding # => #<Encoding:UTF-8>
1845  *
1846  * Literal strings like <tt>""</tt> or here-documents always use
1847  * {script encoding}[Encoding.html#class-Encoding-label-Script+encoding], unlike String.new.
1848  *
1849  * With keyword +encoding+, returns a copy of +string+
1850  * with the specified encoding:
1851  * s = String.new(encoding: 'ASCII')
1852  * s.encoding # => #<Encoding:US-ASCII>
1853  * s = String.new('foo', encoding: 'ASCII')
1854  * s.encoding # => #<Encoding:US-ASCII>
1855  *
1856  * Note that these are equivalent:
1857  * s0 = String.new('foo', encoding: 'ASCII')
1858  * s1 = 'foo'.force_encoding('ASCII')
1859  * s0.encoding == s1.encoding # => true
1860  *
1861  * With keyword +capacity+, returns a copy of +string+;
1862  * the given +capacity+ may set the size of the internal buffer,
1863  * which may affect performance:
1864  * String.new(capacity: 1) # => ""
1865  * String.new(capacity: 4096) # => ""
1866  *
1867  * The +string+, +encoding+, and +capacity+ arguments may all be used together:
1868  *
1869  * String.new('hello', encoding: 'UTF-8', capacity: 25)
1870  *
1871  */
1872 
/*
 * Implementation of String#initialize.
 *
 *   argc/argv - at most one positional argument (the source string) plus an
 *               optional keyword hash with :encoding and :capacity
 *   str       - the receiver being initialised
 *
 * Returns str.  Raises (via str_modifiable) when a capacity is given and
 * str is frozen or temporarily locked.
 */
1873 static VALUE
1874 rb_str_init(int argc, VALUE *argv, VALUE str)
1875 {
1876  static ID keyword_ids[2];
1877  VALUE orig, opt, venc, vcapa;
1878  VALUE kwargs[2];
1879  rb_encoding *enc = 0;
1880  int n;
1881 
/* Intern the keyword ids once, on first use. */
1882  if (!keyword_ids[0]) {
1883  keyword_ids[0] = rb_id_encoding();
1884  CONST_ID(keyword_ids[1], "capacity");
1885  }
1886 
1887  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1888  if (!NIL_P(opt)) {
1889  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1890  venc = kwargs[0];
1891  vcapa = kwargs[1];
1892  if (venc != Qundef && !NIL_P(venc)) {
1893  enc = rb_to_encoding(venc);
1894  }
1895  if (vcapa != Qundef && !NIL_P(vcapa)) {
1896  long capa = NUM2LONG(vcapa);
1897  long len = 0;
1898  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1899 
/* The effective capacity is at least STR_BUF_MIN_SIZE and at least the
 * length of the source string. */
1900  if (capa < STR_BUF_MIN_SIZE) {
1901  capa = STR_BUF_MIN_SIZE;
1902  }
1903  if (n == 1) {
1904  StringValue(orig);
1905  len = RSTRING_LEN(orig);
1906  if (capa < len) {
1907  capa = len;
1908  }
/* Initialising a string from itself: skip the copy step below. */
1909  if (orig == str) n = 0;
1910  }
1911  str_modifiable(str);
/* Give str a private heap buffer of capa + termlen bytes, whichever layout
 * it currently has (embedded, shared/NOFREE, or an owned heap buffer of a
 * different size). */
1912  if (STR_EMBED_P(str)) { /* make noembed always */
1913  char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1914 #if USE_RVARGC
1915  assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1916  memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1917 #else
1918  memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1919 #endif
1920  RSTRING(str)->as.heap.ptr = new_ptr;
1921  }
1922  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1923  const size_t size = (size_t)capa + termlen;
1924  const char *const old_ptr = RSTRING_PTR(str);
1925  const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1926  char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
/* Copy no more than fits into the new buffer. */
1927  memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1928  FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1929  RSTRING(str)->as.heap.ptr = new_ptr;
1930  }
1931  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1932  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1933  (size_t)capa + termlen, STR_HEAP_SIZE(str));
1934  }
1935  RSTRING(str)->as.heap.len = len;
1936  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1937  if (n == 1) {
1938  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1939  rb_enc_cr_str_exact_copy(str, orig);
1940  }
1941  FL_SET(str, STR_NOEMBED);
1942  RSTRING(str)->as.heap.aux.capa = capa;
1943  }
1944  else if (n == 1) {
1945  rb_str_replace(str, orig);
1946  }
/* An explicit encoding: keyword is applied last and resets the cached
 * coderange. */
1947  if (enc) {
1948  rb_enc_associate(str, enc);
1949  ENC_CODERANGE_CLEAR(str);
1950  }
1951  }
1952  else if (n == 1) {
1953  rb_str_replace(str, orig);
1954  }
1955  return str;
1956 }
1957 
1958 #ifdef NONASCII_MASK
1959 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1960 
1961 /*
1962  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1963  * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1964  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1965  *
1966  * if (!(byte & 0x80))
1967  * byte |= 0x40; // turn on bit6
1968  * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1969  *
1970  * This function calculates whether a byte is leading or not for all bytes
1971  * in the argument word by concurrently using the above logic, and then
1972  * adds up the number of leading bytes in the word.
1973  */
/*
 * Returns the number of UTF-8 leading bytes in the machine word at *s.
 * s must point to a readable uintptr_t; the caller in this file
 * (enc_strlen) only passes word-aligned pointers.
 */
1974 static inline uintptr_t
1975 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1976 {
1977  uintptr_t d = *s;
1978 
1979  /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1980  d = (d>>6) | (~d>>7);
1981  d &= NONASCII_MASK >> 7;
1982 
1983  /* Gather all bytes. */
1984 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1985  /* use only if it can use POPCNT */
1986  return rb_popcount_intptr(d);
1987 #else
/* Fold the per-byte 0/1 flags into the low byte by repeated shifted adds;
 * the total is at most the word size in bytes, so it fits in four bits. */
1988  d += (d>>8);
1989  d += (d>>16);
1990 # if SIZEOF_VOIDP == 8
1991  d += (d>>32);
1992 # endif
1993  return (d&0xF);
1994 #endif
1995 }
1996 #endif
1997 
/*
 * Returns the number of characters in the byte range [p, e) interpreted in
 * encoding enc.
 *
 *   cr - coderange already known for the bytes; it selects the faster
 *        counting paths below (ENC_CODERANGE_UNKNOWN is acceptable)
 *
 * Strategy, in order:
 *   1. fixed-width encodings: byte length divided by the character width,
 *      rounded up;
 *   2. (when NONASCII_MASK is defined) valid UTF-8: count leading bytes,
 *      a machine word at a time over the aligned middle part;
 *   3. other ASCII-compatible encodings: skip ASCII runs in bulk and step
 *      over multibyte characters one at a time;
 *   4. everything else: step character by character with rb_enc_mbclen().
 */
1998 static inline long
1999 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2000 {
2001  long c;
2002  const char *q;
2003 
2004  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2005  long diff = (long)(e - p);
/* A trailing partial character counts as one character. */
2006  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2007  }
2008 #ifdef NONASCII_MASK
2009  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2010  uintptr_t len = 0;
/* Word-at-a-time counting is only used for ranges longer than two words. */
2011  if ((int)sizeof(uintptr_t) * 2 < e - p) {
2012  const uintptr_t *s, *t;
2013  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2014  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2015  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2016  while (p < (const char *)s) {
2017  if (is_utf8_lead_byte(*p)) len++;
2018  p++;
2019  }
2020  while (s < t) {
2021  len += count_utf8_lead_bytes_with_word(s);
2022  s++;
2023  }
2024  p = (const char *)s;
2025  }
/* Remaining tail bytes (or the whole range when it was short). */
2026  while (p < e) {
2027  if (is_utf8_lead_byte(*p)) len++;
2028  p++;
2029  }
2030  return (long)len;
2031  }
2032 #endif
2033  else if (rb_enc_asciicompat(enc)) {
2034  c = 0;
/* The two loops differ only in the length function: the fast variant is
 * used when the coderange is known to be clean. */
2035  if (ENC_CODERANGE_CLEAN_P(cr)) {
2036  while (p < e) {
2037  if (ISASCII(*p)) {
2038  q = search_nonascii(p, e);
2039  if (!q)
2040  return c + (e - p);
2041  c += q - p;
2042  p = q;
2043  }
2044  p += rb_enc_fast_mbclen(p, e, enc);
2045  c++;
2046  }
2047  }
2048  else {
2049  while (p < e) {
2050  if (ISASCII(*p)) {
2051  q = search_nonascii(p, e);
2052  if (!q)
2053  return c + (e - p);
2054  c += q - p;
2055  p = q;
2056  }
2057  p += rb_enc_mbclen(p, e, enc);
2058  c++;
2059  }
2060  }
2061  return c;
2062  }
2063 
2064  for (c=0; p<e; c++) {
2065  p += rb_enc_mbclen(p, e, enc);
2066  }
2067  return c;
2068 }
2069 
2070 long
2071 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2072 {
2073  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2074 }
2075 
2076 /* Counts the characters in [p, e) and stores the detected coderange in *cr.
2077  * Note that the value of *cr on entry is not used.
2078  */
/*
 * Returns the number of characters in the byte range [p, e) interpreted in
 * encoding enc, and reports the coderange found while scanning.
 *
 *   cr - output only; it is reset to 0 on entry.  For variable-width
 *        encodings it ends up as ENC_CODERANGE_7BIT when no multibyte
 *        character was seen; otherwise it is set from ENC_CODERANGE_VALID
 *        and ENC_CODERANGE_BROKEN as characters are scanned.  For
 *        fixed-width encodings it is left at 0.
 *
 * An invalid sequence counts as one character.
 */
2079 long
2080 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2081 {
2082  long c;
2083  const char *q;
2084  int ret;
2085 
2086  *cr = 0;
2087  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2088  long diff = (long)(e - p);
/* A trailing partial character counts as one character. */
2089  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2090  }
2091  else if (rb_enc_asciicompat(enc)) {
2092  c = 0;
2093  while (p < e) {
/* Skip a whole run of ASCII bytes at once. */
2094  if (ISASCII(*p)) {
2095  q = search_nonascii(p, e);
2096  if (!q) {
2097  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2098  return c + (e - p);
2099  }
2100  c += q - p;
2101  p = q;
2102  }
2103  ret = rb_enc_precise_mbclen(p, e, enc);
2104  if (MBCLEN_CHARFOUND_P(ret)) {
2105  *cr |= ENC_CODERANGE_VALID;
2106  p += MBCLEN_CHARFOUND_LEN(ret);
2107  }
2108  else {
/* Invalid byte: mark as broken and resynchronise one byte further on. */
2109  *cr = ENC_CODERANGE_BROKEN;
2110  p++;
2111  }
2112  c++;
2113  }
2114  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2115  return c;
2116  }
2117 
2118  for (c=0; p<e; c++) {
2119  ret = rb_enc_precise_mbclen(p, e, enc);
2120  if (MBCLEN_CHARFOUND_P(ret)) {
2121  *cr |= ENC_CODERANGE_VALID;
2122  p += MBCLEN_CHARFOUND_LEN(ret);
2123  }
2124  else {
/* Invalid sequence: skip the encoding's minimum character width, but
 * never past e. */
2125  *cr = ENC_CODERANGE_BROKEN;
2126  if (p + rb_enc_mbminlen(enc) <= e)
2127  p += rb_enc_mbminlen(enc);
2128  else
2129  p = e;
2130  }
2131  }
2132  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2133  return c;
2134 }
2135 
2136 /* enc must be str's enc or rb_enc_check(str, str2) */
2137 static long
2138 str_strlen(VALUE str, rb_encoding *enc)
2139 {
2140  const char *p, *e;
2141  int cr;
2142 
2143  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2144  if (!enc) enc = STR_ENC_GET(str);
2145  p = RSTRING_PTR(str);
2146  e = RSTRING_END(str);
2147  cr = ENC_CODERANGE(str);
2148 
2149  if (cr == ENC_CODERANGE_UNKNOWN) {
2150  long n = rb_enc_strlen_cr(p, e, enc, &cr);
2151  if (cr) ENC_CODERANGE_SET(str, cr);
2152  return n;
2153  }
2154  else {
2155  return enc_strlen(p, e, enc, cr);
2156  }
2157 }
2158 
/*
 * Returns the character count of str in its own encoding (str_strlen with
 * a NULL encoding).
 *
 * NOTE(review): the declarator line (listing line 2160) is missing from this
 * listing; the body only references str, so this is presumably
 * rb_str_strlen(VALUE str) -- confirm against upstream.
 */
2159 long
2161 {
2162  return str_strlen(str, NULL);
2163 }
2164 
2165 /*
2166  * call-seq:
2167  * length -> integer
2168  *
2169  * Returns the count of characters (not bytes) in +self+:
2170  *
2171  * "\x80\u3042".length # => 2
2172  * "hello".length # => 5
2173  *
2174  * String#size is an alias for String#length.
2175  *
2176  * Related: String#bytesize.
2177  */
2178 
/*
 * Returns the character count of str as a Ruby Integer.
 *
 * NOTE(review): the declarator line (listing line 2180) is missing from this
 * listing; given the String#length documentation directly above, this is
 * presumably rb_str_length(VALUE str) -- confirm against upstream.
 */
2179 VALUE
2181 {
2182  return LONG2NUM(str_strlen(str, NULL));
2183 }
2184 
2185 /*
2186  * call-seq:
2187  * bytesize -> integer
2188  *
2189  * Returns the count of bytes in +self+:
2190  *
2191  * "\x80\u3042".bytesize # => 4
2192  * "hello".bytesize # => 5
2193  *
2194  * Related: String#length.
2195  */
2196 
2197 static VALUE
2198 rb_str_bytesize(VALUE str)
2199 {
2200  return LONG2NUM(RSTRING_LEN(str));
2201 }
2202 
2203 /*
2204  * call-seq:
2205  * empty? -> true or false
2206  *
2207  * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2208  *
2209  * "hello".empty? # => false
2210  * " ".empty? # => false
2211  * "".empty? # => true
2212  *
2213  */
2214 
2215 static VALUE
2216 rb_str_empty(VALUE str)
2217 {
2218  return RBOOL(RSTRING_LEN(str) == 0);
2219 }
2220 
2221 /*
2222  * call-seq:
2223  * string + other_string -> new_string
2224  *
2225  * Returns a new \String containing +other_string+ concatenated to +self+:
2226  *
2227  * "Hello from " + self.to_s # => "Hello from main"
2228  *
2229  */
2230 
/*
 * Returns a new String holding the bytes of str1 followed by the bytes of
 * str2.  str2 is converted with StringValue, and the two encodings must be
 * compatible (rb_enc_check_str).  Raises ArgumentError ("string size too
 * big") when the combined length would exceed LONG_MAX.
 *
 * NOTE(review): this listing is missing
 *   - the declarator (listing line 2232); the body references str1 and
 *     str2, so this is presumably rb_str_plus(VALUE str1, VALUE str2);
 *   - listing lines 2254-2255 between TERM_FILL and the GC guards, where
 *     the result's encoding/coderange are presumably set (enc is otherwise
 *     only used for termlen).
 * Restore both from upstream.
 */
2231 VALUE
2233 {
2234  VALUE str3;
2235  rb_encoding *enc;
2236  char *ptr1, *ptr2, *ptr3;
2237  long len1, len2;
2238  int termlen;
2239 
2240  StringValue(str2);
2241  enc = rb_enc_check_str(str1, str2);
2242  RSTRING_GETMEM(str1, ptr1, len1);
2243  RSTRING_GETMEM(str2, ptr2, len2);
2244  termlen = rb_enc_mbminlen(enc);
/* Written as a subtraction so the check itself cannot overflow. */
2245  if (len1 > LONG_MAX - len2) {
2246  rb_raise(rb_eArgError, "string size too big");
2247  }
2248  str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2249  ptr3 = RSTRING_PTR(str3);
2250  memcpy(ptr3, ptr1, len1);
2251  memcpy(ptr3+len1, ptr2, len2);
2252  TERM_FILL(&ptr3[len1+len2], termlen);
2253 
/* Keep both operands alive while their raw pointers are in use above. */
2256  RB_GC_GUARD(str1);
2257  RB_GC_GUARD(str2);
2258  return str3;
2259 }
2260 
2261 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2262 MJIT_FUNC_EXPORTED VALUE
2263 rb_str_opt_plus(VALUE str1, VALUE str2)
2264 {
2265  assert(RBASIC_CLASS(str1) == rb_cString);
2266  assert(RBASIC_CLASS(str2) == rb_cString);
2267  long len1, len2;
2268  MAYBE_UNUSED(char) *ptr1, *ptr2;
2269  RSTRING_GETMEM(str1, ptr1, len1);
2270  RSTRING_GETMEM(str2, ptr2, len2);
2271  int enc1 = rb_enc_get_index(str1);
2272  int enc2 = rb_enc_get_index(str2);
2273 
2274  if (enc1 < 0) {
2275  return Qundef;
2276  }
2277  else if (enc2 < 0) {
2278  return Qundef;
2279  }
2280  else if (enc1 != enc2) {
2281  return Qundef;
2282  }
2283  else if (len1 > LONG_MAX - len2) {
2284  return Qundef;
2285  }
2286  else {
2287  return rb_str_plus(str1, str2);
2288  }
2289 
2290 }
2291 
2292 /*
2293  * call-seq:
2294  * string * integer -> new_string
2295  *
2296  * Returns a new \String containing +integer+ copies of +self+:
2297  *
2298  * "Ho! " * 3 # => "Ho! Ho! Ho! "
2299  * "Ho! " * 0 # => ""
2300  *
2301  */
2302 
/*
 * Returns a new String made of `times` copies of str.  Raises
 * ArgumentError for a negative count ("negative argument") or when the
 * resulting byte length would overflow a long ("argument too big").
 *
 * NOTE(review): the declarator line (listing line 2304) is missing from this
 * listing; the body references str and times, so this is presumably
 * rb_str_times(VALUE str, VALUE times) -- confirm against upstream.
 */
2303 VALUE
2305 {
2306  VALUE str2;
2307  long n, len;
2308  char *ptr2;
2309  int termlen;
2310 
/* Fast paths for the fixnum counts 1 and 0. */
2311  if (times == INT2FIX(1)) {
2312  return str_duplicate(rb_cString, str);
2313  }
2314  if (times == INT2FIX(0)) {
2315  str2 = str_alloc_embed(rb_cString, 0);
2316  rb_enc_copy(str2, str);
2317  return str2;
2318  }
2319  len = NUM2LONG(times);
2320  if (len < 0) {
2321  rb_raise(rb_eArgError, "negative argument");
2322  }
/* Repeating a single NUL byte: the result is simply zero-filled memory. */
2323  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2324  if (STR_EMBEDDABLE_P(len, 1)) {
2325  str2 = str_alloc_embed(rb_cString, len + 1);
2326  memset(RSTRING_PTR(str2), 0, len + 1);
2327  }
2328  else {
2329  str2 = str_alloc_heap(rb_cString);
2330  RSTRING(str2)->as.heap.aux.capa = len;
2331  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2332  STR_SET_NOEMBED(str2);
2333  }
2334  STR_SET_LEN(str2, len);
2335  rb_enc_copy(str2, str);
2336  return str2;
2337  }
/* Overflow check for the multiplication below. */
2338  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2339  rb_raise(rb_eArgError, "argument too big");
2340  }
2341 
/* From here on len is the byte length of the result. */
2342  len *= RSTRING_LEN(str);
2343  termlen = TERM_LEN(str);
2344  str2 = str_new0(rb_cString, 0, len, termlen);
2345  ptr2 = RSTRING_PTR(str2);
2346  if (len) {
/* Copy str once, then keep doubling the filled prefix; the final memcpy
 * fills whatever remains. */
2347  n = RSTRING_LEN(str);
2348  memcpy(ptr2, RSTRING_PTR(str), n);
2349  while (n <= len/2) {
2350  memcpy(ptr2 + n, ptr2, n);
2351  n *= 2;
2352  }
2353  memcpy(ptr2 + n, ptr2, len-n);
2354  }
2355  STR_SET_LEN(str2, len);
2356  TERM_FILL(&ptr2[len], termlen);
2357  rb_enc_cr_str_copy_for_substr(str2, str);
2358 
2359  return str2;
2360 }
2361 
2362 /*
2363  * call-seq:
2364  * string % object -> new_string
2365  *
2366  * Returns the result of formatting +object+ into the format specification +self+
2367  * (see Kernel#sprintf for formatting details):
2368  *
2369  * "%05d" % 123 # => "00123"
2370  *
2371  * If +self+ contains multiple substitutions, +object+ must be
2372  * an \Array or \Hash containing the values to be substituted:
2373  *
2374  * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2375  * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2376  * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2377  *
2378  */
2379 
2380 static VALUE
2381 rb_str_format_m(VALUE str, VALUE arg)
2382 {
2383  VALUE tmp = rb_check_array_type(arg);
2384 
2385  if (!NIL_P(tmp)) {
2386  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2387  }
2388  return rb_str_format(1, &arg, str);
2389 }
2390 
2391 static inline void
2392 rb_check_lockedtmp(VALUE str)
2393 {
2394  if (FL_TEST(str, STR_TMPLOCK)) {
2395  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2396  }
2397 }
2398 
2399 static inline void
2400 str_modifiable(VALUE str)
2401 {
2402  rb_check_lockedtmp(str);
2403  rb_check_frozen(str);
2404 }
2405 
2406 static inline int
2407 str_dependent_p(VALUE str)
2408 {
2409  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2410  return 0;
2411  }
2412  else {
2413  return 1;
2414  }
2415 }
2416 
2417 static inline int
2418 str_independent(VALUE str)
2419 {
2420  str_modifiable(str);
2421  return !str_dependent_p(str);
2422 }
2423 
2424 static void
2425 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2426 {
2427  char *ptr;
2428  char *oldptr;
2429  long capa = len + expand;
2430 
2431  if (len > capa) len = capa;
2432 
2433  if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2434  ptr = RSTRING(str)->as.heap.ptr;
2435  STR_SET_EMBED(str);
2436  memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2437  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2438  STR_SET_EMBED_LEN(str, len);
2439  return;
2440  }
2441 
2442  ptr = ALLOC_N(char, (size_t)capa + termlen);
2443  oldptr = RSTRING_PTR(str);
2444  if (oldptr) {
2445  memcpy(ptr, oldptr, len);
2446  }
2447  if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2448  xfree(oldptr);
2449  }
2450  STR_SET_NOEMBED(str);
2451  FL_UNSET(str, STR_SHARED|STR_NOFREE);
2452  TERM_FILL(ptr + len, termlen);
2453  RSTRING(str)->as.heap.ptr = ptr;
2454  RSTRING(str)->as.heap.len = len;
2455  RSTRING(str)->as.heap.aux.capa = capa;
2456 }
2457 
2458 void
2460 {
2461  if (!str_independent(str))
2462  str_make_independent(str);
2463  ENC_CODERANGE_CLEAR(str);
2464 }
2465 
2466 void
2467 rb_str_modify_expand(VALUE str, long expand)
2468 {
2469  int termlen = TERM_LEN(str);
2470  long len = RSTRING_LEN(str);
2471 
2472  if (expand < 0) {
2473  rb_raise(rb_eArgError, "negative expanding string size");
2474  }
2475  if (expand >= LONG_MAX - len) {
2476  rb_raise(rb_eArgError, "string size too big");
2477  }
2478 
2479  if (!str_independent(str)) {
2480  str_make_independent_expand(str, len, expand, termlen);
2481  }
2482  else if (expand > 0) {
2483  RESIZE_CAPA_TERM(str, len + expand, termlen);
2484  }
2485  ENC_CODERANGE_CLEAR(str);
2486 }
2487 
2488 /* As rb_str_modify(), but don't clear coderange */
2489 static void
2490 str_modify_keep_cr(VALUE str)
2491 {
2492  if (!str_independent(str))
2493  str_make_independent(str);
2494  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2495  /* Force re-scan later */
2496  ENC_CODERANGE_CLEAR(str);
2497 }
2498 
2499 static inline void
2500 str_discard(VALUE str)
2501 {
2502  str_modifiable(str);
2503  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2504  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2505  RSTRING(str)->as.heap.ptr = 0;
2506  RSTRING(str)->as.heap.len = 0;
2507  }
2508 }
2509 
2510 void
2512 {
2513  rb_encoding *enc = rb_enc_get(str);
2514  if (!rb_enc_asciicompat(enc)) {
2515  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2516  }
2517 }
2518 
2519 VALUE
2521 {
2522  VALUE s = *ptr;
2523  if (!RB_TYPE_P(s, T_STRING)) {
2524  s = rb_str_to_str(s);
2525  *ptr = s;
2526  }
2527  return s;
2528 }
2529 
2530 char *
2532 {
2533  VALUE str = rb_string_value(ptr);
2534  return RSTRING_PTR(str);
2535 }
2536 
/*
 * Return 1 if the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2545 
2546 static const char *
2547 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2548 {
2549  const char *e = s + len;
2550 
2551  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2552  if (zero_filled(s, minlen)) return s;
2553  }
2554  return 0;
2555 }
2556 
2557 static char *
2558 str_fill_term(VALUE str, char *s, long len, int termlen)
2559 {
2560  /* This function assumes that (capa + termlen) bytes of memory
2561  * is allocated, like many other functions in this file.
2562  */
2563  if (str_dependent_p(str)) {
2564  if (!zero_filled(s + len, termlen))
2565  str_make_independent_expand(str, len, 0L, termlen);
2566  }
2567  else {
2568  TERM_FILL(s + len, termlen);
2569  return s;
2570  }
2571  return RSTRING_PTR(str);
2572 }
2573 
2574 void
2575 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2576 {
2577  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2578  long len = RSTRING_LEN(str);
2579 
2580  assert(capa >= len);
2581  if (capa - len < termlen) {
2582  rb_check_lockedtmp(str);
2583  str_make_independent_expand(str, len, 0L, termlen);
2584  }
2585  else if (str_dependent_p(str)) {
2586  if (termlen > oldtermlen)
2587  str_make_independent_expand(str, len, 0L, termlen);
2588  }
2589  else {
2590  if (!STR_EMBED_P(str)) {
2591  /* modify capa instead of realloc */
2592  assert(!FL_TEST((str), STR_SHARED));
2593  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2594  }
2595  if (termlen > oldtermlen) {
2596  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2597  }
2598  }
2599 
2600  return;
2601 }
2602 
2603 static char *
2604 str_null_check(VALUE str, int *w)
2605 {
2606  char *s = RSTRING_PTR(str);
2607  long len = RSTRING_LEN(str);
2608  rb_encoding *enc = rb_enc_get(str);
2609  const int minlen = rb_enc_mbminlen(enc);
2610 
2611  if (minlen > 1) {
2612  *w = 1;
2613  if (str_null_char(s, len, minlen, enc)) {
2614  return NULL;
2615  }
2616  return str_fill_term(str, s, len, minlen);
2617  }
2618  *w = 0;
2619  if (!s || memchr(s, 0, len)) {
2620  return NULL;
2621  }
2622  if (s[len]) {
2623  s = str_fill_term(str, s, len, minlen);
2624  }
2625  return s;
2626 }
2627 
2628 char *
2629 rb_str_to_cstr(VALUE str)
2630 {
2631  int w;
2632  return str_null_check(str, &w);
2633 }
2634 
2635 char *
2637 {
2638  VALUE str = rb_string_value(ptr);
2639  int w;
2640  char *s = str_null_check(str, &w);
2641  if (!s) {
2642  if (w) {
2643  rb_raise(rb_eArgError, "string contains null char");
2644  }
2645  rb_raise(rb_eArgError, "string contains null byte");
2646  }
2647  return s;
2648 }
2649 
2650 char *
2651 rb_str_fill_terminator(VALUE str, const int newminlen)
2652 {
2653  char *s = RSTRING_PTR(str);
2654  long len = RSTRING_LEN(str);
2655  return str_fill_term(str, s, len, newminlen);
2656 }
2657 
2658 VALUE
2660 {
2661  str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2662  return str;
2663 }
2664 
2665 /*
2666  * call-seq:
2667  * String.try_convert(object) -> object, new_string, or nil
2668  *
2669  * If +object+ is a \String object, returns +object+.
2670  *
2671  * Otherwise if +object+ responds to <tt>:to_str</tt>,
2672  * calls <tt>object.to_str</tt> and returns the result.
2673  *
2674  * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2675  *
2676  * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2677  */
2678 static VALUE
2679 rb_str_s_try_convert(VALUE dummy, VALUE str)
2680 {
2681  return rb_check_string_type(str);
2682 }
2683 
2684 static char*
2685 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2686 {
2687  long nth = *nthp;
2688  if (rb_enc_mbmaxlen(enc) == 1) {
2689  p += nth;
2690  }
2691  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2692  p += nth * rb_enc_mbmaxlen(enc);
2693  }
2694  else if (rb_enc_asciicompat(enc)) {
2695  const char *p2, *e2;
2696  int n;
2697 
2698  while (p < e && 0 < nth) {
2699  e2 = p + nth;
2700  if (e < e2) {
2701  *nthp = nth;
2702  return (char *)e;
2703  }
2704  if (ISASCII(*p)) {
2705  p2 = search_nonascii(p, e2);
2706  if (!p2) {
2707  nth -= e2 - p;
2708  *nthp = nth;
2709  return (char *)e2;
2710  }
2711  nth -= p2 - p;
2712  p = p2;
2713  }
2714  n = rb_enc_mbclen(p, e, enc);
2715  p += n;
2716  nth--;
2717  }
2718  *nthp = nth;
2719  if (nth != 0) {
2720  return (char *)e;
2721  }
2722  return (char *)p;
2723  }
2724  else {
2725  while (p < e && nth--) {
2726  p += rb_enc_mbclen(p, e, enc);
2727  }
2728  }
2729  if (p > e) p = e;
2730  *nthp = nth;
2731  return (char*)p;
2732 }
2733 
2734 char*
2735 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2736 {
2737  return str_nth_len(p, e, &nth, enc);
2738 }
2739 
2740 static char*
2741 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2742 {
2743  if (singlebyte)
2744  p += nth;
2745  else {
2746  p = str_nth_len(p, e, &nth, enc);
2747  }
2748  if (!p) return 0;
2749  if (p > e) p = e;
2750  return (char *)p;
2751 }
2752 
2753 /* char offset to byte offset */
2754 static long
2755 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2756 {
2757  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2758  if (!pp) return e - p;
2759  return pp - p;
2760 }
2761 
2762 long
2763 rb_str_offset(VALUE str, long pos)
2764 {
2765  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2766  STR_ENC_GET(str), single_byte_optimizable(str));
2767 }
2768 
2769 #ifdef NONASCII_MASK
2770 static char *
2771 str_utf8_nth(const char *p, const char *e, long *nthp)
2772 {
2773  long nth = *nthp;
2774  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2775  const uintptr_t *s, *t;
2776  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2777  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2778  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2779  while (p < (const char *)s) {
2780  if (is_utf8_lead_byte(*p)) nth--;
2781  p++;
2782  }
2783  do {
2784  nth -= count_utf8_lead_bytes_with_word(s);
2785  s++;
2786  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2787  p = (char *)s;
2788  }
2789  while (p < e) {
2790  if (is_utf8_lead_byte(*p)) {
2791  if (nth == 0) break;
2792  nth--;
2793  }
2794  p++;
2795  }
2796  *nthp = nth;
2797  return (char *)p;
2798 }
2799 
/*
 * Return the byte distance from +p+ to the position +nth+ UTF-8
 * characters later (see str_utf8_nth()).
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *pos = str_utf8_nth(p, e, &remaining);
    return pos - p;
}
2806 #endif
2807 
2808 /* byte offset to char offset */
2809 long
2810 rb_str_sublen(VALUE str, long pos)
2811 {
2812  if (single_byte_optimizable(str) || pos < 0)
2813  return pos;
2814  else {
2815  char *p = RSTRING_PTR(str);
2816  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2817  }
2818 }
2819 
2820 VALUE
2821 rb_str_subseq(VALUE str, long beg, long len)
2822 {
2823  VALUE str2;
2824 
2825  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2826  SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2827  long olen;
2828  str2 = rb_str_new_shared(rb_str_new_frozen_String(str));
2829  RSTRING(str2)->as.heap.ptr += beg;
2830  olen = RSTRING(str2)->as.heap.len;
2831  if (olen > len) RSTRING(str2)->as.heap.len = len;
2832  }
2833  else {
2834  str2 = rb_str_new(RSTRING_PTR(str)+beg, len);
2835  RB_GC_GUARD(str);
2836  }
2837 
2838  rb_enc_cr_str_copy_for_substr(str2, str);
2839 
2840  return str2;
2841 }
2842 
2843 char *
2844 rb_str_subpos(VALUE str, long beg, long *lenp)
2845 {
2846  long len = *lenp;
2847  long slen = -1L;
2848  long blen = RSTRING_LEN(str);
2849  rb_encoding *enc = STR_ENC_GET(str);
2850  char *p, *s = RSTRING_PTR(str), *e = s + blen;
2851 
2852  if (len < 0) return 0;
2853  if (!blen) {
2854  len = 0;
2855  }
2856  if (single_byte_optimizable(str)) {
2857  if (beg > blen) return 0;
2858  if (beg < 0) {
2859  beg += blen;
2860  if (beg < 0) return 0;
2861  }
2862  if (len > blen - beg)
2863  len = blen - beg;
2864  if (len < 0) return 0;
2865  p = s + beg;
2866  goto end;
2867  }
2868  if (beg < 0) {
2869  if (len > -beg) len = -beg;
2870  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2871  beg = -beg;
2872  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2873  p = e;
2874  if (!p) return 0;
2875  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2876  if (!p) return 0;
2877  len = e - p;
2878  goto end;
2879  }
2880  else {
2881  slen = str_strlen(str, enc);
2882  beg += slen;
2883  if (beg < 0) return 0;
2884  p = s + beg;
2885  if (len == 0) goto end;
2886  }
2887  }
2888  else if (beg > 0 && beg > RSTRING_LEN(str)) {
2889  return 0;
2890  }
2891  if (len == 0) {
2892  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2893  p = s + beg;
2894  }
2895 #ifdef NONASCII_MASK
2896  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2897  enc == rb_utf8_encoding()) {
2898  p = str_utf8_nth(s, e, &beg);
2899  if (beg > 0) return 0;
2900  len = str_utf8_offset(p, e, len);
2901  }
2902 #endif
2903  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2904  int char_sz = rb_enc_mbmaxlen(enc);
2905 
2906  p = s + beg * char_sz;
2907  if (p > e) {
2908  return 0;
2909  }
2910  else if (len * char_sz > e - p)
2911  len = e - p;
2912  else
2913  len *= char_sz;
2914  }
2915  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2916  if (beg > 0) return 0;
2917  len = 0;
2918  }
2919  else {
2920  len = str_offset(p, e, len, enc, 0);
2921  }
2922  end:
2923  *lenp = len;
2924  RB_GC_GUARD(str);
2925  return p;
2926 }
2927 
2928 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2929 
2930 VALUE
2931 rb_str_substr(VALUE str, long beg, long len)
2932 {
2933  return str_substr(str, beg, len, TRUE);
2934 }
2935 
2936 static VALUE
2937 str_substr(VALUE str, long beg, long len, int empty)
2938 {
2939  VALUE str2;
2940  char *p = rb_str_subpos(str, beg, &len);
2941 
2942  if (!p) return Qnil;
2943  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2944  SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2945  long ofs = p - RSTRING_PTR(str);
2946  str2 = rb_str_new_frozen(str);
2947  str2 = str_new_shared(rb_cString, str2);
2948  RSTRING(str2)->as.heap.ptr += ofs;
2949  RSTRING(str2)->as.heap.len = len;
2950  ENC_CODERANGE_CLEAR(str2);
2951  }
2952  else {
2953  if (!len && !empty) return Qnil;
2954  str2 = rb_str_new(p, len);
2955  RB_GC_GUARD(str);
2956  }
2957  rb_enc_cr_str_copy_for_substr(str2, str);
2958 
2959  return str2;
2960 }
2961 
2962 VALUE
2964 {
2965  if (OBJ_FROZEN(str)) return str;
2966  rb_str_resize(str, RSTRING_LEN(str));
2967  return rb_obj_freeze(str);
2968 }
2969 
2970 
2971 /*
2972  * call-seq:
2973  * +string -> new_string or self
2974  *
2975  * Returns +self+ if +self+ is not frozen.
2976  *
2977  * Otherwise, returns <tt>self.dup</tt>, which is not frozen.
2978  */
2979 static VALUE
2980 str_uplus(VALUE str)
2981 {
2982  if (OBJ_FROZEN(str)) {
2983  return rb_str_dup(str);
2984  }
2985  else {
2986  return str;
2987  }
2988 }
2989 
2990 /*
2991  * call-seq:
2992  * -string -> frozen_string
2993  *
2994  * Returns a frozen, possibly pre-existing copy of the string.
2995  *
2996  * The returned \String will be deduplicated as long as it does not have
2997  * any instance variables set on it.
2998  */
2999 static VALUE
3000 str_uminus(VALUE str)
3001 {
3002  if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3003  str = rb_str_dup(str);
3004  }
3005  return rb_fstring(str);
3006 }
3007 
3008 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3009 #define rb_str_dup_frozen rb_str_new_frozen
3010 
3011 VALUE
3012 rb_str_locktmp(VALUE str)
3013 {
3014  if (FL_TEST(str, STR_TMPLOCK)) {
3015  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3016  }
3017  FL_SET(str, STR_TMPLOCK);
3018  return str;
3019 }
3020 
3021 VALUE
3023 {
3024  if (!FL_TEST(str, STR_TMPLOCK)) {
3025  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3026  }
3027  FL_UNSET(str, STR_TMPLOCK);
3028  return str;
3029 }
3030 
3031 RUBY_FUNC_EXPORTED VALUE
3032 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3033 {
3034  rb_str_locktmp(str);
3035  return rb_ensure(func, arg, rb_str_unlocktmp, str);
3036 }
3037 
3038 void
3040 {
3041  long capa;
3042  const int termlen = TERM_LEN(str);
3043 
3044  str_modifiable(str);
3045  if (STR_SHARED_P(str)) {
3046  rb_raise(rb_eRuntimeError, "can't set length of shared string");
3047  }
3048  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3049  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3050  }
3051  STR_SET_LEN(str, len);
3052  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3053 }
3054 
3055 VALUE
3057 {
3058  long slen;
3059  int independent;
3060 
3061  if (len < 0) {
3062  rb_raise(rb_eArgError, "negative string size (or size too big)");
3063  }
3064 
3065  independent = str_independent(str);
3066  ENC_CODERANGE_CLEAR(str);
3067  slen = RSTRING_LEN(str);
3068 
3069  {
3070  long capa;
3071  const int termlen = TERM_LEN(str);
3072  if (STR_EMBED_P(str)) {
3073  if (len == slen) return str;
3074  if (str_embed_capa(str) >= len + termlen) {
3075  STR_SET_EMBED_LEN(str, len);
3076  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3077  return str;
3078  }
3079  str_make_independent_expand(str, slen, len - slen, termlen);
3080  }
3081  else if (str_embed_capa(str) >= len + termlen) {
3082  char *ptr = STR_HEAP_PTR(str);
3083  STR_SET_EMBED(str);
3084  if (slen > len) slen = len;
3085  if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3086  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3087  STR_SET_EMBED_LEN(str, len);
3088  if (independent) ruby_xfree(ptr);
3089  return str;
3090  }
3091  else if (!independent) {
3092  if (len == slen) return str;
3093  str_make_independent_expand(str, slen, len - slen, termlen);
3094  }
3095  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3096  (capa - len) > (len < 1024 ? len : 1024)) {
3097  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3098  (size_t)len + termlen, STR_HEAP_SIZE(str));
3099  RSTRING(str)->as.heap.aux.capa = len;
3100  }
3101  else if (len == slen) return str;
3102  RSTRING(str)->as.heap.len = len;
3103  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3104  }
3105  return str;
3106 }
3107 
3108 static VALUE
3109 str_buf_cat(VALUE str, const char *ptr, long len)
3110 {
3111  long capa, total, olen, off = -1;
3112  char *sptr;
3113  const int termlen = TERM_LEN(str);
3114 #if !USE_RVARGC
3115  assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3116 #endif
3117 
3118  RSTRING_GETMEM(str, sptr, olen);
3119  if (ptr >= sptr && ptr <= sptr + olen) {
3120  off = ptr - sptr;
3121  }
3122  rb_str_modify(str);
3123  if (len == 0) return 0;
3124  if (STR_EMBED_P(str)) {
3125  capa = str_embed_capa(str) - termlen;
3126  sptr = RSTRING(str)->as.embed.ary;
3127  olen = RSTRING_EMBED_LEN(str);
3128  }
3129  else {
3130  capa = RSTRING(str)->as.heap.aux.capa;
3131  sptr = RSTRING(str)->as.heap.ptr;
3132  olen = RSTRING(str)->as.heap.len;
3133  }
3134  if (olen > LONG_MAX - len) {
3135  rb_raise(rb_eArgError, "string sizes too big");
3136  }
3137  total = olen + len;
3138  if (capa < total) {
3139  if (total >= LONG_MAX / 2) {
3140  capa = total;
3141  }
3142  while (total > capa) {
3143  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3144  }
3145  RESIZE_CAPA_TERM(str, capa, termlen);
3146  sptr = RSTRING_PTR(str);
3147  }
3148  if (off != -1) {
3149  ptr = sptr + off;
3150  }
3151  memcpy(sptr + olen, ptr, len);
3152  STR_SET_LEN(str, total);
3153  TERM_FILL(sptr + total, termlen); /* sentinel */
3154 
3155  return str;
3156 }
3157 
3158 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
3159 
3160 VALUE
3161 rb_str_cat(VALUE str, const char *ptr, long len)
3162 {
3163  if (len == 0) return str;
3164  if (len < 0) {
3165  rb_raise(rb_eArgError, "negative string size (or size too big)");
3166  }
3167  return str_buf_cat(str, ptr, len);
3168 }
3169 
3170 VALUE
3171 rb_str_cat_cstr(VALUE str, const char *ptr)
3172 {
3173  must_not_null(ptr);
3174  return rb_str_buf_cat(str, ptr, strlen(ptr));
3175 }
3176 
3177 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3178 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3179 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3180 
3181 static VALUE
3182 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3183  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3184 {
3185  int str_encindex = ENCODING_GET(str);
3186  int res_encindex;
3187  int str_cr, res_cr;
3188  rb_encoding *str_enc, *ptr_enc;
3189 
3190  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3191 
3192  if (str_encindex == ptr_encindex) {
3193  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3194  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3195  }
3196  }
3197  else {
3198  str_enc = rb_enc_from_index(str_encindex);
3199  ptr_enc = rb_enc_from_index(ptr_encindex);
3200  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3201  if (len == 0)
3202  return str;
3203  if (RSTRING_LEN(str) == 0) {
3204  rb_str_buf_cat(str, ptr, len);
3205  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3206  return str;
3207  }
3208  goto incompatible;
3209  }
3210  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3211  ptr_cr = coderange_scan(ptr, len, ptr_enc);
3212  }
3213  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3214  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3215  str_cr = rb_enc_str_coderange(str);
3216  }
3217  }
3218  }
3219  if (ptr_cr_ret)
3220  *ptr_cr_ret = ptr_cr;
3221 
3222  if (str_encindex != ptr_encindex &&
3223  str_cr != ENC_CODERANGE_7BIT &&
3224  ptr_cr != ENC_CODERANGE_7BIT) {
3225  str_enc = rb_enc_from_index(str_encindex);
3226  ptr_enc = rb_enc_from_index(ptr_encindex);
3227  goto incompatible;
3228  }
3229 
3230  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231  res_encindex = str_encindex;
3232  res_cr = ENC_CODERANGE_UNKNOWN;
3233  }
3234  else if (str_cr == ENC_CODERANGE_7BIT) {
3235  if (ptr_cr == ENC_CODERANGE_7BIT) {
3236  res_encindex = str_encindex;
3237  res_cr = ENC_CODERANGE_7BIT;
3238  }
3239  else {
3240  res_encindex = ptr_encindex;
3241  res_cr = ptr_cr;
3242  }
3243  }
3244  else if (str_cr == ENC_CODERANGE_VALID) {
3245  res_encindex = str_encindex;
3246  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3247  res_cr = str_cr;
3248  else
3249  res_cr = ptr_cr;
3250  }
3251  else { /* str_cr == ENC_CODERANGE_BROKEN */
3252  res_encindex = str_encindex;
3253  res_cr = str_cr;
3254  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3255  }
3256 
3257  if (len < 0) {
3258  rb_raise(rb_eArgError, "negative string size (or size too big)");
3259  }
3260  str_buf_cat(str, ptr, len);
3261  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3262  return str;
3263 
3264  incompatible:
3265  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3266  rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3268 }
3269 
3270 VALUE
3271 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3272 {
3273  return rb_enc_cr_str_buf_cat(str, ptr, len,
3274  rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3275 }
3276 
3277 VALUE
3278 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3279 {
3280  /* ptr must reference NUL terminated ASCII string. */
3281  int encindex = ENCODING_GET(str);
3282  rb_encoding *enc = rb_enc_from_index(encindex);
3283  if (rb_enc_asciicompat(enc)) {
3284  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3285  encindex, ENC_CODERANGE_7BIT, 0);
3286  }
3287  else {
3288  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3289  while (*ptr) {
3290  unsigned int c = (unsigned char)*ptr;
3291  int len = rb_enc_codelen(c, enc);
3292  rb_enc_mbcput(c, buf, enc);
3293  rb_enc_cr_str_buf_cat(str, buf, len,
3294  encindex, ENC_CODERANGE_VALID, 0);
3295  ptr++;
3296  }
3297  return str;
3298  }
3299 }
3300 
3301 VALUE
3303 {
3304  int str2_cr;
3305 
3306  str2_cr = ENC_CODERANGE(str2);
3307 
3308  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3309  ENCODING_GET(str2), str2_cr, &str2_cr);
3310 
3311  ENC_CODERANGE_SET(str2, str2_cr);
3312 
3313  return str;
3314 }
3315 
3316 VALUE
3318 {
3319  StringValue(str2);
3320  return rb_str_buf_append(str, str2);
3321 }
3322 
3323 #define MIN_PRE_ALLOC_SIZE 48
3324 
3325 MJIT_FUNC_EXPORTED VALUE
3326 rb_str_concat_literals(size_t num, const VALUE *strary)
3327 {
3328  VALUE str;
3329  size_t i, s;
3330  long len = 1;
3331 
3332  if (UNLIKELY(!num)) return rb_str_new(0, 0);
3333  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3334 
3335  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3336  if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3337  str = rb_str_resurrect(strary[0]);
3338  s = 1;
3339  }
3340  else {
3341  str = rb_str_buf_new(len);
3342  rb_enc_copy(str, strary[0]);
3343  s = 0;
3344  }
3345 
3346  for (i = s; i < num; ++i) {
3347  const VALUE v = strary[i];
3348  int encidx = ENCODING_GET(v);
3349 
3350  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
3351  encidx, ENC_CODERANGE(v), NULL);
3352  if (encidx != ENCINDEX_US_ASCII) {
3353  if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3354  rb_enc_set_index(str, encidx);
3355  }
3356  }
3357  return str;
3358 }
3359 
3360 /*
3361  * call-seq:
3362  * concat(*objects) -> string
3363  *
3364  * Concatenates each object in +objects+ to +self+ and returns +self+:
3365  *
3366  * s = 'foo'
3367  * s.concat('bar', 'baz') # => "foobarbaz"
3368  * s # => "foobarbaz"
3369  *
3370  * For each given object +object+ that is an \Integer,
3371  * the value is considered a codepoint and converted to a character before concatenation:
3372  *
3373  * s = 'foo'
3374  * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3375  *
3376  * Related: String#<<, which takes a single argument.
3377  */
3378 static VALUE
3379 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3380 {
3381  str_modifiable(str);
3382 
3383  if (argc == 1) {
3384  return rb_str_concat(str, argv[0]);
3385  }
3386  else if (argc > 1) {
3387  int i;
3388  VALUE arg_str = rb_str_tmp_new(0);
3389  rb_enc_copy(arg_str, str);
3390  for (i = 0; i < argc; i++) {
3391  rb_str_concat(arg_str, argv[i]);
3392  }
3393  rb_str_buf_append(str, arg_str);
3394  }
3395 
3396  return str;
3397 }
3398 
3399 /*
3400  * call-seq:
3401  * string << object -> string
3402  *
3403  * Concatenates +object+ to +self+ and returns +self+:
3404  *
3405  * s = 'foo'
3406  * s << 'bar' # => "foobar"
3407  * s # => "foobar"
3408  *
3409  * If +object+ is an \Integer,
3410  * the value is considered a codepoint and converted to a character before concatenation:
3411  *
3412  * s = 'foo'
3413  * s << 33 # => "foo!"
3414  *
3415  * Related: String#concat, which takes multiple arguments.
3416  */
3417 VALUE
3419 {
3420  unsigned int code;
3421  rb_encoding *enc = STR_ENC_GET(str1);
3422  int encidx;
3423 
3424  if (RB_INTEGER_TYPE_P(str2)) {
3425  if (rb_num_to_uint(str2, &code) == 0) {
3426  }
3427  else if (FIXNUM_P(str2)) {
3428  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3429  }
3430  else {
3431  rb_raise(rb_eRangeError, "bignum out of char range");
3432  }
3433  }
3434  else {
3435  return rb_str_append(str1, str2);
3436  }
3437 
3438  encidx = rb_enc_to_index(enc);
3439  if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
3440  /* US-ASCII automatically extended to ASCII-8BIT */
3441  char buf[1];
3442  buf[0] = (char)code;
3443  if (code > 0xFF) {
3444  rb_raise(rb_eRangeError, "%u out of char range", code);
3445  }
3446  rb_str_cat(str1, buf, 1);
3447  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3448  rb_enc_associate_index(str1, ENCINDEX_ASCII);
3450  }
3451  }
3452  else {
3453  long pos = RSTRING_LEN(str1);
3454  int cr = ENC_CODERANGE(str1);
3455  int len;
3456  char *buf;
3457 
3458  switch (len = rb_enc_codelen(code, enc)) {
3459  case ONIGERR_INVALID_CODE_POINT_VALUE:
3460  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3461  break;
3462  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3463  case 0:
3464  rb_raise(rb_eRangeError, "%u out of char range", code);
3465  break;
3466  }
3467  buf = ALLOCA_N(char, len + 1);
3468  rb_enc_mbcput(code, buf, enc);
3469  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3470  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3471  }
3472  rb_str_resize(str1, pos+len);
3473  memcpy(RSTRING_PTR(str1) + pos, buf, len);
3474  if (cr == ENC_CODERANGE_7BIT && code > 127)
3475  cr = ENC_CODERANGE_VALID;
3476  ENC_CODERANGE_SET(str1, cr);
3477  }
3478  return str1;
3479 }
3480 
3481 /*
3482  * call-seq:
3483  * prepend(*other_strings) -> string
3484  *
3485  * Prepends each string in +other_strings+ to +self+ and returns +self+:
3486  *
3487  * s = 'foo'
3488  * s.prepend('bar', 'baz') # => "barbazfoo"
3489  * s # => "barbazfoo"
3490  *
3491  * Related: String#concat.
3492  */
3493 
3494 static VALUE
3495 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3496 {
3497  str_modifiable(str);
3498 
3499  if (argc == 1) {
3500  rb_str_update(str, 0L, 0L, argv[0]);
3501  }
3502  else if (argc > 1) {
3503  int i;
3504  VALUE arg_str = rb_str_tmp_new(0);
3505  rb_enc_copy(arg_str, str);
3506  for (i = 0; i < argc; i++) {
3507  rb_str_append(arg_str, argv[i]);
3508  }
3509  rb_str_update(str, 0L, 0L, arg_str);
3510  }
3511 
3512  return str;
3513 }
3514 
3515 st_index_t
3517 {
3518  int e = ENCODING_GET(str);
3519  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
3520  e = 0;
3521  }
3522  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3523 }
3524 
3525 int
3527 {
3528  long len1, len2;
3529  const char *ptr1, *ptr2;
3530  RSTRING_GETMEM(str1, ptr1, len1);
3531  RSTRING_GETMEM(str2, ptr2, len2);
3532  return (len1 != len2 ||
3533  !rb_str_comparable(str1, str2) ||
3534  memcmp(ptr1, ptr2, len1) != 0);
3535 }
3536 
3537 /*
3538  * call-seq:
3539  * hash -> integer
3540  *
3541  * Returns the integer hash value for +self+.
3542  * The value is based on the length, content and encoding of +self+.
3543  *
3544  * Related: Object#hash.
3545  */
3546 
3547 static VALUE
3548 rb_str_hash_m(VALUE str)
3549 {
3550  st_index_t hval = rb_str_hash(str);
3551  return ST2FIX(hval);
3552 }
3553 
3554 #define lesser(a,b) (((a)>(b))?(b):(a))
3555 
3556 int
3558 {
3559  int idx1, idx2;
3560  int rc1, rc2;
3561 
3562  if (RSTRING_LEN(str1) == 0) return TRUE;
3563  if (RSTRING_LEN(str2) == 0) return TRUE;
3564  idx1 = ENCODING_GET(str1);
3565  idx2 = ENCODING_GET(str2);
3566  if (idx1 == idx2) return TRUE;
3567  rc1 = rb_enc_str_coderange(str1);
3568  rc2 = rb_enc_str_coderange(str2);
3569  if (rc1 == ENC_CODERANGE_7BIT) {
3570  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3572  return TRUE;
3573  }
3574  if (rc2 == ENC_CODERANGE_7BIT) {
3576  return TRUE;
3577  }
3578  return FALSE;
3579 }
3580 
3581 int
3583 {
3584  long len1, len2;
3585  const char *ptr1, *ptr2;
3586  int retval;
3587 
3588  if (str1 == str2) return 0;
3589  RSTRING_GETMEM(str1, ptr1, len1);
3590  RSTRING_GETMEM(str2, ptr2, len2);
3591  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3592  if (len1 == len2) {
3593  if (!rb_str_comparable(str1, str2)) {
3594  if (ENCODING_GET(str1) > ENCODING_GET(str2))
3595  return 1;
3596  return -1;
3597  }
3598  return 0;
3599  }
3600  if (len1 > len2) return 1;
3601  return -1;
3602  }
3603  if (retval > 0) return 1;
3604  return -1;
3605 }
3606 
3607 /*
3608  * call-seq:
3609  * string == object -> true or false
3610  * string === object -> true or false
3611  *
3612  * Returns +true+ if +object+ has the same length and content
3613  * as +self+; +false+ otherwise:
3614  *
3615  * s = 'foo'
3616  * s == 'foo' # => true
3617  * s == 'food' # => false
3618  * s == 'FOO' # => false
3619  *
3620  * Returns +false+ if the two strings' encodings are not compatible:
3621  * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3622  *
3623  * If +object+ is not an instance of \String but responds to +to_str+, then the
3624  * two strings are compared using <code>object.==</code>.
3625  */
3626 
3627 VALUE
3629 {
3630  if (str1 == str2) return Qtrue;
3631  if (!RB_TYPE_P(str2, T_STRING)) {
3632  if (!rb_respond_to(str2, idTo_str)) {
3633  return Qfalse;
3634  }
3635  return rb_equal(str2, str1);
3636  }
3637  return rb_str_eql_internal(str1, str2);
3638 }
3639 
3640 /*
3641  * call-seq:
3642  * eql?(object) -> true or false
3643  *
3644  * Returns +true+ if +object+ has the same length and content
3645  * as +self+; +false+ otherwise:
3646  *
3647  * s = 'foo'
3648  * s.eql?('foo') # => true
3649  * s.eql?('food') # => false
3650  * s.eql?('FOO') # => false
3651  *
3652  * Returns +false+ if the two strings' encodings are not compatible:
3653  *
3654  * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3655  *
3656  */
3657 
3658 MJIT_FUNC_EXPORTED VALUE
3659 rb_str_eql(VALUE str1, VALUE str2)
3660 {
3661  if (str1 == str2) return Qtrue;
3662  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3663  return rb_str_eql_internal(str1, str2);
3664 }
3665 
3666 /*
3667  * call-seq:
3668  * string <=> other_string -> -1, 0, 1, or nil
3669  *
3670  * Compares +self+ and +other_string+, returning:
3671  *
3672  * - -1 if +other_string+ is larger.
3673  * - 0 if the two are equal.
3674  * - 1 if +other_string+ is smaller.
3675  * - +nil+ if the two are incomparable.
3676  *
3677  * Examples:
3678  *
3679  * 'foo' <=> 'foo' # => 0
3680  * 'foo' <=> 'food' # => -1
3681  * 'food' <=> 'foo' # => 1
3682  * 'FOO' <=> 'foo' # => -1
3683  * 'foo' <=> 'FOO' # => 1
3684  * 'foo' <=> 1 # => nil
3685  *
3686  */
3687 
3688 static VALUE
3689 rb_str_cmp_m(VALUE str1, VALUE str2)
3690 {
3691  int result;
3692  VALUE s = rb_check_string_type(str2);
3693  if (NIL_P(s)) {
3694  return rb_invcmp(str1, str2);
3695  }
3696  result = rb_str_cmp(str1, s);
3697  return INT2FIX(result);
3698 }
3699 
3700 static VALUE str_casecmp(VALUE str1, VALUE str2);
3701 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3702 
3703 /*
3704  * call-seq:
3705  * casecmp(other_string) -> -1, 0, 1, or nil
3706  *
3707  * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3708  *
3709  * - -1 if <tt>other_string.downcase</tt> is larger.
3710  * - 0 if the two are equal.
3711  * - 1 if <tt>other_string.downcase</tt> is smaller.
3712  * - +nil+ if the two are incomparable.
3713  *
3714  * Examples:
3715  *
3716  * 'foo'.casecmp('foo') # => 0
3717  * 'foo'.casecmp('food') # => -1
3718  * 'food'.casecmp('foo') # => 1
3719  * 'FOO'.casecmp('foo') # => 0
3720  * 'foo'.casecmp('FOO') # => 0
3721  * 'foo'.casecmp(1) # => nil
3722  *
3723  * See {Case Mapping}[doc/case_mapping_rdoc.html].
3724  *
3725  * Related: String#casecmp?.
3726  *
3727  */
3728 
3729 static VALUE
3730 rb_str_casecmp(VALUE str1, VALUE str2)
3731 {
3732  VALUE s = rb_check_string_type(str2);
3733  if (NIL_P(s)) {
3734  return Qnil;
3735  }
3736  return str_casecmp(str1, s);
3737 }
3738 
3739 static VALUE
3740 str_casecmp(VALUE str1, VALUE str2)
3741 {
3742  long len;
3743  rb_encoding *enc;
3744  const char *p1, *p1end, *p2, *p2end;
3745 
3746  enc = rb_enc_compatible(str1, str2);
3747  if (!enc) {
3748  return Qnil;
3749  }
3750 
3751  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3752  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3753  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3754  while (p1 < p1end && p2 < p2end) {
3755  if (*p1 != *p2) {
3756  unsigned int c1 = TOLOWER(*p1 & 0xff);
3757  unsigned int c2 = TOLOWER(*p2 & 0xff);
3758  if (c1 != c2)
3759  return INT2FIX(c1 < c2 ? -1 : 1);
3760  }
3761  p1++;
3762  p2++;
3763  }
3764  }
3765  else {
3766  while (p1 < p1end && p2 < p2end) {
3767  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3768  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3769 
3770  if (0 <= c1 && 0 <= c2) {
3771  c1 = TOLOWER(c1);
3772  c2 = TOLOWER(c2);
3773  if (c1 != c2)
3774  return INT2FIX(c1 < c2 ? -1 : 1);
3775  }
3776  else {
3777  int r;
3778  l1 = rb_enc_mbclen(p1, p1end, enc);
3779  l2 = rb_enc_mbclen(p2, p2end, enc);
3780  len = l1 < l2 ? l1 : l2;
3781  r = memcmp(p1, p2, len);
3782  if (r != 0)
3783  return INT2FIX(r < 0 ? -1 : 1);
3784  if (l1 != l2)
3785  return INT2FIX(l1 < l2 ? -1 : 1);
3786  }
3787  p1 += l1;
3788  p2 += l2;
3789  }
3790  }
3791  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3792  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3793  return INT2FIX(-1);
3794 }
3795 
3796 /*
3797  * call-seq:
3798  * casecmp?(other_string) -> true, false, or nil
3799  *
3800  * Returns +true+ if +self+ and +other_string+ are equal after
3801  * Unicode case folding, otherwise +false+:
3802  *
3803  * 'foo'.casecmp?('foo') # => true
3804  * 'foo'.casecmp?('food') # => false
3805  * 'food'.casecmp?('foo') # => false
3806  * 'FOO'.casecmp?('foo') # => true
3807  * 'foo'.casecmp?('FOO') # => true
3808  *
3809  * Returns +nil+ if the two values are incomparable:
3810  *
3811  * 'foo'.casecmp?(1) # => nil
3812  *
3813  * See {Case Mapping}[doc/case_mapping_rdoc.html].
3814  *
3815  * Related: String#casecmp.
3816  *
3817  */
3818 
3819 static VALUE
3820 rb_str_casecmp_p(VALUE str1, VALUE str2)
3821 {
3822  VALUE s = rb_check_string_type(str2);
3823  if (NIL_P(s)) {
3824  return Qnil;
3825  }
3826  return str_casecmp_p(str1, s);
3827 }
3828 
3829 static VALUE
3830 str_casecmp_p(VALUE str1, VALUE str2)
3831 {
3832  rb_encoding *enc;
3833  VALUE folded_str1, folded_str2;
3834  VALUE fold_opt = sym_fold;
3835 
3836  enc = rb_enc_compatible(str1, str2);
3837  if (!enc) {
3838  return Qnil;
3839  }
3840 
3841  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3842  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3843 
3844  return rb_str_eql(folded_str1, folded_str2);
3845 }
3846 
/*
 * Looks for the sub_len bytes at sub_ptr inside the bytes starting at
 * str_ptr, considering str_len - offset bytes.
 *
 * A hit reported by rb_memsearch() is accepted only when
 * rb_enc_right_char_head() maps it onto itself; otherwise the search is
 * resumed from the position that function returned.
 *
 * Returns the hit position relative to str_ptr plus +offset+, or a
 * negative value when nothing is found.
 */
3847 static long
3848 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3849  const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3850 {
3851  const char *search_start = str_ptr;
3852  long pos, search_len = str_len - offset;
3853 
3854  for (;;) {
3855  const char *t;
3856  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
 /* no byte-level hit at all: pass the negative result through */
3857  if (pos < 0) return pos;
3858  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
 /* the hit already sits on a character head: accept it */
3859  if (t == search_start + pos) break;
 /* otherwise skip ahead to t, shrinking the window and growing the
  * offset by the same number of bytes */
3860  search_len -= t - search_start;
3861  if (search_len <= 0) return -1;
3862  offset += t - search_start;
3863  search_start = t;
3864  }
3865  return pos + offset;
3866 }
3867 
3868 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3869 
3870 static long
3871 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3872 {
3873  const char *str_ptr, *str_ptr_end, *sub_ptr;
3874  long str_len, sub_len;
3875  rb_encoding *enc;
3876 
3877  enc = rb_enc_check(str, sub);
3878  if (is_broken_string(sub)) return -1;
3879 
3880  str_ptr = RSTRING_PTR(str);
3881  str_ptr_end = RSTRING_END(str);
3882  str_len = RSTRING_LEN(str);
3883  sub_ptr = RSTRING_PTR(sub);
3884  sub_len = RSTRING_LEN(sub);
3885 
3886  if (str_len < sub_len) return -1;
3887 
3888  if (offset != 0) {
3889  long str_len_char, sub_len_char;
3890  int single_byte = single_byte_optimizable(str);
3891  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3892  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3893  if (offset < 0) {
3894  offset += str_len_char;
3895  if (offset < 0) return -1;
3896  }
3897  if (str_len_char - offset < sub_len_char) return -1;
3898  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3899  str_ptr += offset;
3900  }
3901  if (sub_len == 0) return offset;
3902 
3903  /* need proceed one character at a time */
3904  return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3905 }
3906 
3907 
3908 /*
3909  * call-seq:
3910  * index(substring, offset = 0) -> integer or nil
3911  * index(regexp, offset = 0) -> integer or nil
3912  *
3913  * Returns the \Integer index of the first occurrence of the given +substring+,
3914  * or +nil+ if none found:
3915  *
3916  * 'foo'.index('f') # => 0
3917  * 'foo'.index('o') # => 1
3918  * 'foo'.index('oo') # => 1
3919  * 'foo'.index('ooo') # => nil
3920  *
3921  * Returns the \Integer index of the first match for the given \Regexp +regexp+,
3922  * or +nil+ if none found:
3923  *
3924  * 'foo'.index(/f/) # => 0
3925  * 'foo'.index(/o/) # => 1
3926  * 'foo'.index(/oo/) # => 1
3927  * 'foo'.index(/ooo/) # => nil
3928  *
3929  * \Integer argument +offset+, if given, specifies the position in the
3930  * string to begin the search:
3931  *
3932  * 'foo'.index('o', 1) # => 1
3933  * 'foo'.index('o', 2) # => 2
3934  * 'foo'.index('o', 3) # => nil
3935  *
3936  * If +offset+ is negative, counts backward from the end of +self+:
3937  *
3938  * 'foo'.index('o', -1) # => 2
3939  * 'foo'.index('o', -2) # => 1
3940  * 'foo'.index('o', -3) # => 1
3941  * 'foo'.index('o', -4) # => nil
3942  *
3943  * Related: String#rindex.
3944  */
3945 
3946 static VALUE
3947 rb_str_index_m(int argc, VALUE *argv, VALUE str)
3948 {
3949  VALUE sub;
3950  VALUE initpos;
3951  long pos;
3952 
3953  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3954  pos = NUM2LONG(initpos);
3955  }
3956  else {
3957  pos = 0;
3958  }
3959  if (pos < 0) {
3960  pos += str_strlen(str, NULL);
3961  if (pos < 0) {
3962  if (RB_TYPE_P(sub, T_REGEXP)) {
3964  }
3965  return Qnil;
3966  }
3967  }
3968 
3969  if (RB_TYPE_P(sub, T_REGEXP)) {
3970  if (pos > str_strlen(str, NULL))
3971  return Qnil;
3972  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3973  rb_enc_check(str, sub), single_byte_optimizable(str));
3974 
3975  if (rb_reg_search(sub, str, pos, 0) < 0) {
3976  return Qnil;
3977  }
3978  else {
3979  VALUE match = rb_backref_get();
3980  struct re_registers *regs = RMATCH_REGS(match);
3981  pos = rb_str_sublen(str, BEG(0));
3982  return LONG2NUM(pos);
3983  }
3984  }
3985  else {
3986  StringValue(sub);
3987  pos = rb_str_index(str, sub, pos);
3988  pos = rb_str_sublen(str, pos);
3989  }
3990 
3991  if (pos == -1) return Qnil;
3992  return LONG2NUM(pos);
3993 }
3994 
/*
 * Backward search for +sub+ in +str+.  +s+ points at the position in
 * +str+ where the search starts and +pos+ is the matching index.
 * Returns the index of the match, or -1 when there is none.  Two
 * implementations follow, selected by HAVE_MEMRCHR.
 */
3995 #ifdef HAVE_MEMRCHR
/* Variant built on memrchr(): scans backward for the first byte of +sub+
 * and verifies each candidate. */
3996 static long
3997 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3998 {
3999  char *hit, *adjusted;
4000  int c;
4001  long slen, searchlen;
4002  char *sbeg, *e, *t;
4003 
4004  slen = RSTRING_LEN(sub);
 /* an empty needle matches at the starting position */
4005  if (slen == 0) return pos;
4006  sbeg = RSTRING_PTR(str);
4007  e = RSTRING_END(str);
4008  t = RSTRING_PTR(sub);
 /* first byte of the needle, as an unsigned value for memrchr() */
4009  c = *t & 0xff;
 /* search window: from the start of +str+ up to and including *s */
4010  searchlen = s - sbeg + 1;
4011 
4012  do {
4013  hit = memrchr(sbeg, c, searchlen);
4014  if (!hit) break;
4015  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
 /* the byte found is not a character head: shrink the window to end
  * before that character and retry (continue re-tests searchlen > 0) */
4016  if (hit != adjusted) {
4017  searchlen = adjusted - sbeg;
4018  continue;
4019  }
4020  if (memcmp(hit, t, slen) == 0)
4021  return rb_str_sublen(str, hit - sbeg);
 /* first byte matched but the rest did not: keep looking further left */
4022  searchlen = adjusted - sbeg;
4023  } while (searchlen > 0);
4024 
4025  return -1;
4026 }
4027 #else
/* Portable variant: compares at +s+, then steps back one character at a
 * time until index 0 has been tried. */
4028 static long
4029 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
4030 {
4031  long slen;
4032  char *sbeg, *e, *t;
4033 
4034  sbeg = RSTRING_PTR(str);
4035  e = RSTRING_END(str);
4036  t = RSTRING_PTR(sub);
4037  slen = RSTRING_LEN(sub);
4038 
4039  while (s) {
4040  if (memcmp(s, t, slen) == 0) {
4041  return pos;
4042  }
4043  if (pos == 0) break;
4044  pos--;
4045  s = rb_enc_prev_char(sbeg, s, e, enc);
4046  }
4047 
4048  return -1;
4049 }
4050 #endif
4051 
4052 static long
4053 rb_str_rindex(VALUE str, VALUE sub, long pos)
4054 {
4055  long len, slen;
4056  char *sbeg, *s;
4057  rb_encoding *enc;
4058  int singlebyte;
4059 
4060  enc = rb_enc_check(str, sub);
4061  if (is_broken_string(sub)) return -1;
4062  singlebyte = single_byte_optimizable(str);
4063  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4064  slen = str_strlen(sub, enc); /* rb_enc_check */
4065 
4066  /* substring longer than string */
4067  if (len < slen) return -1;
4068  if (len - pos < slen) pos = len - slen;
4069  if (len == 0) return pos;
4070 
4071  sbeg = RSTRING_PTR(str);
4072 
4073  if (pos == 0) {
4074  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4075  return 0;
4076  else
4077  return -1;
4078  }
4079 
4080  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4081  return str_rindex(str, sub, s, pos, enc);
4082 }
4083 
4084 /*
4085  * call-seq:
4086  * rindex(substring, offset = self.length) -> integer or nil
4087  * rindex(regexp, offset = self.length) -> integer or nil
4088  *
4089  * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4090  * or +nil+ if none found:
4091  *
4092  * 'foo'.rindex('f') # => 0
4093  * 'foo'.rindex('o') # => 2
4094  * 'foo'.rindex('oo') # => 1
4095  * 'foo'.rindex('ooo') # => nil
4096  *
4097  * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4098  * or +nil+ if none found:
4099  *
4100  * 'foo'.rindex(/f/) # => 0
4101  * 'foo'.rindex(/o/) # => 2
4102  * 'foo'.rindex(/oo/) # => 1
4103  * 'foo'.rindex(/ooo/) # => nil
4104  *
4105  * The _last_ match means starting at the possible last position, not
4106  * the last of longest matches.
4107  *
4108  * 'foo'.rindex(/o+/) # => 2
4109  * $~ #=> #<MatchData "o">
4110  *
4111  * To get the last longest match, combine the pattern with a negative
4112  * lookbehind:
4113  *
4114  * 'foo'.rindex(/(?<!o)o+/) # => 1
4115  * $~ #=> #<MatchData "oo">
4116  *
4117  * Or use String#index with a negative lookahead:
4118  *
4119  * 'foo'.index(/o+(?!.*o)/) # => 1
4120  * $~ #=> #<MatchData "oo">
4121  *
4122  * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4123  * string to _end_ the search:
4124  *
4125  * 'foo'.rindex('o', 0) # => nil
4126  * 'foo'.rindex('o', 1) # => 1
4127  * 'foo'.rindex('o', 2) # => 2
4128  * 'foo'.rindex('o', 3) # => 2
4129  *
4130  * If +offset+ is a negative \Integer, the maximum starting position in the
4131  * string to _end_ the search is the sum of the string's length and +offset+:
4132  *
4133  * 'foo'.rindex('o', -1) # => 2
4134  * 'foo'.rindex('o', -2) # => 1
4135  * 'foo'.rindex('o', -3) # => nil
4136  * 'foo'.rindex('o', -4) # => nil
4137  *
4138  * Related: String#index.
4139  */
4140 
4141 static VALUE
4142 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4143 {
4144  VALUE sub;
4145  VALUE vpos;
4146  rb_encoding *enc = STR_ENC_GET(str);
4147  long pos, len = str_strlen(str, enc); /* str's enc */
4148 
4149  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4150  pos = NUM2LONG(vpos);
4151  if (pos < 0) {
4152  pos += len;
4153  if (pos < 0) {
4154  if (RB_TYPE_P(sub, T_REGEXP)) {
4156  }
4157  return Qnil;
4158  }
4159  }
4160  if (pos > len) pos = len;
4161  }
4162  else {
4163  pos = len;
4164  }
4165 
4166  if (RB_TYPE_P(sub, T_REGEXP)) {
4167  /* enc = rb_get_check(str, sub); */
4168  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4169  enc, single_byte_optimizable(str));
4170 
4171  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4172  VALUE match = rb_backref_get();
4173  struct re_registers *regs = RMATCH_REGS(match);
4174  pos = rb_str_sublen(str, BEG(0));
4175  return LONG2NUM(pos);
4176  }
4177  }
4178  else {
4179  StringValue(sub);
4180  pos = rb_str_rindex(str, sub, pos);
4181  if (pos >= 0) return LONG2NUM(pos);
4182  }
4183  return Qnil;
4184 }
4185 
4186 /*
4187  * call-seq:
4188  * string =~ regexp -> integer or nil
4189  * string =~ object -> integer or nil
4190  *
4191  * Returns the \Integer index of the first substring that matches
4192  * the given +regexp+, or +nil+ if no match found:
4193  *
4194  * 'foo' =~ /f/ # => 0
4195  * 'foo' =~ /o/ # => 1
4196  * 'foo' =~ /x/ # => nil
4197  *
4198  * Note: also updates
4199  * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4200  *
4201  * If the given +object+ is not a \Regexp, returns the value
4202  * returned by <tt>object =~ self</tt>.
4203  *
4204  * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4205  * (see {Regexp#=~}[https://ruby-doc.org/core-2.7.1/Regexp.html#method-i-3D-7E]):
4206  *
4207  * number= nil
4208  * "no. 9" =~ /(?<number>\d+)/
4209  * number # => nil (not assigned)
4210  * /(?<number>\d+)/ =~ "no. 9"
4211  * number #=> "9"
4212  *
4213  */
4214 
4215 static VALUE
4216 rb_str_match(VALUE x, VALUE y)
4217 {
4218  switch (OBJ_BUILTIN_TYPE(y)) {
4219  case T_STRING:
4220  rb_raise(rb_eTypeError, "type mismatch: String given");
4221 
4222  case T_REGEXP:
4223  return rb_reg_match(y, x);
4224 
4225  default:
4226  return rb_funcall(y, idEqTilde, 1, x);
4227  }
4228 }
4229 
4230 
4231 static VALUE get_pat(VALUE);
4232 
4233 
4234 /*
4235  * call-seq:
4236  * match(pattern, offset = 0) -> matchdata or nil
4237  * match(pattern, offset = 0) {|matchdata| ... } -> object
4238  *
4239  * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4240  *
4241  * Note: also updates
4242  * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4243  *
4244  * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4245  * regexp = Regexp.new(pattern)
4246  * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4247  * (see Regexp#match):
4248  * matchdata = regexp.match(self)
4249  *
4250  * With no block given, returns the computed +matchdata+:
4251  *
4252  * 'foo'.match('f') # => #<MatchData "f">
4253  * 'foo'.match('o') # => #<MatchData "o">
4254  * 'foo'.match('x') # => nil
4255  *
4256  * If \Integer argument +offset+ is given, the search begins at index +offset+:
4257  *
4258  * 'foo'.match('f', 1) # => nil
4259  * 'foo'.match('o', 1) # => #<MatchData "o">
4260  *
4261  * With a block given, calls the block with the computed +matchdata+
4262  * and returns the block's return value:
4263  *
4264  * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4265  * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4266  * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4267  *
4268  */
4269 
4270 static VALUE
4271 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4272 {
4273  VALUE re, result;
4274  if (argc < 1)
4275  rb_check_arity(argc, 1, 2);
4276  re = argv[0];
4277  argv[0] = str;
4278  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4279  if (!NIL_P(result) && rb_block_given_p()) {
4280  return rb_yield(result);
4281  }
4282  return result;
4283 }
4284 
4285 /*
4286  * call-seq:
4287  * match?(pattern, offset = 0) -> true or false
4288  *
4289  * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4290  *
4291  * Note: does not update
4292  * {Regexp-related global variables}[Regexp.html#class-Regexp-label-Special+global+variables].
4293  *
4294  * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4295  * regexp = Regexp.new(pattern)
4296  *
4297  * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4298  * +false+ otherwise:
4299  *
4300  * 'foo'.match?(/o/) # => true
4301  * 'foo'.match?('o') # => true
4302  * 'foo'.match?(/x/) # => false
4303  *
4304  * If \Integer argument +offset+ is given, the search begins at index +offset+:
4305  * 'foo'.match?('f', 1) # => false
4306  * 'foo'.match?('o', 1) # => true
4307  *
4308  */
4309 
4310 static VALUE
4311 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4312 {
4313  VALUE re;
4314  rb_check_arity(argc, 1, 2);
4315  re = get_pat(argv[0]);
4316  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4317 }
4318 
/* Outcome of stepping a character to its successor or predecessor
 * (see enc_succ_char(), enc_pred_char() and enc_succ_alnum_char()). */
4319 enum neighbor_char {
 /* the input was not a valid character, or no neighbor could be formed */
4320  NEIGHBOR_NOT_CHAR,
 /* the buffer now holds a valid neighbor of the same byte length */
4321  NEIGHBOR_FOUND,
 /* stepping ran off the end of the range; the caller has to carry */
4322  NEIGHBOR_WRAPPED
4323 };
4324 
/*
 * Replaces the len-byte character at +p+ with the next character of the
 * same byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when a valid successor was written,
 * NEIGHBOR_WRAPPED when no successor of that length exists, and
 * NEIGHBOR_NOT_CHAR when (in the wide-character path) the input or the
 * result is not a valid character.
 */
4325 static enum neighbor_char
4326 enc_succ_char(char *p, long len, rb_encoding *enc)
4327 {
4328  long i;
4329  int l;
4330 
4331  if (rb_enc_mbminlen(enc) > 1) {
4332  /* wchar, trivial case */
4333  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4334  if (!MBCLEN_CHARFOUND_P(r)) {
4335  return NEIGHBOR_NOT_CHAR;
4336  }
 /* go through the code point: next code point, then re-encode */
4337  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4338  l = rb_enc_code_to_mbclen(c, enc);
4339  if (!l) return NEIGHBOR_NOT_CHAR;
 /* successor needs a different number of bytes; p is left untouched */
4340  if (l != len) return NEIGHBOR_WRAPPED;
4341  rb_enc_mbcput(c, p, enc);
4342  r = rb_enc_precise_mbclen(p, p + len, enc);
4343  if (!MBCLEN_CHARFOUND_P(r)) {
4344  return NEIGHBOR_NOT_CHAR;
4345  }
4346  return NEIGHBOR_FOUND;
4347  }
 /* byte-wise increment with carry, repeated until the bytes form a
  * valid character of exactly len bytes */
4348  while (1) {
 /* trailing 0xff bytes roll over to 0x00 */
4349  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4350  p[i] = '\0';
 /* every byte was 0xff: nothing left to increment */
4351  if (i < 0)
4352  return NEIGHBOR_WRAPPED;
4353  ++((unsigned char*)p)[i];
4354  l = rb_enc_precise_mbclen(p, p+len, enc);
4355  if (MBCLEN_CHARFOUND_P(l)) {
4356  l = MBCLEN_CHARFOUND_LEN(l);
4357  if (l == len) {
4358  return NEIGHBOR_FOUND;
4359  }
4360  else {
 /* a valid but shorter character: fill the tail with 0xff so the
  * next pass rolls it over and carries into the preceding byte */
4361  memset(p+l, 0xff, len-l);
4362  }
4363  }
4364  if (MBCLEN_INVALID_P(l) && i < len-1) {
4365  long len2;
4366  int l2;
 /* find the longest prefix length that is not reported invalid and
  * fill the bytes after index len2 with 0xff before retrying */
4367  for (len2 = len-1; 0 < len2; len2--) {
4368  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4369  if (!MBCLEN_INVALID_P(l2))
4370  break;
4371  }
4372  memset(p+len2+1, 0xff, len-(len2+1));
4373  }
4374  }
4375 }
4376 
/*
 * Replaces the len-byte character at +p+ with the previous character of
 * the same byte length in +enc+; the mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when a valid predecessor was written,
 * NEIGHBOR_WRAPPED when no predecessor of that length exists, and
 * NEIGHBOR_NOT_CHAR when (in the wide-character path) the input is
 * invalid, is code point 0, or the result is not a valid character.
 */
4377 static enum neighbor_char
4378 enc_pred_char(char *p, long len, rb_encoding *enc)
4379 {
4380  long i;
4381  int l;
4382  if (rb_enc_mbminlen(enc) > 1) {
4383  /* wchar, trivial case */
4384  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4385  if (!MBCLEN_CHARFOUND_P(r)) {
4386  return NEIGHBOR_NOT_CHAR;
4387  }
4388  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
 /* code point 0 has no predecessor */
4389  if (!c) return NEIGHBOR_NOT_CHAR;
4390  --c;
4391  l = rb_enc_code_to_mbclen(c, enc);
4392  if (!l) return NEIGHBOR_NOT_CHAR;
 /* predecessor needs a different number of bytes; p is left untouched */
4393  if (l != len) return NEIGHBOR_WRAPPED;
4394  rb_enc_mbcput(c, p, enc);
4395  r = rb_enc_precise_mbclen(p, p + len, enc);
4396  if (!MBCLEN_CHARFOUND_P(r)) {
4397  return NEIGHBOR_NOT_CHAR;
4398  }
4399  return NEIGHBOR_FOUND;
4400  }
 /* byte-wise decrement with borrow, repeated until the bytes form a
  * valid character of exactly len bytes */
4401  while (1) {
 /* trailing 0x00 bytes roll under to 0xff */
4402  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4403  p[i] = '\xff';
 /* every byte was 0x00: nothing left to decrement */
4404  if (i < 0)
4405  return NEIGHBOR_WRAPPED;
4406  --((unsigned char*)p)[i];
4407  l = rb_enc_precise_mbclen(p, p+len, enc);
4408  if (MBCLEN_CHARFOUND_P(l)) {
4409  l = MBCLEN_CHARFOUND_LEN(l);
4410  if (l == len) {
4411  return NEIGHBOR_FOUND;
4412  }
4413  else {
 /* a valid but shorter character: zero the tail so the next pass
  * rolls it under and borrows from the preceding byte */
4414  memset(p+l, 0, len-l);
4415  }
4416  }
4417  if (MBCLEN_INVALID_P(l) && i < len-1) {
4418  long len2;
4419  int l2;
 /* find the longest prefix length that is not reported invalid and
  * zero the bytes after index len2 before retrying */
4420  for (len2 = len-1; 0 < len2; len2--) {
4421  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4422  if (!MBCLEN_INVALID_P(l2))
4423  break;
4424  }
4425  memset(p+len2+1, 0, len-(len2+1));
4426  }
4427  }
4428 }
4429 
4430 /*
4431  overwrite +p+ by succeeding letter in +enc+ and returns
4432  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4433  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4434  assuming each ranges are successive, and mbclen
4435  never change in each ranges.
4436  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4437  character.
4438  */
4439 static enum neighbor_char
4440 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4441 {
4442  enum neighbor_char ret;
4443  unsigned int c;
4444  int ctype;
4445  int range;
4446  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4447 
4448  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4449  int try;
4450  const int max_gaps = 1;
4451 
 /* classify the character: only digits and letters are handled here */
4452  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4453  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4454  ctype = ONIGENC_CTYPE_DIGIT;
4455  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4456  ctype = ONIGENC_CTYPE_ALPHA;
4457  else
4458  return NEIGHBOR_NOT_CHAR;
4459 
 /* step forward up to max_gaps + 1 times (p is not restored between
  * tries) looking for a successor of the same class */
4460  MEMCPY(save, p, char, len);
4461  for (try = 0; try <= max_gaps; ++try) {
4462  ret = enc_succ_char(p, len, enc);
4463  if (ret == NEIGHBOR_FOUND) {
4464  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4465  if (rb_enc_isctype(c, ctype, enc))
4466  return NEIGHBOR_FOUND;
4467  }
4468  }
 /* no successor in this class: restore the character and walk backward
  * to the first character of its run; range counts the run's length */
4469  MEMCPY(p, save, char, len);
4470  range = 1;
4471  while (1) {
4472  MEMCPY(save, p, char, len);
4473  ret = enc_pred_char(p, len, enc);
4474  if (ret == NEIGHBOR_FOUND) {
4475  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4476  if (!rb_enc_isctype(c, ctype, enc)) {
 /* stepped out of the class: undo the last step and stop */
4477  MEMCPY(p, save, char, len);
4478  break;
4479  }
4480  }
4481  else {
4482  MEMCPY(p, save, char, len);
4483  break;
4484  }
4485  range++;
4486  }
 /* a run of a single character cannot wrap around */
4487  if (range == 1) {
4488  return NEIGHBOR_NOT_CHAR;
4489  }
4490 
 /* p now holds the first character of the run.  For letters the carry
  * is that same first character ... */
4491  if (ctype != ONIGENC_CTYPE_DIGIT) {
4492  MEMCPY(carry, p, char, len);
4493  return NEIGHBOR_WRAPPED;
4494  }
4495 
 /* ... for digits it is the successor of the first character (with
  * ASCII digits p becomes '0' and carry becomes '1') */
4496  MEMCPY(carry, p, char, len);
4497  enc_succ_char(carry, len, enc);
4498  return NEIGHBOR_WRAPPED;
4499 }
4500 
4501 
4502 static VALUE str_succ(VALUE str);
4503 
4504 /*
4505  * call-seq:
4506  * succ -> new_str
4507  *
4508  * Returns the successor to +self+. The successor is calculated by
4509  * incrementing characters.
4510  *
4511  * The first character to be incremented is the rightmost alphanumeric,
4512  * or, if there are no alphanumerics, the rightmost character:
4513  *
4514  * 'THX1138'.succ # => "THX1139"
4515  * '<<koala>>'.succ # => "<<koalb>>"
4516  * '***'.succ # => '**+'
4517  *
4518  * The successor to a digit is another digit, "carrying" to the next-left
4519  * character for a "rollover" from 9 to 0, and prepending another digit
4520  * if necessary:
4521  *
4522  * '00'.succ # => "01"
4523  * '09'.succ # => "10"
4524  * '99'.succ # => "100"
4525  *
4526  * The successor to a letter is another letter of the same case,
4527  * carrying to the next-left character for a rollover,
4528  * and prepending another same-case letter if necessary:
4529  *
4530  * 'aa'.succ # => "ab"
4531  * 'az'.succ # => "ba"
4532  * 'zz'.succ # => "aaa"
4533  * 'AA'.succ # => "AB"
4534  * 'AZ'.succ # => "BA"
4535  * 'ZZ'.succ # => "AAA"
4536  *
4537  * The successor to a non-alphanumeric character is the next character
4538  * in the underlying character set's collating sequence,
4539  * carrying to the next-left character for a rollover,
4540  * and prepending another character if necessary:
4541  *
4542  * s = 0.chr * 3
4543  * s # => "\x00\x00\x00"
4544  * s.succ # => "\x00\x00\x01"
4545  * s = 255.chr * 3
4546  * s # => "\xFF\xFF\xFF"
4547  * s.succ # => "\x01\x00\x00\x00"
4548  *
4549  * Carrying can occur between and among mixtures of alphanumeric characters:
4550  *
4551  * s = 'zz99zz99'
4552  * s.succ # => "aaa00aa00"
4553  * s = '99zz99zz'
4554  * s.succ # => "100aa00aa"
4555  *
4556  * The successor to an empty \String is a new empty \String:
4557  *
4558  * ''.succ # => ""
4559  *
4560  * String#next is an alias for String#succ.
4561  */
4562 
4563 VALUE
4565 {
4566  VALUE str;
4567  str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4568  rb_enc_cr_str_copy_for_substr(str, orig);
4569  return str_succ(str);
4570 }
4571 
4572 static VALUE
4573 str_succ(VALUE str)
4574 {
4575  rb_encoding *enc;
4576  char *sbeg, *s, *e, *last_alnum = 0;
4577  int found_alnum = 0;
4578  long l, slen;
4579  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4580  long carry_pos = 0, carry_len = 1;
4581  enum neighbor_char neighbor = NEIGHBOR_FOUND;
4582 
4583  slen = RSTRING_LEN(str);
4584  if (slen == 0) return str;
4585 
4586  enc = STR_ENC_GET(str);
4587  sbeg = RSTRING_PTR(str);
4588  s = e = sbeg + slen;
4589 
4590  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4591  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4592  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4593  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4594  break;
4595  }
4596  }
4597  l = rb_enc_precise_mbclen(s, e, enc);
4598  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4599  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4600  neighbor = enc_succ_alnum_char(s, l, enc, carry);
4601  switch (neighbor) {
4602  case NEIGHBOR_NOT_CHAR:
4603  continue;
4604  case NEIGHBOR_FOUND:
4605  return str;
4606  case NEIGHBOR_WRAPPED:
4607  last_alnum = s;
4608  break;
4609  }
4610  found_alnum = 1;
4611  carry_pos = s - sbeg;
4612  carry_len = l;
4613  }
4614  if (!found_alnum) { /* str contains no alnum */
4615  s = e;
4616  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4617  enum neighbor_char neighbor;
4618  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4619  l = rb_enc_precise_mbclen(s, e, enc);
4620  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4621  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4622  MEMCPY(tmp, s, char, l);
4623  neighbor = enc_succ_char(tmp, l, enc);
4624  switch (neighbor) {
4625  case NEIGHBOR_FOUND:
4626  MEMCPY(s, tmp, char, l);
4627  return str;
4628  break;
4629  case NEIGHBOR_WRAPPED:
4630  MEMCPY(s, tmp, char, l);
4631  break;
4632  case NEIGHBOR_NOT_CHAR:
4633  break;
4634  }
4635  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4636  /* wrapped to \0...\0. search next valid char. */
4637  enc_succ_char(s, l, enc);
4638  }
4639  if (!rb_enc_asciicompat(enc)) {
4640  MEMCPY(carry, s, char, l);
4641  carry_len = l;
4642  }
4643  carry_pos = s - sbeg;
4644  }
4646  }
4647  RESIZE_CAPA(str, slen + carry_len);
4648  sbeg = RSTRING_PTR(str);
4649  s = sbeg + carry_pos;
4650  memmove(s + carry_len, s, slen - carry_pos);
4651  memmove(s, carry, carry_len);
4652  slen += carry_len;
4653  STR_SET_LEN(str, slen);
4654  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4655  rb_enc_str_coderange(str);
4656  return str;
4657 }
4658 
4659 
4660 /*
4661  * call-seq:
4662  * succ! -> self
4663  *
4664  * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4665  *
4666  * String#next! is an alias for String#succ!.
4667  */
4668 
4669 static VALUE
4670 rb_str_succ_bang(VALUE str)
4671 {
4672  rb_str_modify(str);
4673  str_succ(str);
4674  return str;
4675 }
4676 
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (also when +len+ is zero or negative), 0 as soon as one does not.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4686 
4687 static int
4688 str_upto_i(VALUE str, VALUE arg)
4689 {
4690  rb_yield(str);
4691  return 0;
4692 }
4693 
4694 /*
4695  * call-seq:
4696  * upto(other_string, exclusive = false) {|string| ... } -> self
4697  * upto(other_string, exclusive = false) -> new_enumerator
4698  *
4699  * With a block given, calls the block with each \String value
4700  * returned by successive calls to String#succ;
4701  * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4702  * the sequence terminates when value +other_string+ is reached;
4703  * returns +self+:
4704  *
4705  * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4706  * Output:
4707  *
4708  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4709  *
4710  * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4711  *
4712  * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4713  *
4714  * Output:
4715  *
4716  * a8 a9 b0 b1 b2 b3 b4 b5
4717  *
4718  * If +other_string+ would not be reached, does not call the block:
4719  *
4720  * '25'.upto('5') {|s| fail s }
4721  * 'aa'.upto('a') {|s| fail s }
4722  *
4723  * With no block given, returns a new \Enumerator:
4724  *
4725  * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4726  *
4727  */
4728 
4729 static VALUE
4730 rb_str_upto(int argc, VALUE *argv, VALUE beg)
4731 {
4732  VALUE end, exclusive;
4733 
4734  rb_scan_args(argc, argv, "11", &end, &exclusive);
4735  RETURN_ENUMERATOR(beg, argc, argv);
4736  return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4737 }
4738 
/*
 * Calls +each+ with every string from +beg+ up to +end+ and with +arg+ as
 * the second argument; iteration stops early when +each+ returns
 * non-zero.  +end+ itself is skipped when +excl+ is non-zero.  Returns
 * +beg+.
 *
 * Three strategies are used, tried in this order:
 * - both ends are single ASCII characters: step through the byte values;
 * - both ends consist of ASCII digits only: count numerically and format
 *   each value with the width of +beg+;
 * - otherwise: call the +succ+ method repeatedly.
 *
 * Raises through StringValue()/rb_enc_check() when +end+ is not
 * string-like or the encodings are incompatible.
 */
4739 VALUE
4740 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4741 {
4742  VALUE current, after_end;
4743  ID succ;
4744  int n, ascii;
4745  rb_encoding *enc;
4746 
4747  CONST_ID(succ, "succ");
4748  StringValue(end);
4749  enc = rb_enc_check(beg, end);
4750  ascii = (is_ascii_string(beg) && is_ascii_string(end));
4751  /* single character */
4752  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4753  char c = RSTRING_PTR(beg)[0];
4754  char e = RSTRING_PTR(end)[0];
4755 
 /* empty range: nothing to yield */
4756  if (c > e || (excl && c == e)) return beg;
4757  for (;;) {
4758  if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4759  if (!excl && c == e) break;
4760  c++;
4761  if (excl && c == e) break;
4762  }
4763  return beg;
4764  }
4765  /* both edges are all digits */
4766  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4767  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4768  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4769  VALUE b, e;
4770  int width;
4771 
 /* width of beg is used as the precision when formatting each value */
4772  width = RSTRING_LENINT(beg);
4773  b = rb_str_to_inum(beg, 10, FALSE);
4774  e = rb_str_to_inum(end, 10, FALSE);
4775  if (FIXNUM_P(b) && FIXNUM_P(e)) {
 /* both bounds fit in a Fixnum: iterate with C longs */
4776  long bi = FIX2LONG(b);
4777  long ei = FIX2LONG(e);
4778  rb_encoding *usascii = rb_usascii_encoding();
4779 
4780  while (bi <= ei) {
4781  if (excl && bi == ei) break;
4782  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4783  bi++;
4784  }
4785  }
4786  else {
 /* at least one bound is not a Fixnum: compare and step by calling
  * methods on the Integer objects */
4787  ID op = excl ? '<' : idLE;
4788  VALUE args[2], fmt = rb_fstring_lit("%.*d");
4789 
4790  args[0] = INT2FIX(width);
4791  while (rb_funcall(b, op, 1, e)) {
4792  args[1] = b;
4793  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4794  b = rb_funcallv(b, succ, 0, 0);
4795  }
4796  }
4797  return beg;
4798  }
4799  /* normal case */
4800  n = rb_str_cmp(beg, end);
4801  if (n > 0 || (excl && n == 0)) return beg;
4802 
 /* iterate on a copy of beg until the value just past end is reached */
4803  after_end = rb_funcallv(end, succ, 0, 0);
4804  current = str_duplicate(rb_cString, beg);
4805  while (!rb_str_equal(current, after_end)) {
4806  VALUE next = Qnil;
 /* the successor is computed before each is called; it stays nil once
  * end has been reached in the inclusive case */
4807  if (excl || !rb_str_equal(current, end))
4808  next = rb_funcallv(current, succ, 0, 0);
4809  if ((*each)(current, arg)) break;
4810  if (NIL_P(next)) break;
4811  current = next;
4812  StringValue(current);
4813  if (excl && rb_str_equal(current, end)) break;
 /* stop once the value is longer than end or has become empty */
4814  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4815  break;
4816  }
4817 
4818  return beg;
4819 }
4820 
4821 VALUE
4822 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
4823 {
4824  VALUE current;
4825  ID succ;
4826 
4827  CONST_ID(succ, "succ");
4828  /* both edges are all digits */
4829  if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
4830  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
4831  VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
4832  int width = RSTRING_LENINT(beg);
4833  b = rb_str_to_inum(beg, 10, FALSE);
4834  if (FIXNUM_P(b)) {
4835  long bi = FIX2LONG(b);
4836  rb_encoding *usascii = rb_usascii_encoding();
4837 
4838  while (FIXABLE(bi)) {
4839  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4840  bi++;
4841  }
4842  b = LONG2NUM(bi);
4843  }
4844  args[0] = INT2FIX(width);
4845  while (1) {
4846  args[1] = b;
4847  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4848  b = rb_funcallv(b, succ, 0, 0);
4849  }
4850  }
4851  /* normal case */
4852  current = str_duplicate(rb_cString, beg);
4853  while (1) {
4854  VALUE next = rb_funcallv(current, succ, 0, 0);
4855  if ((*each)(current, arg)) break;
4856  current = next;
4857  StringValue(current);
4858  if (RSTRING_LEN(current) == 0)
4859  break;
4860  }
4861 
4862  return beg;
4863 }
4864 
4865 static int
4866 include_range_i(VALUE str, VALUE arg)
4867 {
4868  VALUE *argp = (VALUE *)arg;
4869  if (!rb_equal(str, *argp)) return 0;
4870  *argp = Qnil;
4871  return 1;
4872 }
4873 
4874 VALUE
4875 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
4876 {
4877  beg = rb_str_new_frozen(beg);
4878  StringValue(end);
4879  end = rb_str_new_frozen(end);
4880  if (NIL_P(val)) return Qfalse;
4881  val = rb_check_string_type(val);
4882  if (NIL_P(val)) return Qfalse;
4883  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4884  rb_enc_asciicompat(STR_ENC_GET(end)) &&
4885  rb_enc_asciicompat(STR_ENC_GET(val))) {
4886  const char *bp = RSTRING_PTR(beg);
4887  const char *ep = RSTRING_PTR(end);
4888  const char *vp = RSTRING_PTR(val);
4889  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4890  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4891  return Qfalse;
4892  else {
4893  char b = *bp;
4894  char e = *ep;
4895  char v = *vp;
4896 
4897  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4898  if (b <= v && v < e) return Qtrue;
4899  return RBOOL(!RTEST(exclusive) && v == e);
4900  }
4901  }
4902  }
4903 #if 0
4904  /* both edges are all digits */
4905  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4906  all_digits_p(bp, RSTRING_LEN(beg)) &&
4907  all_digits_p(ep, RSTRING_LEN(end))) {
4908  /* TODO */
4909  }
4910 #endif
4911  }
4912  rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4913 
4914  return RBOOL(NIL_P(val));
4915 }
4916 
4917 static VALUE
4918 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4919 {
4920  if (rb_reg_search(re, str, 0, 0) >= 0) {
4921  VALUE match = rb_backref_get();
4922  int nth = rb_reg_backref_number(match, backref);
4923  return rb_reg_nth_match(nth, match);
4924  }
4925  return Qnil;
4926 }
4927 
4928 static VALUE
4929 rb_str_aref(VALUE str, VALUE indx)
4930 {
4931  long idx;
4932 
4933  if (FIXNUM_P(indx)) {
4934  idx = FIX2LONG(indx);
4935  }
4936  else if (RB_TYPE_P(indx, T_REGEXP)) {
4937  return rb_str_subpat(str, indx, INT2FIX(0));
4938  }
4939  else if (RB_TYPE_P(indx, T_STRING)) {
4940  if (rb_str_index(str, indx, 0) != -1)
4941  return str_duplicate(rb_cString, indx);
4942  return Qnil;
4943  }
4944  else {
4945  /* check if indx is Range */
4946  long beg, len = str_strlen(str, NULL);
4947  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4948  case Qfalse:
4949  break;
4950  case Qnil:
4951  return Qnil;
4952  default:
4953  return rb_str_substr(str, beg, len);
4954  }
4955  idx = NUM2LONG(indx);
4956  }
4957 
4958  return str_substr(str, idx, 1, FALSE);
4959 }
4960 
4961 
4962 /*
4963  * call-seq:
4964  * string[index] -> new_string or nil
4965  * string[start, length] -> new_string or nil
4966  * string[range] -> new_string or nil
4967  * string[regexp, capture = 0] -> new_string or nil
4968  * string[substring] -> new_string or nil
4969  *
4970  * Returns the substring of +self+ specified by the arguments.
4971  *
4972  * When the single \Integer argument +index+ is given,
4973  * returns the 1-character substring found in +self+ at offset +index+:
4974  *
4975  * 'bar'[2] # => "r"
4976  *
4977  * Counts backward from the end of +self+ if +index+ is negative:
4978  *
4979  * 'foo'[-3] # => "f"
4980  *
4981  * Returns +nil+ if +index+ is out of range:
4982  *
4983  * 'foo'[3] # => nil
4984  * 'foo'[-4] # => nil
4985  *
4986  * When the two \Integer arguments +start+ and +length+ are given,
4987  * returns the substring of the given +length+ found in +self+ at offset +start+:
4988  *
4989  * 'foo'[0, 2] # => "fo"
4990  * 'foo'[0, 0] # => ""
4991  *
4992  * Counts backward from the end of +self+ if +start+ is negative:
4993  *
4994  * 'foo'[-2, 2] # => "oo"
4995  *
4996  * Special case: returns a new empty \String if +start+ is equal to the length of +self+:
4997  *
4998  * 'foo'[3, 2] # => ""
4999  *
5000  * Returns +nil+ if +start+ is out of range:
5001  *
5002  * 'foo'[4, 2] # => nil
5003  * 'foo'[-4, 2] # => nil
5004  *
5005  * Returns the trailing substring of +self+ if +length+ is large:
5006  *
5007  * 'foo'[1, 50] # => "oo"
5008  *
5009  * Returns +nil+ if +length+ is negative:
5010  *
5011  * 'foo'[0, -1] # => nil
5012  *
5013  * When the single \Range argument +range+ is given,
5014  * derives +start+ and +length+ values from the given +range+,
5015  * and returns values as above:
5016  *
5017  * - <tt>'foo'[0..1]</tt> is equivalent to <tt>'foo'[0, 2]</tt>.
5018  * - <tt>'foo'[0...1]</tt> is equivalent to <tt>'foo'[0, 1]</tt>.
5019  *
5020  * When the \Regexp argument +regexp+ is given,
5021  * and the +capture+ argument is <tt>0</tt>,
5022  * returns the first matching substring found in +self+,
5023  * or +nil+ if none found:
5024  *
5025  * 'foo'[/o/] # => "o"
5026  * 'foo'[/x/] # => nil
5027  * s = 'hello there'
5028  * s[/[aeiou](.)\1/] # => "ell"
5029  * s[/[aeiou](.)\1/, 0] # => "ell"
5030  *
5031  * If argument +capture+ is given and not <tt>0</tt>,
5032  * it should be either an \Integer capture group index or a \String or \Symbol capture group name;
5033  * the method call returns only the specified capture
5034  * (see {Regexp Capturing}[Regexp.html#class-Regexp-label-Capturing]):
5035  *
5036  * s = 'hello there'
5037  * s[/[aeiou](.)\1/, 1] # => "l"
5038  * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] # => "l"
5039  * s[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, :vowel] # => "e"
5040  *
5041  * If an invalid capture group index is given, +nil+ is returned. If an invalid
5042  * capture group name is given, +IndexError+ is raised.
5043  *
5044  * When the single \String argument +substring+ is given,
5045  * returns the substring from +self+ if found, otherwise +nil+:
5046  *
5047  * 'foo'['oo'] # => "oo"
5048  * 'foo'['xx'] # => nil
5049  *
5050  * String#slice is an alias for String#[].
5051  */
5052 
5053 static VALUE
5054 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5055 {
5056  if (argc == 2) {
5057  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5058  return rb_str_subpat(str, argv[0], argv[1]);
5059  }
5060  else {
5061  long beg = NUM2LONG(argv[0]);
5062  long len = NUM2LONG(argv[1]);
5063  return rb_str_substr(str, beg, len);
5064  }
5065  }
5066  rb_check_arity(argc, 1, 2);
5067  return rb_str_aref(str, argv[0]);
5068 }
5069 
5070 VALUE
5071 rb_str_drop_bytes(VALUE str, long len)
5072 {
5073  char *ptr = RSTRING_PTR(str);
5074  long olen = RSTRING_LEN(str), nlen;
5075 
5076  str_modifiable(str);
5077  if (len > olen) len = olen;
5078  nlen = olen - len;
5079  if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5080  char *oldptr = ptr;
5081  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5082  STR_SET_EMBED(str);
5083  STR_SET_EMBED_LEN(str, nlen);
5084  ptr = RSTRING(str)->as.embed.ary;
5085  memmove(ptr, oldptr + len, nlen);
5086  if (fl == STR_NOEMBED) xfree(oldptr);
5087  }
5088  else {
5089  if (!STR_SHARED_P(str)) {
5090  VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5091  rb_enc_cr_str_exact_copy(shared, str);
5092  OBJ_FREEZE(shared);
5093  }
5094  ptr = RSTRING(str)->as.heap.ptr += len;
5095  RSTRING(str)->as.heap.len = nlen;
5096  }
5097  ptr[nlen] = 0;
5098  ENC_CODERANGE_CLEAR(str);
5099  return str;
5100 }
5101 
5102 static void
5103 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5104 {
5105  char *sptr;
5106  long slen, vlen = RSTRING_LEN(val);
5107  int cr;
5108 
5109  if (beg == 0 && vlen == 0) {
5110  rb_str_drop_bytes(str, len);
5111  return;
5112  }
5113 
5114  str_modify_keep_cr(str);
5115  RSTRING_GETMEM(str, sptr, slen);
5116  if (len < vlen) {
5117  /* expand string */
5118  RESIZE_CAPA(str, slen + vlen - len);
5119  sptr = RSTRING_PTR(str);
5120  }
5121 
5122  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5123  cr = rb_enc_str_coderange(val);
5124  else
5125  cr = ENC_CODERANGE_UNKNOWN;
5126 
5127  if (vlen != len) {
5128  memmove(sptr + beg + vlen,
5129  sptr + beg + len,
5130  slen - (beg + len));
5131  }
5132  if (vlen < beg && len < 0) {
5133  MEMZERO(sptr + slen, char, -len);
5134  }
5135  if (vlen > 0) {
5136  memmove(sptr + beg, RSTRING_PTR(val), vlen);
5137  }
5138  slen += vlen - len;
5139  STR_SET_LEN(str, slen);
5140  TERM_FILL(&sptr[slen], TERM_LEN(str));
5141  ENC_CODERANGE_SET(str, cr);
5142 }
5143 
5144 void
5145 rb_str_update(VALUE str, long beg, long len, VALUE val)
5146 {
5147  long slen;
5148  char *p, *e;
5149  rb_encoding *enc;
5150  int singlebyte = single_byte_optimizable(str);
5151  int cr;
5152 
5153  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5154 
5155  StringValue(val);
5156  enc = rb_enc_check(str, val);
5157  slen = str_strlen(str, enc); /* rb_enc_check */
5158 
5159  if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5160  rb_raise(rb_eIndexError, "index %ld out of string", beg);
5161  }
5162  if (beg < 0) {
5163  beg += slen;
5164  }
5165  assert(beg >= 0);
5166  assert(beg <= slen);
5167  if (len > slen - beg) {
5168  len = slen - beg;
5169  }
5170  str_modify_keep_cr(str);
5171  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5172  if (!p) p = RSTRING_END(str);
5173  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5174  if (!e) e = RSTRING_END(str);
5175  /* error check */
5176  beg = p - RSTRING_PTR(str); /* physical position */
5177  len = e - p; /* physical length */
5178  rb_str_splice_0(str, beg, len, val);
5179  rb_enc_associate(str, enc);
5181  if (cr != ENC_CODERANGE_BROKEN)
5182  ENC_CODERANGE_SET(str, cr);
5183 }
5184 
/* File-local alias: splicing is exactly rb_str_update(). */
#define rb_str_splice(str, beg, len, val) rb_str_update((str), (beg), (len), (val))
5186 
5187 static void
5188 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5189 {
5190  int nth;
5191  VALUE match;
5192  long start, end, len;
5193  rb_encoding *enc;
5194  struct re_registers *regs;
5195 
5196  if (rb_reg_search(re, str, 0, 0) < 0) {
5197  rb_raise(rb_eIndexError, "regexp not matched");
5198  }
5199  match = rb_backref_get();
5200  nth = rb_reg_backref_number(match, backref);
5201  regs = RMATCH_REGS(match);
5202  if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5203  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5204  }
5205  if (nth < 0) {
5206  nth += regs->num_regs;
5207  }
5208 
5209  start = BEG(nth);
5210  if (start == -1) {
5211  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5212  }
5213  end = END(nth);
5214  len = end - start;
5215  StringValue(val);
5216  enc = rb_enc_check_str(str, val);
5217  rb_str_splice_0(str, start, len, val);
5218  rb_enc_associate(str, enc);
5219 }
5220 
5221 static VALUE
5222 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5223 {
5224  long idx, beg;
5225 
5226  switch (TYPE(indx)) {
5227  case T_REGEXP:
5228  rb_str_subpat_set(str, indx, INT2FIX(0), val);
5229  return val;
5230 
5231  case T_STRING:
5232  beg = rb_str_index(str, indx, 0);
5233  if (beg < 0) {
5234  rb_raise(rb_eIndexError, "string not matched");
5235  }
5236  beg = rb_str_sublen(str, beg);
5237  rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5238  return val;
5239 
5240  default:
5241  /* check if indx is Range */
5242  {
5243  long beg, len;
5244  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5245  rb_str_splice(str, beg, len, val);
5246  return val;
5247  }
5248  }
5249  /* FALLTHROUGH */
5250 
5251  case T_FIXNUM:
5252  idx = NUM2LONG(indx);
5253  rb_str_splice(str, idx, 1, val);
5254  return val;
5255  }
5256 }
5257 
5258 /*
5259  * call-seq:
5260  * str[integer] = new_str
5261  * str[integer, integer] = new_str
5262  * str[range] = aString
5263  * str[regexp] = new_str
5264  * str[regexp, integer] = new_str
5265  * str[regexp, name] = new_str
5266  * str[other_str] = new_str
5267  *
5268  * Element Assignment---Replaces some or all of the content of
5269  * <i>str</i>. The portion of the string affected is determined using
5270  * the same criteria as String#[]. If the replacement string is not
5271  * the same length as the text it is replacing, the string will be
5272  * adjusted accordingly. If the regular expression or string is used
5273  * as the index doesn't match a position in the string, IndexError is
5274  * raised. If the regular expression form is used, the optional
5275  * second Integer allows you to specify which portion of the match to
5276  * replace (effectively using the MatchData indexing rules. The forms
5277  * that take an Integer will raise an IndexError if the value is out
5278  * of range; the Range form will raise a RangeError, and the Regexp
5279  * and String will raise an IndexError on negative match.
5280  */
5281 
5282 static VALUE
5283 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5284 {
5285  if (argc == 3) {
5286  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5287  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5288  }
5289  else {
5290  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5291  }
5292  return argv[2];
5293  }
5294  rb_check_arity(argc, 2, 3);
5295  return rb_str_aset(str, argv[0], argv[1]);
5296 }
5297 
5298 /*
5299  * call-seq:
5300  * insert(index, other_string) -> self
5301  *
5302  * Inserts the given +other_string+ into +self+; returns +self+.
5303  *
5304  * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5305  *
5306  * 'foo'.insert(1, 'bar') # => "fbaroo"
5307  *
5308  * If the \Integer +index+ is negative, counts backward from the end of +self+
5309  * and inserts +other_string+ at offset <tt>index+1</tt>
5310  * (that is, _after_ <tt>self[index]</tt>):
5311  *
5312  * 'foo'.insert(-2, 'bar') # => "fobaro"
5313  *
5314  */
5315 
5316 static VALUE
5317 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5318 {
5319  long pos = NUM2LONG(idx);
5320 
5321  if (pos == -1) {
5322  return rb_str_append(str, str2);
5323  }
5324  else if (pos < 0) {
5325  pos++;
5326  }
5327  rb_str_splice(str, pos, 0, str2);
5328  return str;
5329 }
5330 
5331 
5332 /*
5333  * call-seq:
5334  * slice!(index) -> new_string or nil
5335  * slice!(start, length) -> new_string or nil
5336  * slice!(range) -> new_string or nil
5337  * slice!(regexp, capture = 0) -> new_string or nil
5338  * slice!(substring) -> new_string or nil
5339  *
5340  * Removes the substring of +self+ specified by the arguments;
5341  * returns the removed substring.
5342  *
5343  * See String#[] for details about the arguments that specify the substring.
5344  *
5345  * A few examples:
5346  *
5347  * string = "This is a string"
5348  * string.slice!(2) #=> "i"
5349  * string.slice!(3..6) #=> " is "
5350  * string.slice!(/s.*t/) #=> "sa st"
5351  * string.slice!("r") #=> "r"
5352  * string #=> "Thing"
5353  *
5354  */
5355 
5356 static VALUE
5357 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5358 {
5359  VALUE result = Qnil;
5360  VALUE indx;
5361  long beg, len = 1;
5362  char *p;
5363 
5364  rb_check_arity(argc, 1, 2);
5365  str_modify_keep_cr(str);
5366  indx = argv[0];
5367  if (RB_TYPE_P(indx, T_REGEXP)) {
5368  if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5369  VALUE match = rb_backref_get();
5370  struct re_registers *regs = RMATCH_REGS(match);
5371  int nth = 0;
5372  if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5373  if ((nth += regs->num_regs) <= 0) return Qnil;
5374  }
5375  else if (nth >= regs->num_regs) return Qnil;
5376  beg = BEG(nth);
5377  len = END(nth) - beg;
5378  goto subseq;
5379  }
5380  else if (argc == 2) {
5381  beg = NUM2LONG(indx);
5382  len = NUM2LONG(argv[1]);
5383  goto num_index;
5384  }
5385  else if (FIXNUM_P(indx)) {
5386  beg = FIX2LONG(indx);
5387  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5388  if (!len) return Qnil;
5389  beg = p - RSTRING_PTR(str);
5390  goto subseq;
5391  }
5392  else if (RB_TYPE_P(indx, T_STRING)) {
5393  beg = rb_str_index(str, indx, 0);
5394  if (beg == -1) return Qnil;
5395  len = RSTRING_LEN(indx);
5396  result = str_duplicate(rb_cString, indx);
5397  goto squash;
5398  }
5399  else {
5400  switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5401  case Qnil:
5402  return Qnil;
5403  case Qfalse:
5404  beg = NUM2LONG(indx);
5405  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5406  if (!len) return Qnil;
5407  beg = p - RSTRING_PTR(str);
5408  goto subseq;
5409  default:
5410  goto num_index;
5411  }
5412  }
5413 
5414  num_index:
5415  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5416  beg = p - RSTRING_PTR(str);
5417 
5418  subseq:
5419  result = rb_str_new(RSTRING_PTR(str)+beg, len);
5420  rb_enc_cr_str_copy_for_substr(result, str);
5421 
5422  squash:
5423  if (len > 0) {
5424  if (beg == 0) {
5425  rb_str_drop_bytes(str, len);
5426  }
5427  else {
5428  char *sptr = RSTRING_PTR(str);
5429  long slen = RSTRING_LEN(str);
5430  if (beg + len > slen) /* pathological check */
5431  len = slen - beg;
5432  memmove(sptr + beg,
5433  sptr + beg + len,
5434  slen - (beg + len));
5435  slen -= len;
5436  STR_SET_LEN(str, slen);
5437  TERM_FILL(&sptr[slen], TERM_LEN(str));
5438  }
5439  }
5440  return result;
5441 }
5442 
5443 static VALUE
5444 get_pat(VALUE pat)
5445 {
5446  VALUE val;
5447 
5448  switch (OBJ_BUILTIN_TYPE(pat)) {
5449  case T_REGEXP:
5450  return pat;
5451 
5452  case T_STRING:
5453  break;
5454 
5455  default:
5456  val = rb_check_string_type(pat);
5457  if (NIL_P(val)) {
5458  Check_Type(pat, T_REGEXP);
5459  }
5460  pat = val;
5461  }
5462 
5463  return rb_reg_regcomp(pat);
5464 }
5465 
5466 static VALUE
5467 get_pat_quoted(VALUE pat, int check)
5468 {
5469  VALUE val;
5470 
5471  switch (OBJ_BUILTIN_TYPE(pat)) {
5472  case T_REGEXP:
5473  return pat;
5474 
5475  case T_STRING:
5476  break;
5477 
5478  default:
5479  val = rb_check_string_type(pat);
5480  if (NIL_P(val)) {
5481  Check_Type(pat, T_REGEXP);
5482  }
5483  pat = val;
5484  }
5485  if (check && is_broken_string(pat)) {
5486  rb_exc_raise(rb_reg_check_preprocess(pat));
5487  }
5488  return pat;
5489 }
5490 
5491 static long
5492 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5493 {
5494  if (BUILTIN_TYPE(pat) == T_STRING) {
5495  pos = rb_strseq_index(str, pat, pos, 1);
5496  if (set_backref_str) {
5497  if (pos >= 0) {
5498  str = rb_str_new_frozen_String(str);
5499  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5500  }
5501  else {
5503  }
5504  }
5505  return pos;
5506  }
5507  else {
5508  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5509  }
5510 }
5511 
5512 
5513 /*
5514  * call-seq:
5515  * sub!(pattern, replacement) -> self or nil
5516  * sub!(pattern) {|match| ... } -> self or nil
5517  *
5518  * Returns +self+ with only the first occurrence
5519  * (not all occurrences) of the given +pattern+ replaced.
5520  *
5521  * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5522  *
5523  * Related: String#sub, String#gsub, String#gsub!.
5524  *
5525  */
5526 
5527 static VALUE
5528 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5529 {
5530  VALUE pat, repl, hash = Qnil;
5531  int iter = 0;
5532  long plen;
5533  int min_arity = rb_block_given_p() ? 1 : 2;
5534  long beg;
5535 
5536  rb_check_arity(argc, min_arity, 2);
5537  if (argc == 1) {
5538  iter = 1;
5539  }
5540  else {
5541  repl = argv[1];
5542  hash = rb_check_hash_type(argv[1]);
5543  if (NIL_P(hash)) {
5544  StringValue(repl);
5545  }
5546  }
5547 
5548  pat = get_pat_quoted(argv[0], 1);
5549 
5550  str_modifiable(str);
5551  beg = rb_pat_search(pat, str, 0, 1);
5552  if (beg >= 0) {
5553  rb_encoding *enc;
5554  int cr = ENC_CODERANGE(str);
5555  long beg0, end0;
5556  VALUE match, match0 = Qnil;
5557  struct re_registers *regs;
5558  char *p, *rp;
5559  long len, rlen;
5560 
5561  match = rb_backref_get();
5562  regs = RMATCH_REGS(match);
5563  if (RB_TYPE_P(pat, T_STRING)) {
5564  beg0 = beg;
5565  end0 = beg0 + RSTRING_LEN(pat);
5566  match0 = pat;
5567  }
5568  else {
5569  beg0 = BEG(0);
5570  end0 = END(0);
5571  if (iter) match0 = rb_reg_nth_match(0, match);
5572  }
5573 
5574  if (iter || !NIL_P(hash)) {
5575  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5576 
5577  if (iter) {
5578  repl = rb_obj_as_string(rb_yield(match0));
5579  }
5580  else {
5581  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5582  repl = rb_obj_as_string(repl);
5583  }
5584  str_mod_check(str, p, len);
5585  rb_check_frozen(str);
5586  }
5587  else {
5588  repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5589  }
5590 
5591  enc = rb_enc_compatible(str, repl);
5592  if (!enc) {
5593  rb_encoding *str_enc = STR_ENC_GET(str);
5594  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5595  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5596  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5597  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5598  rb_enc_name(str_enc),
5599  rb_enc_name(STR_ENC_GET(repl)));
5600  }
5601  enc = STR_ENC_GET(repl);
5602  }
5603  rb_str_modify(str);
5604  rb_enc_associate(str, enc);
5605  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
5606  int cr2 = ENC_CODERANGE(repl);
5607  if (cr2 == ENC_CODERANGE_BROKEN ||
5608  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5609  cr = ENC_CODERANGE_UNKNOWN;
5610  else
5611  cr = cr2;
5612  }
5613  plen = end0 - beg0;
5614  rlen = RSTRING_LEN(repl);
5615  len = RSTRING_LEN(str);
5616  if (rlen > plen) {
5617  RESIZE_CAPA(str, len + rlen - plen);
5618  }
5619  p = RSTRING_PTR(str);
5620  if (rlen != plen) {
5621  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5622  }
5623  rp = RSTRING_PTR(repl);
5624  memmove(p + beg0, rp, rlen);
5625  len += rlen - plen;
5626  STR_SET_LEN(str, len);
5627  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5628  ENC_CODERANGE_SET(str, cr);
5629 
5630  return str;
5631  }
5632  return Qnil;
5633 }
5634 
5635 
5636 /*
5637  * call-seq:
5638  * sub(pattern, replacement) -> new_string
5639  * sub(pattern) {|match| ... } -> new_string
5640  *
5641  * Returns a copy of +self+ with only the first occurrence
5642  * (not all occurrences) of the given +pattern+ replaced.
5643  *
5644  * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5645  *
5646  * Related: String#sub!, String#gsub, String#gsub!.
5647  *
5648  */
5649 
5650 static VALUE
5651 rb_str_sub(int argc, VALUE *argv, VALUE str)
5652 {
5653  str = str_duplicate(rb_cString, str);
5654  rb_str_sub_bang(argc, argv, str);
5655  return str;
5656 }
5657 
5658 static VALUE
5659 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5660 {
5661  VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5662  struct re_registers *regs;
5663  long beg, beg0, end0;
5664  long offset, blen, slen, len, last;
5665  enum {STR, ITER, MAP} mode = STR;
5666  char *sp, *cp;
5667  int need_backref = -1;
5668  rb_encoding *str_enc;
5669 
5670  switch (argc) {
5671  case 1:
5672  RETURN_ENUMERATOR(str, argc, argv);
5673  mode = ITER;
5674  break;
5675  case 2:
5676  repl = argv[1];
5677  hash = rb_check_hash_type(argv[1]);
5678  if (NIL_P(hash)) {
5679  StringValue(repl);
5680  }
5681  else {
5682  mode = MAP;
5683  }
5684  break;
5685  default:
5686  rb_error_arity(argc, 1, 2);
5687  }
5688 
5689  pat = get_pat_quoted(argv[0], 1);
5690  beg = rb_pat_search(pat, str, 0, need_backref);
5691  if (beg < 0) {
5692  if (bang) return Qnil; /* no match, no substitution */
5693  return str_duplicate(rb_cString, str);
5694  }
5695 
5696  offset = 0;
5697  blen = RSTRING_LEN(str) + 30; /* len + margin */
5698  dest = rb_str_buf_new(blen);
5699  sp = RSTRING_PTR(str);
5700  slen = RSTRING_LEN(str);
5701  cp = sp;
5702  str_enc = STR_ENC_GET(str);
5703  rb_enc_associate(dest, str_enc);
5705 
5706  do {
5707  match = rb_backref_get();
5708  regs = RMATCH_REGS(match);
5709  if (RB_TYPE_P(pat, T_STRING)) {
5710  beg0 = beg;
5711  end0 = beg0 + RSTRING_LEN(pat);
5712  match0 = pat;
5713  }
5714  else {
5715  beg0 = BEG(0);
5716  end0 = END(0);
5717  if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5718  }
5719 
5720  if (mode) {
5721  if (mode == ITER) {
5722  val = rb_obj_as_string(rb_yield(match0));
5723  }
5724  else {
5725  val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5726  val = rb_obj_as_string(val);
5727  }
5728  str_mod_check(str, sp, slen);
5729  if (val == dest) { /* paranoid check [ruby-dev:24827] */
5730  rb_raise(rb_eRuntimeError, "block should not cheat");
5731  }
5732  }
5733  else if (need_backref) {
5734  val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5735  if (need_backref < 0) {
5736  need_backref = val != repl;
5737  }
5738  }
5739  else {
5740  val = repl;
5741  }
5742 
5743  len = beg0 - offset; /* copy pre-match substr */
5744  if (len) {
5745  rb_enc_str_buf_cat(dest, cp, len, str_enc);
5746  }
5747 
5748  rb_str_buf_append(dest, val);
5749 
5750  last = offset;
5751  offset = end0;
5752  if (beg0 == end0) {
5753  /*
5754  * Always consume at least one character of the input string
5755  * in order to prevent infinite loops.
5756  */
5757  if (RSTRING_LEN(str) <= end0) break;
5758  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5759  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5760  offset = end0 + len;
5761  }
5762  cp = RSTRING_PTR(str) + offset;
5763  if (offset > RSTRING_LEN(str)) break;
5764  beg = rb_pat_search(pat, str, offset, need_backref);
5765  } while (beg >= 0);
5766  if (RSTRING_LEN(str) > offset) {
5767  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5768  }
5769  rb_pat_search(pat, str, last, 1);
5770  if (bang) {
5771  str_shared_replace(str, dest);
5772  }
5773  else {
5774  str = dest;
5775  }
5776 
5777  return str;
5778 }
5779 
5780 
5781 /*
5782  * call-seq:
5783  * gsub!(pattern, replacement) -> self or nil
5784  * gsub!(pattern) {|match| ... } -> self or nil
5785  * gsub!(pattern) -> an_enumerator
5786  *
5787  * Performs the specified substring replacement(s) on +self+;
5788  * returns +self+ if any replacement occurred, +nil+ otherwise.
5789  *
5790  * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5791  *
5792  * Returns an Enumerator if no +replacement+ and no block given.
5793  *
5794  * Related: String#sub, String#gsub, String#sub!.
5795  *
5796  */
5797 
5798 static VALUE
5799 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5800 {
5801  str_modify_keep_cr(str);
5802  return str_gsub(argc, argv, str, 1);
5803 }
5804 
5805 
5806 /*
5807  * call-seq:
5808  * gsub(pattern, replacement) -> new_string
5809  * gsub(pattern) {|match| ... } -> new_string
5810  * gsub(pattern) -> enumerator
5811  *
5812  * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5813  *
5814  * See {Substitution Methods}[#class-String-label-Substitution+Methods].
5815  *
5816  * Returns an Enumerator if no +replacement+ and no block given.
5817  *
5818  * Related: String#sub, String#sub!, String#gsub!.
5819  *
5820  */
5821 
5822 static VALUE
5823 rb_str_gsub(int argc, VALUE *argv, VALUE str)
5824 {
5825  return str_gsub(argc, argv, str, 0);
5826 }
5827 
5828 
5829 /*
5830  * call-seq:
5831  * replace(other_string) -> self
5832  *
5833  * Replaces the contents of +self+ with the contents of +other_string+:
5834  *
5835  * s = 'foo' # => "foo"
5836  * s.replace('bar') # => "bar"
5837  *
5838  */
5839 
5840 VALUE
5842 {
5843  str_modifiable(str);
5844  if (str == str2) return str;
5845 
5846  StringValue(str2);
5847  str_discard(str);
5848  return str_replace(str, str2);
5849 }
5850 
5851 /*
5852  * call-seq:
5853  * clear -> self
5854  *
5855  * Removes the contents of +self+:
5856  *
5857  * s = 'foo' # => "foo"
5858  * s.clear # => ""
5859  *
5860  */
5861 
5862 static VALUE
5863 rb_str_clear(VALUE str)
5864 {
5865  str_discard(str);
5866  STR_SET_EMBED(str);
5867  STR_SET_EMBED_LEN(str, 0);
5868  RSTRING_PTR(str)[0] = 0;
5869  if (rb_enc_asciicompat(STR_ENC_GET(str)))
5871  else
5873  return str;
5874 }
5875 
5876 /*
5877  * call-seq:
5878  * chr -> string
5879  *
5880  * Returns a string containing the first character of +self+:
5881  *
5882  * s = 'foo' # => "foo"
5883  * s.chr # => "f"
5884  *
5885  */
5886 
5887 static VALUE
5888 rb_str_chr(VALUE str)
5889 {
5890  return rb_str_substr(str, 0, 1);
5891 }
5892 
5893 /*
5894  * call-seq:
5895  * getbyte(index) -> integer
5896  *
5897  * Returns the byte at zero-based +index+ as an integer:
5898  *
5899  * s = 'abcde' # => "abcde"
5900  * s.getbyte(0) # => 97
5901  * s.getbyte(1) # => 98
5902  *
5903  * Related: String#setbyte.
5904  */
5905 static VALUE
5906 rb_str_getbyte(VALUE str, VALUE index)
5907 {
5908  long pos = NUM2LONG(index);
5909 
5910  if (pos < 0)
5911  pos += RSTRING_LEN(str);
5912  if (pos < 0 || RSTRING_LEN(str) <= pos)
5913  return Qnil;
5914 
5915  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5916 }
5917 
5918 /*
5919  * call-seq:
5920  * setbyte(index, integer) -> integer
5921  *
5922  * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
5923  *
5924  * s = 'abcde' # => "abcde"
5925  * s.setbyte(0, 98) # => 98
5926  * s # => "bbcde"
5927  *
5928  * Related: String#getbyte.
5929  */
5930 static VALUE
5931 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5932 {
5933  long pos = NUM2LONG(index);
5934  long len = RSTRING_LEN(str);
5935  char *ptr, *head, *left = 0;
5936  rb_encoding *enc;
5937  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5938 
5939  if (pos < -len || len <= pos)
5940  rb_raise(rb_eIndexError, "index %ld out of string", pos);
5941  if (pos < 0)
5942  pos += len;
5943 
5944  VALUE v = rb_to_int(value);
5945  VALUE w = rb_int_and(v, INT2FIX(0xff));
5946  char byte = (char)(NUM2INT(w) & 0xFF);
5947 
5948  if (!str_independent(str))
5949  str_make_independent(str);
5950  enc = STR_ENC_GET(str);
5951  head = RSTRING_PTR(str);
5952  ptr = &head[pos];
5953  if (!STR_EMBED_P(str)) {
5954  cr = ENC_CODERANGE(str);
5955  switch (cr) {
5956  case ENC_CODERANGE_7BIT:
5957  left = ptr;
5958  *ptr = byte;
5959  if (ISASCII(byte)) goto end;
5960  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5961  if (!MBCLEN_CHARFOUND_P(nlen))
5963  else
5965  goto end;
5966  case ENC_CODERANGE_VALID:
5967  left = rb_enc_left_char_head(head, ptr, head+len, enc);
5968  width = rb_enc_precise_mbclen(left, head+len, enc);
5969  *ptr = byte;
5970  nlen = rb_enc_precise_mbclen(left, head+len, enc);
5971  if (!MBCLEN_CHARFOUND_P(nlen))
5973  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5974  ENC_CODERANGE_CLEAR(str);
5975  goto end;
5976  }
5977  }
5978  ENC_CODERANGE_CLEAR(str);
5979  *ptr = byte;
5980 
5981  end:
5982  return value;
5983 }
5984 
5985 static VALUE
5986 str_byte_substr(VALUE str, long beg, long len, int empty)
5987 {
5988  char *p, *s = RSTRING_PTR(str);
5989  long n = RSTRING_LEN(str);
5990  VALUE str2;
5991 
5992  if (beg > n || len < 0) return Qnil;
5993  if (beg < 0) {
5994  beg += n;
5995  if (beg < 0) return Qnil;
5996  }
5997  if (len > n - beg)
5998  len = n - beg;
5999  if (len <= 0) {
6000  if (!empty) return Qnil;
6001  len = 0;
6002  p = 0;
6003  }
6004  else
6005  p = s + beg;
6006 
6007  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
6008  str2 = rb_str_new_frozen(str);
6009  str2 = str_new_shared(rb_cString, str2);
6010  RSTRING(str2)->as.heap.ptr += beg;
6011  RSTRING(str2)->as.heap.len = len;
6012  }
6013  else {
6014  str2 = rb_str_new(p, len);
6015  }
6016 
6017  str_enc_copy(str2, str);
6018 
6019  if (RSTRING_LEN(str2) == 0) {
6020  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6022  else
6024  }
6025  else {
6026  switch (ENC_CODERANGE(str)) {
6027  case ENC_CODERANGE_7BIT:
6029  break;
6030  default:
6032  break;
6033  }
6034  }
6035 
6036  return str2;
6037 }
6038 
6039 static VALUE
6040 str_byte_aref(VALUE str, VALUE indx)
6041 {
6042  long idx;
6043  if (FIXNUM_P(indx)) {
6044  idx = FIX2LONG(indx);
6045  }
6046  else {
6047  /* check if indx is Range */
6048  long beg, len = RSTRING_LEN(str);
6049 
6050  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6051  case Qfalse:
6052  break;
6053  case Qnil:
6054  return Qnil;
6055  default:
6056  return str_byte_substr(str, beg, len, TRUE);
6057  }
6058 
6059  idx = NUM2LONG(indx);
6060  }
6061  return str_byte_substr(str, idx, 1, FALSE);
6062 }
6063 
6064 /*
6065  * call-seq:
6066  * byteslice(index, length = 1) -> string or nil
6067  * byteslice(range) -> string or nil
6068  *
6069  * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6070  *
6071  * With integer arguments +index+ and +length+ given,
6072  * returns the substring beginning at the given +index+
6073  * of the given +length+ (if possible),
6074  * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6075  *
6076  * s = '0123456789' # => "0123456789"
6077  * s.byteslice(2) # => "2"
6078  * s.byteslice(200) # => nil
6079  * s.byteslice(4, 3) # => "456"
6080  * s.byteslice(4, 30) # => "456789"
6081  * s.byteslice(4, -1) # => nil
6082  * s.byteslice(40, 2) # => nil
6083  *
6084  * In either case above, counts backwards from the end of +self+
6085  * if +index+ is negative:
6086  *
6087  * s = '0123456789' # => "0123456789"
6088  * s.byteslice(-4) # => "6"
6089  * s.byteslice(-4, 3) # => "678"
6090  *
6091  * With Range argument +range+ given, returns
6092  * <tt>byteslice(range.begin, range.size)</tt>:
6093  *
6094  * s = '0123456789' # => "0123456789"
6095  * s.byteslice(4..6) # => "456"
6096  * s.byteslice(-6..-4) # => "456"
6097  * s.byteslice(5..2) # => "" # range.size is zero.
6098  * s.byteslice(40..42) # => nil
6099  *
6100  * In all cases, a returned string has the same encoding as +self+:
6101  *
6102  * s.encoding # => #<Encoding:UTF-8>
6103  * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6104  *
6105  */
6106 
6107 static VALUE
6108 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6109 {
6110  if (argc == 2) {
6111  long beg = NUM2LONG(argv[0]);
6112  long end = NUM2LONG(argv[1]);
6113  return str_byte_substr(str, beg, end, TRUE);
6114  }
6115  rb_check_arity(argc, 1, 2);
6116  return str_byte_aref(str, argv[0]);
6117 }
6118 
6119 /*
6120  * call-seq:
6121  * reverse -> string
6122  *
6123  * Returns a new string with the characters from +self+ in reverse order.
6124  *
6125  * 'stressed'.reverse # => "desserts"
6126  *
6127  */
6128 
6129 static VALUE
6130 rb_str_reverse(VALUE str)
6131 {
6132  rb_encoding *enc;
6133  VALUE rev;
6134  char *s, *e, *p;
6135  int cr;
6136 
6137  if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6138  enc = STR_ENC_GET(str);
6139  rev = rb_str_new(0, RSTRING_LEN(str));
6140  s = RSTRING_PTR(str); e = RSTRING_END(str);
6141  p = RSTRING_END(rev);
6142  cr = ENC_CODERANGE(str);
6143 
6144  if (RSTRING_LEN(str) > 1) {
6145  if (single_byte_optimizable(str)) {
6146  while (s < e) {
6147  *--p = *s++;
6148  }
6149  }
6150  else if (cr == ENC_CODERANGE_VALID) {
6151  while (s < e) {
6152  int clen = rb_enc_fast_mbclen(s, e, enc);
6153 
6154  p -= clen;
6155  memcpy(p, s, clen);
6156  s += clen;
6157  }
6158  }
6159  else {
6160  cr = rb_enc_asciicompat(enc) ?
6162  while (s < e) {
6163  int clen = rb_enc_mbclen(s, e, enc);
6164 
6165  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6166  p -= clen;
6167  memcpy(p, s, clen);
6168  s += clen;
6169  }
6170  }
6171  }
6172  STR_SET_LEN(rev, RSTRING_LEN(str));
6173  str_enc_copy(rev, str);
6174  ENC_CODERANGE_SET(rev, cr);
6175 
6176  return rev;
6177 }
6178 
6179 
6180 /*
6181  * call-seq:
6182  * reverse! -> self
6183  *
6184  * Returns +self+ with its characters reversed:
6185  *
6186  * s = 'stressed'
6187  * s.reverse! # => "desserts"
6188  * s # => "desserts"
6189  *
6190  */
6191 
6192 static VALUE
6193 rb_str_reverse_bang(VALUE str)
6194 {
6195  if (RSTRING_LEN(str) > 1) {
6196  if (single_byte_optimizable(str)) {
6197  char *s, *e, c;
6198 
6199  str_modify_keep_cr(str);
6200  s = RSTRING_PTR(str);
6201  e = RSTRING_END(str) - 1;
6202  while (s < e) {
6203  c = *s;
6204  *s++ = *e;
6205  *e-- = c;
6206  }
6207  }
6208  else {
6209  str_shared_replace(str, rb_str_reverse(str));
6210  }
6211  }
6212  else {
6213  str_modify_keep_cr(str);
6214  }
6215  return str;
6216 }
6217 
6218 
6219 /*
6220  * call-seq:
6221  * include? other_string -> true or false
6222  *
6223  * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6224  *
6225  * s = 'foo'
6226  * s.include?('f') # => true
6227  * s.include?('fo') # => true
6228  * s.include?('food') # => false
6229  *
6230  */
6231 
6232 static VALUE
6233 rb_str_include(VALUE str, VALUE arg)
6234 {
6235  long i;
6236 
6237  StringValue(arg);
6238  i = rb_str_index(str, arg, 0);
6239 
6240  return RBOOL(i != -1);
6241 }
6242 
6243 
6244 /*
6245  * call-seq:
6246  * to_i(base = 10) -> integer
6247  *
6248  * Returns the result of interpreting leading characters in +self+
6249  * as an integer in the given +base+ (which must be in (2..36)):
6250  *
6251  * '123456'.to_i # => 123456
6252  * '123def'.to_i(16) # => 1195503
6253  *
6254  * Characters past a leading valid number (in the given +base+) are ignored:
6255  *
6256  * '12.345'.to_i # => 12
6257  * '12345'.to_i(2) # => 1
6258  *
6259  * Returns zero if there is no leading valid number:
6260  *
6261  * 'abcdef'.to_i # => 0
6262  * '2'.to_i(2) # => 0
6263  *
6264  */
6265 
6266 static VALUE
6267 rb_str_to_i(int argc, VALUE *argv, VALUE str)
6268 {
6269  int base = 10;
6270 
6271  if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6272  rb_raise(rb_eArgError, "invalid radix %d", base);
6273  }
6274  return rb_str_to_inum(str, base, FALSE);
6275 }
6276 
6277 
6278 /*
6279  * call-seq:
6280  * to_f -> float
6281  *
6282  * Returns the result of interpreting leading characters in +self+ as a Float:
6283  *
6284  * '3.14159'.to_f # => 3.14159
6285  * '1.234e-2'.to_f # => 0.01234
6286  *
6287  * Characters past a leading valid number are ignored:
6288  *
6289  * '3.14 (pi to two places)'.to_f # => 3.14
6290  *
6291  * Returns zero if there is no leading valid number:
6292  *
6293  * 'abcdef'.to_f # => 0.0
6294  *
6295  */
6296 
6297 static VALUE
6298 rb_str_to_f(VALUE str)
6299 {
6300  return DBL2NUM(rb_str_to_dbl(str, FALSE));
6301 }
6302 
6303 
6304 /*
6305  * call-seq:
6306  * to_s -> self or string
6307  *
6308  * Returns +self+ if +self+ is a \String,
6309  * or +self+ converted to a \String if +self+ is a subclass of \String.
6310  *
6311  * String#to_str is an alias for String#to_s.
6312  *
6313  */
6314 
6315 static VALUE
6316 rb_str_to_s(VALUE str)
6317 {
6318  if (rb_obj_class(str) != rb_cString) {
6319  return str_duplicate(rb_cString, str);
6320  }
6321  return str;
6322 }
6323 
#if 0
/* Compiled out.  Encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6335 
6336 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6337 
6338 int
6339 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6340 {
6341  char buf[CHAR_ESC_LEN + 1];
6342  int l;
6343 
6344 #if SIZEOF_INT > 4
6345  c &= 0xffffffff;
6346 #endif
6347  if (unicode_p) {
6348  if (c < 0x7F && ISPRINT(c)) {
6349  snprintf(buf, CHAR_ESC_LEN, "%c", c);
6350  }
6351  else if (c < 0x10000) {
6352  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6353  }
6354  else {
6355  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6356  }
6357  }
6358  else {
6359  if (c < 0x100) {
6360  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6361  }
6362  else {
6363  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6364  }
6365  }
6366  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6367  rb_str_buf_cat(result, buf, l);
6368  return l;
6369 }
6370 
/*
 * Returns the backslash escape used for control character c, or NULL
 * when c has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *esc;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].ch == c) {
            return escapes[i].esc;
        }
    }
    return NULL;
}
6388 
6389 VALUE
6390 rb_str_escape(VALUE str)
6391 {
6392  int encidx = ENCODING_GET(str);
6393  rb_encoding *enc = rb_enc_from_index(encidx);
6394  const char *p = RSTRING_PTR(str);
6395  const char *pend = RSTRING_END(str);
6396  const char *prev = p;
6397  char buf[CHAR_ESC_LEN + 1];
6398  VALUE result = rb_str_buf_new(0);
6399  int unicode_p = rb_enc_unicode_p(enc);
6400  int asciicompat = rb_enc_asciicompat(enc);
6401 
6402  while (p < pend) {
6403  unsigned int c;
6404  const char *cc;
6405  int n = rb_enc_precise_mbclen(p, pend, enc);
6406  if (!MBCLEN_CHARFOUND_P(n)) {
6407  if (p > prev) str_buf_cat(result, prev, p - prev);
6408  n = rb_enc_mbminlen(enc);
6409  if (pend < p + n)
6410  n = (int)(pend - p);
6411  while (n--) {
6412  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6413  str_buf_cat(result, buf, strlen(buf));
6414  prev = ++p;
6415  }
6416  continue;
6417  }
6418  n = MBCLEN_CHARFOUND_LEN(n);
6419  c = rb_enc_mbc_to_codepoint(p, pend, enc);
6420  p += n;
6421  cc = ruby_escaped_char(c);
6422  if (cc) {
6423  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6424  str_buf_cat(result, cc, strlen(cc));
6425  prev = p;
6426  }
6427  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6428  }
6429  else {
6430  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6431  rb_str_buf_cat_escaped_char(result, c, unicode_p);
6432  prev = p;
6433  }
6434  }
6435  if (p > prev) str_buf_cat(result, prev, p - prev);
6437 
6438  return result;
6439 }
6440 
6441 /*
6442  * call-seq:
6443  * inspect -> string
6444  *
6445  * Returns a printable version of +self+, enclosed in double-quotes,
6446  * and with special characters escaped:
6447  *
6448  * s = "foo\tbar\tbaz\n"
6449  * # => "foo\tbar\tbaz\n"
6450  * s.inspect
6451  * # => "\"foo\\tbar\\tbaz\\n\""
6452  *
6453  */
6454 
6455 VALUE
6457 {
6458  int encidx = ENCODING_GET(str);
6459  rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
6460  const char *p, *pend, *prev;
6461  char buf[CHAR_ESC_LEN + 1];
6462  VALUE result = rb_str_buf_new(0);
6464  int unicode_p = rb_enc_unicode_p(enc);
6465  int asciicompat = rb_enc_asciicompat(enc);
6466 
6467  if (resenc == NULL) resenc = rb_default_external_encoding();
6468  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6469  rb_enc_associate(result, resenc);
6470  str_buf_cat2(result, "\"");
6471 
6472  p = RSTRING_PTR(str); pend = RSTRING_END(str);
6473  prev = p;
6474  actenc = get_actual_encoding(encidx, str);
6475  if (actenc != enc) {
6476  enc = actenc;
6477  if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
6478  }
6479  while (p < pend) {
6480  unsigned int c, cc;
6481  int n;
6482 
6483  n = rb_enc_precise_mbclen(p, pend, enc);
6484  if (!MBCLEN_CHARFOUND_P(n)) {
6485  if (p > prev) str_buf_cat(result, prev, p - prev);
6486  n = rb_enc_mbminlen(enc);
6487  if (pend < p + n)
6488  n = (int)(pend - p);
6489  while (n--) {
6490  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6491  str_buf_cat(result, buf, strlen(buf));
6492  prev = ++p;
6493  }
6494  continue;
6495  }
6496  n = MBCLEN_CHARFOUND_LEN(n);
6497  c = rb_enc_mbc_to_codepoint(p, pend, enc);
6498  p += n;
6499  if ((asciicompat || unicode_p) &&
6500  (c == '"'|| c == '\\' ||
6501  (c == '#' &&
6502  p < pend &&
6504  (cc = rb_enc_codepoint(p,pend,enc),
6505  (cc == '$' || cc == '@' || cc == '{'))))) {
6506  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6507  str_buf_cat2(result, "\\");
6508  if (asciicompat || enc == resenc) {
6509  prev = p - n;
6510  continue;
6511  }
6512  }
6513  switch (c) {
6514  case '\n': cc = 'n'; break;
6515  case '\r': cc = 'r'; break;
6516  case '\t': cc = 't'; break;
6517  case '\f': cc = 'f'; break;
6518  case '\013': cc = 'v'; break;
6519  case '\010': cc = 'b'; break;
6520  case '\007': cc = 'a'; break;
6521  case 033: cc = 'e'; break;
6522  default: cc = 0; break;
6523  }
6524  if (cc) {
6525  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6526  buf[0] = '\\';
6527  buf[1] = (char)cc;
6528  str_buf_cat(result, buf, 2);
6529  prev = p;
6530  continue;
6531  }
6532  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
6533  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6534  continue;
6535  }
6536  else {
6537  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6538  rb_str_buf_cat_escaped_char(result, c, unicode_p);
6539  prev = p;
6540  continue;
6541  }
6542  }
6543  if (p > prev) str_buf_cat(result, prev, p - prev);
6544  str_buf_cat2(result, "\"");
6545 
6546  return result;
6547 }
6548 
/* True when p (bounded by e) points at one of '$', '@' or '{'. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6550 
6551 /*
6552  * call-seq:
6553  * dump -> string
6554  *
6555  * Returns a printable version of +self+, enclosed in double-quotes,
6556  * with special characters escaped, and with non-printing characters
6557  * replaced by hexadecimal notation:
6558  *
6559  * "hello \n ''".dump # => "\"hello \\n ''\""
6560  * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6561  *
6562  * Related: String#undump (inverse of String#dump).
6563  *
6564  */
6565 
6566 VALUE
6568 {
6569  int encidx = rb_enc_get_index(str);
6570  rb_encoding *enc = rb_enc_from_index(encidx);
6571  long len;
6572  const char *p, *pend;
6573  char *q, *qend;
6574  VALUE result;
6575  int u8 = (encidx == rb_utf8_encindex());
6576  static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6577 
6578  len = 2; /* "" */
6579  if (!rb_enc_asciicompat(enc)) {
6580  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6581  len += strlen(enc->name);
6582  }
6583 
6584  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6585  while (p < pend) {
6586  int clen;
6587  unsigned char c = *p++;
6588 
6589  switch (c) {
6590  case '"': case '\\':
6591  case '\n': case '\r':
6592  case '\t': case '\f':
6593  case '\013': case '\010': case '\007': case '\033':
6594  clen = 2;
6595  break;
6596 
6597  case '#':
6598  clen = IS_EVSTR(p, pend) ? 2 : 1;
6599  break;
6600 
6601  default:
6602  if (ISPRINT(c)) {
6603  clen = 1;
6604  }
6605  else {
6606  if (u8 && c > 0x7F) { /* \u notation */
6607  int n = rb_enc_precise_mbclen(p-1, pend, enc);
6608  if (MBCLEN_CHARFOUND_P(n)) {
6609  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6610  if (cc <= 0xFFFF)
6611  clen = 6; /* \uXXXX */
6612  else if (cc <= 0xFFFFF)
6613  clen = 9; /* \u{XXXXX} */
6614  else
6615  clen = 10; /* \u{XXXXXX} */
6616  p += MBCLEN_CHARFOUND_LEN(n)-1;
6617  break;
6618  }
6619  }
6620  clen = 4; /* \xNN */
6621  }
6622  break;
6623  }
6624 
6625  if (clen > LONG_MAX - len) {
6626  rb_raise(rb_eRuntimeError, "string size too big");
6627  }
6628  len += clen;
6629  }
6630 
6631  result = rb_str_new(0, len);
6632  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6633  q = RSTRING_PTR(result); qend = q + len + 1;
6634 
6635  *q++ = '"';
6636  while (p < pend) {
6637  unsigned char c = *p++;
6638 
6639  if (c == '"' || c == '\\') {
6640  *q++ = '\\';
6641  *q++ = c;
6642  }
6643  else if (c == '#') {
6644  if (IS_EVSTR(p, pend)) *q++ = '\\';
6645  *q++ = '#';
6646  }
6647  else if (c == '\n') {
6648  *q++ = '\\';
6649  *q++ = 'n';
6650  }
6651  else if (c == '\r') {
6652  *q++ = '\\';
6653  *q++ = 'r';
6654  }
6655  else if (c == '\t') {
6656  *q++ = '\\';
6657  *q++ = 't';
6658  }
6659  else if (c == '\f') {
6660  *q++ = '\\';
6661  *q++ = 'f';
6662  }
6663  else if (c == '\013') {
6664  *q++ = '\\';
6665  *q++ = 'v';
6666  }
6667  else if (c == '\010') {
6668  *q++ = '\\';
6669  *q++ = 'b';
6670  }
6671  else if (c == '\007') {
6672  *q++ = '\\';
6673  *q++ = 'a';
6674  }
6675  else if (c == '\033') {
6676  *q++ = '\\';
6677  *q++ = 'e';
6678  }
6679  else if (ISPRINT(c)) {
6680  *q++ = c;
6681  }
6682  else {
6683  *q++ = '\\';
6684  if (u8) {
6685  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6686  if (MBCLEN_CHARFOUND_P(n)) {
6687  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6688  p += n;
6689  if (cc <= 0xFFFF)
6690  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6691  else
6692  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6693  q += strlen(q);
6694  continue;
6695  }
6696  }
6697  snprintf(q, qend-q, "x%02X", c);
6698  q += 3;
6699  }
6700  }
6701  *q++ = '"';
6702  *q = '\0';
6703  if (!rb_enc_asciicompat(enc)) {
6704  snprintf(q, qend-q, nonascii_suffix, enc->name);
6705  encidx = rb_ascii8bit_encindex();
6706  }
6707  /* result from dump is ASCII */
6708  rb_enc_associate_index(result, encidx);
6710  return result;
6711 }
6712 
/*
 * Maps the letter of a one-character backslash escape to the control
 * character it stands for.  Only the eight letters below are handled.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\013';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return '\033';
    }
    UNREACHABLE_RETURN(-1);
}
6736 
6737 static void
6738 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6739 {
6740  const char *s = *ss;
6741  unsigned int c;
6742  int codelen;
6743  size_t hexlen;
6744  unsigned char buf[6];
6745  static rb_encoding *enc_utf8 = NULL;
6746 
6747  switch (*s) {
6748  case '\\':
6749  case '"':
6750  case '#':
6751  rb_str_cat(undumped, s, 1); /* cat itself */
6752  s++;
6753  break;
6754  case 'n':
6755  case 'r':
6756  case 't':
6757  case 'f':
6758  case 'v':
6759  case 'b':
6760  case 'a':
6761  case 'e':
6762  *buf = unescape_ascii(*s);
6763  rb_str_cat(undumped, (char *)buf, 1);
6764  s++;
6765  break;
6766  case 'u':
6767  if (*binary) {
6768  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6769  }
6770  *utf8 = true;
6771  if (++s >= s_end) {
6772  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6773  }
6774  if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6775  if (*penc != enc_utf8) {
6776  *penc = enc_utf8;
6777  rb_enc_associate(undumped, enc_utf8);
6778  }
6779  if (*s == '{') { /* handle \u{...} form */
6780  s++;
6781  for (;;) {
6782  if (s >= s_end) {
6783  rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6784  }
6785  if (*s == '}') {
6786  s++;
6787  break;
6788  }
6789  if (ISSPACE(*s)) {
6790  s++;
6791  continue;
6792  }
6793  c = scan_hex(s, s_end-s, &hexlen);
6794  if (hexlen == 0 || hexlen > 6) {
6795  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6796  }
6797  if (c > 0x10ffff) {
6798  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6799  }
6800  if (0xd800 <= c && c <= 0xdfff) {
6801  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6802  }
6803  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6804  rb_str_cat(undumped, (char *)buf, codelen);
6805  s += hexlen;
6806  }
6807  }
6808  else { /* handle \uXXXX form */
6809  c = scan_hex(s, 4, &hexlen);
6810  if (hexlen != 4) {
6811  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6812  }
6813  if (0xd800 <= c && c <= 0xdfff) {
6814  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6815  }
6816  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
6817  rb_str_cat(undumped, (char *)buf, codelen);
6818  s += hexlen;
6819  }
6820  break;
6821  case 'x':
6822  if (*utf8) {
6823  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6824  }
6825  *binary = true;
6826  if (++s >= s_end) {
6827  rb_raise(rb_eRuntimeError, "invalid hex escape");
6828  }
6829  *buf = scan_hex(s, 2, &hexlen);
6830  if (hexlen != 2) {
6831  rb_raise(rb_eRuntimeError, "invalid hex escape");
6832  }
6833  rb_str_cat(undumped, (char *)buf, 1);
6834  s += hexlen;
6835  break;
6836  default:
6837  rb_str_cat(undumped, s-1, 2);
6838  s++;
6839  }
6840 
6841  *ss = s;
6842 }
6843 
6844 static VALUE rb_str_is_ascii_only_p(VALUE str);
6845 
6846 /*
6847  * call-seq:
6848  * undump -> string
6849  *
6850  * Returns an unescaped version of +self+:
6851  *
6852  * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
6853  * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6854  * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
6855  * s_undumped == s_orig # => true
6856  *
6857  * Related: String#dump (inverse of String#undump).
6858  *
6859  */
6860 
6861 static VALUE
6862 str_undump(VALUE str)
6863 {
6864  const char *s = RSTRING_PTR(str);
6865  const char *s_end = RSTRING_END(str);
6866  rb_encoding *enc = rb_enc_get(str);
6867  VALUE undumped = rb_enc_str_new(s, 0L, enc);
6868  bool utf8 = false;
6869  bool binary = false;
6870  int w;
6871 
6872  rb_must_asciicompat(str);
6873  if (rb_str_is_ascii_only_p(str) == Qfalse) {
6874  rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6875  }
6876  if (!str_null_check(str, &w)) {
6877  rb_raise(rb_eRuntimeError, "string contains null byte");
6878  }
6879  if (RSTRING_LEN(str) < 2) goto invalid_format;
6880  if (*s != '"') goto invalid_format;
6881 
6882  /* strip '"' at the start */
6883  s++;
6884 
6885  for (;;) {
6886  if (s >= s_end) {
6887  rb_raise(rb_eRuntimeError, "unterminated dumped string");
6888  }
6889 
6890  if (*s == '"') {
6891  /* epilogue */
6892  s++;
6893  if (s == s_end) {
6894  /* ascii compatible dumped string */
6895  break;
6896  }
6897  else {
6898  static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
6899  static const char dup_suffix[] = ".dup";
6900  const char *encname;
6901  int encidx;
6902  ptrdiff_t size;
6903 
6904  /* check separately for strings dumped by older versions */
6905  size = sizeof(dup_suffix) - 1;
6906  if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
6907 
6908  size = sizeof(force_encoding_suffix) - 1;
6909  if (s_end - s <= size) goto invalid_format;
6910  if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
6911  s += size;
6912 
6913  if (utf8) {
6914  rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
6915  }
6916 
6917  encname = s;
6918  s = memchr(s, '"', s_end-s);
6919  size = s - encname;
6920  if (!s) goto invalid_format;
6921  if (s_end - s != 2) goto invalid_format;
6922  if (s[0] != '"' || s[1] != ')') goto invalid_format;
6923 
6924  encidx = rb_enc_find_index2(encname, (long)size);
6925  if (encidx < 0) {
6926  rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
6927  }
6928  rb_enc_associate_index(undumped, encidx);
6929  }
6930  break;
6931  }
6932 
6933  if (*s == '\\') {
6934  s++;
6935  if (s >= s_end) {
6936  rb_raise(rb_eRuntimeError, "invalid escape");
6937  }
6938  undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
6939  }
6940  else {
6941  rb_str_cat(undumped, s++, 1);
6942  }
6943  }
6944 
6945  return undumped;
6946 invalid_format:
6947  rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6948 }
6949 
6950 static void
6951 rb_str_check_dummy_enc(rb_encoding *enc)
6952 {
6953  if (rb_enc_dummy_p(enc)) {
6954  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
6955  rb_enc_name(enc));
6956  }
6957 }
6958 
6959 static rb_encoding *
6960 str_true_enc(VALUE str)
6961 {
6962  rb_encoding *enc = STR_ENC_GET(str);
6963  rb_str_check_dummy_enc(enc);
6964  return enc;
6965 }
6966 
6967 static OnigCaseFoldType
6968 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
6969 {
6970  if (argc==0)
6971  return flags;
6972  if (argc>2)
6973  rb_raise(rb_eArgError, "too many options");
6974  if (argv[0]==sym_turkic) {
6975  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6976  if (argc==2) {
6977  if (argv[1]==sym_lithuanian)
6978  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6979  else
6980  rb_raise(rb_eArgError, "invalid second option");
6981  }
6982  }
6983  else if (argv[0]==sym_lithuanian) {
6984  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
6985  if (argc==2) {
6986  if (argv[1]==sym_turkic)
6987  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
6988  else
6989  rb_raise(rb_eArgError, "invalid second option");
6990  }
6991  }
6992  else if (argc>1)
6993  rb_raise(rb_eArgError, "too many options");
6994  else if (argv[0]==sym_ascii)
6995  flags |= ONIGENC_CASE_ASCII_ONLY;
6996  else if (argv[0]==sym_fold) {
6997  if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
6998  flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
6999  else
7000  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7001  }
7002  else
7003  rb_raise(rb_eArgError, "invalid option");
7004  return flags;
7005 }
7006 
7007 static inline bool
7008 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7009 {
7010  if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7011  return true;
7012  return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7013 }
7014 
/*
 * Extra bytes added to every case-mapping buffer.  16 should be long
 * enough to absorb any kind of single character length increase.
 * NOTE(review): the value actually used is 20, presumably as headroom
 * over that 16 -- confirm.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7020 
7021 struct mapping_buffer;
7022 typedef struct mapping_buffer {
7023  size_t capa;
7024  size_t used;
7025  struct mapping_buffer *next;
7026  OnigUChar space[FLEX_ARY_LEN];
7027 } mapping_buffer;
7028 
7029 static void
7030 mapping_buffer_free(void *p)
7031 {
7032  mapping_buffer *previous_buffer;
7033  mapping_buffer *current_buffer = p;
7034  while (current_buffer) {
7035  previous_buffer = current_buffer;
7036  current_buffer = current_buffer->next;
7037  ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7038  }
7039 }
7040 
7041 static const rb_data_type_t mapping_buffer_type = {
7042  "mapping_buffer",
7043  {0, mapping_buffer_free,}
7044 };
7045 
7046 static VALUE
7047 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7048 {
7049  VALUE target;
7050 
7051  const OnigUChar *source_current, *source_end;
7052  int target_length = 0;
7053  VALUE buffer_anchor;
7054  mapping_buffer *current_buffer = 0;
7055  mapping_buffer **pre_buffer;
7056  size_t buffer_count = 0;
7057  int buffer_length_or_invalid;
7058 
7059  if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7060 
7061  source_current = (OnigUChar*)RSTRING_PTR(source);
7062  source_end = (OnigUChar*)RSTRING_END(source);
7063 
7064  buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7065  pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7066  while (source_current < source_end) {
7067  /* increase multiplier using buffer count to converge quickly */
7068  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7069  if (CASEMAP_DEBUG) {
7070  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7071  }
7072  current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7073  *pre_buffer = current_buffer;
7074  pre_buffer = &current_buffer->next;
7075  current_buffer->next = NULL;
7076  current_buffer->capa = capa;
7077  buffer_length_or_invalid = enc->case_map(flags,
7078  &source_current, source_end,
7079  current_buffer->space,
7080  current_buffer->space+current_buffer->capa,
7081  enc);
7082  if (buffer_length_or_invalid < 0) {
7083  current_buffer = DATA_PTR(buffer_anchor);
7084  DATA_PTR(buffer_anchor) = 0;
7085  mapping_buffer_free(current_buffer);
7086  rb_raise(rb_eArgError, "input string invalid");
7087  }
7088  target_length += current_buffer->used = buffer_length_or_invalid;
7089  }
7090  if (CASEMAP_DEBUG) {
7091  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7092  }
7093 
7094  if (buffer_count==1) {
7095  target = rb_str_new((const char*)current_buffer->space, target_length);
7096  }
7097  else {
7098  char *target_current;
7099 
7100  target = rb_str_new(0, target_length);
7101  target_current = RSTRING_PTR(target);
7102  current_buffer = DATA_PTR(buffer_anchor);
7103  while (current_buffer) {
7104  memcpy(target_current, current_buffer->space, current_buffer->used);
7105  target_current += current_buffer->used;
7106  current_buffer = current_buffer->next;
7107  }
7108  }
7109  current_buffer = DATA_PTR(buffer_anchor);
7110  DATA_PTR(buffer_anchor) = 0;
7111  mapping_buffer_free(current_buffer);
7112 
7113  RB_GC_GUARD(buffer_anchor);
7114 
7115  /* TODO: check about string terminator character */
7116  str_enc_copy(target, source);
7117  /*ENC_CODERANGE_SET(mapped, cr);*/
7118 
7119  return target;
7120 }
7121 
7122 static VALUE
7123 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7124 {
7125  const OnigUChar *source_current, *source_end;
7126  OnigUChar *target_current, *target_end;
7127  long old_length = RSTRING_LEN(source);
7128  int length_or_invalid;
7129 
7130  if (old_length == 0) return Qnil;
7131 
7132  source_current = (OnigUChar*)RSTRING_PTR(source);
7133  source_end = (OnigUChar*)RSTRING_END(source);
7134  if (source == target) {
7135  target_current = (OnigUChar*)source_current;
7136  target_end = (OnigUChar*)source_end;
7137  }
7138  else {
7139  target_current = (OnigUChar*)RSTRING_PTR(target);
7140  target_end = (OnigUChar*)RSTRING_END(target);
7141  }
7142 
7143  length_or_invalid = onigenc_ascii_only_case_map(flags,
7144  &source_current, source_end,
7145  target_current, target_end, enc);
7146  if (length_or_invalid < 0)
7147  rb_raise(rb_eArgError, "input string invalid");
7148  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7149  fprintf(stderr, "problem with rb_str_ascii_casemap"
7150  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7151  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7152  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7153  }
7154 
7155  str_enc_copy(target, source);
7156 
7157  return target;
7158 }
7159 
7160 static bool
7161 upcase_single(VALUE str)
7162 {
7163  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7164  bool modified = false;
7165 
7166  while (s < send) {
7167  unsigned int c = *(unsigned char*)s;
7168 
7169  if ('a' <= c && c <= 'z') {
7170  *s = 'A' + (c - 'a');
7171  modified = true;
7172  }
7173  s++;
7174  }
7175  return modified;
7176 }
7177 
7178 /*
7179  * call-seq:
7180  * upcase!(*options) -> self or nil
7181  *
7182  * Upcases the characters in +self+;
7183  * returns +self+ if any changes were made, +nil+ otherwise:
7184  *
7185  * s = 'Hello World!' # => "Hello World!"
7186  * s.upcase! # => "HELLO WORLD!"
7187  * s # => "HELLO WORLD!"
7188  * s.upcase! # => nil
7189  *
7190  * The casing may be affected by the given +options+;
7191  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7192  *
7193  * Related: String#upcase, String#downcase, String#downcase!.
7194  *
7195  */
7196 
7197 static VALUE
7198 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7199 {
7200  rb_encoding *enc;
7201  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7202 
7203  flags = check_case_options(argc, argv, flags);
7204  str_modify_keep_cr(str);
7205  enc = str_true_enc(str);
7206  if (case_option_single_p(flags, enc, str)) {
7207  if (upcase_single(str))
7208  flags |= ONIGENC_CASE_MODIFIED;
7209  }
7210  else if (flags&ONIGENC_CASE_ASCII_ONLY)
7211  rb_str_ascii_casemap(str, str, &flags, enc);
7212  else
7213  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7214 
7215  if (ONIGENC_CASE_MODIFIED&flags) return str;
7216  return Qnil;
7217 }
7218 
7219 
7220 /*
7221  * call-seq:
7222  * upcase(*options) -> string
7223  *
7224  * Returns a string containing the upcased characters in +self+:
7225  *
7226  * s = 'Hello World!' # => "Hello World!"
7227  * s.upcase # => "HELLO WORLD!"
7228  *
7229  * The casing may be affected by the given +options+;
7230  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7231  *
7232  * Related: String#upcase!, String#downcase, String#downcase!.
7233  *
7234  */
7235 
7236 static VALUE
7237 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7238 {
7239  rb_encoding *enc;
7240  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7241  VALUE ret;
7242 
7243  flags = check_case_options(argc, argv, flags);
7244  enc = str_true_enc(str);
7245  if (case_option_single_p(flags, enc, str)) {
7246  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7247  str_enc_copy(ret, str);
7248  upcase_single(ret);
7249  }
7250  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7251  ret = rb_str_new(0, RSTRING_LEN(str));
7252  rb_str_ascii_casemap(str, ret, &flags, enc);
7253  }
7254  else {
7255  ret = rb_str_casemap(str, &flags, enc);
7256  }
7257 
7258  return ret;
7259 }
7260 
7261 static bool
7262 downcase_single(VALUE str)
7263 {
7264  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7265  bool modified = false;
7266 
7267  while (s < send) {
7268  unsigned int c = *(unsigned char*)s;
7269 
7270  if ('A' <= c && c <= 'Z') {
7271  *s = 'a' + (c - 'A');
7272  modified = true;
7273  }
7274  s++;
7275  }
7276 
7277  return modified;
7278 }
7279 
7280 /*
7281  * call-seq:
7282  * downcase!(*options) -> self or nil
7283  *
7284  * Downcases the characters in +self+;
7285  * returns +self+ if any changes were made, +nil+ otherwise:
7286  *
7287  * s = 'Hello World!' # => "Hello World!"
7288  * s.downcase! # => "hello world!"
7289  * s # => "hello world!"
7290  * s.downcase! # => nil
7291  *
7292  * The casing may be affected by the given +options+;
7293  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7294  *
7295  * Related: String#downcase, String#upcase, String#upcase!.
7296  *
7297  */
7298 
7299 static VALUE
7300 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7301 {
7302  rb_encoding *enc;
7303  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7304 
7305  flags = check_case_options(argc, argv, flags);
7306  str_modify_keep_cr(str);
7307  enc = str_true_enc(str);
7308  if (case_option_single_p(flags, enc, str)) {
7309  if (downcase_single(str))
7310  flags |= ONIGENC_CASE_MODIFIED;
7311  }
7312  else if (flags&ONIGENC_CASE_ASCII_ONLY)
7313  rb_str_ascii_casemap(str, str, &flags, enc);
7314  else
7315  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7316 
7317  if (ONIGENC_CASE_MODIFIED&flags) return str;
7318  return Qnil;
7319 }
7320 
7321 
7322 /*
7323  * call-seq:
7324  * downcase(*options) -> string
7325  *
7326  * Returns a string containing the downcased characters in +self+:
7327  *
7328  * s = 'Hello World!' # => "Hello World!"
7329  * s.downcase # => "hello world!"
7330  *
7331  * The casing may be affected by the given +options+;
7332  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7333  *
7334  * Related: String#downcase!, String#upcase, String#upcase!.
7335  *
7336  */
7337 
7338 static VALUE
7339 rb_str_downcase(int argc, VALUE *argv, VALUE str)
7340 {
7341  rb_encoding *enc;
7342  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7343  VALUE ret;
7344 
7345  flags = check_case_options(argc, argv, flags);
7346  enc = str_true_enc(str);
7347  if (case_option_single_p(flags, enc, str)) {
7348  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7349  str_enc_copy(ret, str);
7350  downcase_single(ret);
7351  }
7352  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7353  ret = rb_str_new(0, RSTRING_LEN(str));
7354  rb_str_ascii_casemap(str, ret, &flags, enc);
7355  }
7356  else {
7357  ret = rb_str_casemap(str, &flags, enc);
7358  }
7359 
7360  return ret;
7361 }
7362 
7363 
7364 /*
7365  * call-seq:
7366  * capitalize!(*options) -> self or nil
7367  *
7368  * Upcases the first character in +self+;
7369  * downcases the remaining characters;
7370  * returns +self+ if any changes were made, +nil+ otherwise:
7371  *
7372  * s = 'hello World!' # => "hello World!"
7373  * s.capitalize! # => "Hello world!"
7374  * s # => "Hello world!"
7375  * s.capitalize! # => nil
7376  *
7377  * The casing may be affected by the given +options+;
7378  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7379  *
7380  * Related: String#capitalize.
7381  *
7382  */
7383 
7384 static VALUE
7385 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7386 {
7387  rb_encoding *enc;
7388  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7389 
7390  flags = check_case_options(argc, argv, flags);
7391  str_modify_keep_cr(str);
7392  enc = str_true_enc(str);
7393  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7394  if (flags&ONIGENC_CASE_ASCII_ONLY)
7395  rb_str_ascii_casemap(str, str, &flags, enc);
7396  else
7397  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7398 
7399  if (ONIGENC_CASE_MODIFIED&flags) return str;
7400  return Qnil;
7401 }
7402 
7403 
7404 /*
7405  * call-seq:
7406  * capitalize(*options) -> string
7407  *
7408  * Returns a string containing the characters in +self+;
7409  * the first character is upcased;
7410  * the remaining characters are downcased:
7411  *
7412  * s = 'hello World!' # => "hello World!"
7413  * s.capitalize # => "Hello world!"
7414  *
7415  * The casing may be affected by the given +options+;
7416  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7417  *
7418  * Related: String#capitalize!.
7419  *
7420  */
7421 
7422 static VALUE
7423 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7424 {
7425  rb_encoding *enc;
7426  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7427  VALUE ret;
7428 
7429  flags = check_case_options(argc, argv, flags);
7430  enc = str_true_enc(str);
7431  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7432  if (flags&ONIGENC_CASE_ASCII_ONLY) {
7433  ret = rb_str_new(0, RSTRING_LEN(str));
7434  rb_str_ascii_casemap(str, ret, &flags, enc);
7435  }
7436  else {
7437  ret = rb_str_casemap(str, &flags, enc);
7438  }
7439  return ret;
7440 }
7441 
7442 
7443 /*
7444  * call-seq:
7445  * swapcase!(*options) -> self or nil
7446  *
7447  * Upcases each lowercase character in +self+;
7448  * downcases each uppercase character;
7449  * returns +self+ if any changes were made, +nil+ otherwise:
7450  *
7451  * s = 'Hello World!' # => "Hello World!"
7452  * s.swapcase! # => "hELLO wORLD!"
7453  * s # => "hELLO wORLD!"
7454  * ''.swapcase! # => nil
7455  *
7456  * The casing may be affected by the given +options+;
7457  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7458  *
7459  * Related: String#swapcase.
7460  *
7461  */
7462 
7463 static VALUE
7464 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7465 {
7466  rb_encoding *enc;
7467  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7468 
7469  flags = check_case_options(argc, argv, flags);
7470  str_modify_keep_cr(str);
7471  enc = str_true_enc(str);
7472  if (flags&ONIGENC_CASE_ASCII_ONLY)
7473  rb_str_ascii_casemap(str, str, &flags, enc);
7474  else
7475  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7476 
7477  if (ONIGENC_CASE_MODIFIED&flags) return str;
7478  return Qnil;
7479 }
7480 
7481 
7482 /*
7483  * call-seq:
7484  * swapcase(*options) -> string
7485  *
7486  * Returns a string containing the characters in +self+, with cases reversed;
7487  * each uppercase character is downcased;
7488  * each lowercase character is upcased:
7489  *
7490  * s = 'Hello World!' # => "Hello World!"
7491  * s.swapcase # => "hELLO wORLD!"
7492  *
7493  * The casing may be affected by the given +options+;
7494  * see {Case Mapping}[doc/case_mapping_rdoc.html].
7495  *
7496  * Related: String#swapcase!.
7497  *
7498  */
7499 
7500 static VALUE
7501 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7502 {
7503  rb_encoding *enc;
7504  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7505  VALUE ret;
7506 
7507  flags = check_case_options(argc, argv, flags);
7508  enc = str_true_enc(str);
7509  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7510  if (flags&ONIGENC_CASE_ASCII_ONLY) {
7511  ret = rb_str_new(0, RSTRING_LEN(str));
7512  rb_str_ascii_casemap(str, ret, &flags, enc);
7513  }
7514  else {
7515  ret = rb_str_casemap(str, &flags, enc);
7516  }
7517  return ret;
7518 }
7519 
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* read position in the specification */
    char *pend;         /* end of the specification */
};
7527 
7528 static unsigned int
7529 trnext(struct tr *t, rb_encoding *enc)
7530 {
7531  int n;
7532 
7533  for (;;) {
7534  nextpart:
7535  if (!t->gen) {
7536  if (t->p == t->pend) return -1;
7537  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7538  t->p += n;
7539  }
7540  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7541  t->p += n;
7542  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7543  t->p += n;
7544  if (t->p < t->pend) {
7545  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7546  t->p += n;
7547  if (t->now > c) {
7548  if (t->now < 0x80 && c < 0x80) {
7550  "invalid range \"%c-%c\" in string transliteration",
7551  t->now, c);
7552  }
7553  else {
7554  rb_raise(rb_eArgError, "invalid range in string transliteration");
7555  }
7556  continue; /* not reached */
7557  }
7558  t->gen = 1;
7559  t->max = c;
7560  }
7561  }
7562  return t->now;
7563  }
7564  else {
7565  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7566  if (t->now == t->max) {
7567  t->gen = 0;
7568  goto nextpart;
7569  }
7570  }
7571  if (t->now < t->max) {
7572  return t->now;
7573  }
7574  else {
7575  t->gen = 0;
7576  return t->max;
7577  }
7578  }
7579  }
7580 }
7581 
7582 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7583 
7584 static VALUE
7585 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7586 {
7587  const unsigned int errc = -1;
7588  unsigned int trans[256];
7589  rb_encoding *enc, *e1, *e2;
7590  struct tr trsrc, trrepl;
7591  int cflag = 0;
7592  unsigned int c, c0, last = 0;
7593  int modify = 0, i, l;
7594  unsigned char *s, *send;
7595  VALUE hash = 0;
7596  int singlebyte = single_byte_optimizable(str);
7597  int termlen;
7598  int cr;
7599 
7600 #define CHECK_IF_ASCII(c) \
7601  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7602  (cr = ENC_CODERANGE_VALID) : 0)
7603 
7604  StringValue(src);
7605  StringValue(repl);
7606  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7607  if (RSTRING_LEN(repl) == 0) {
7608  return rb_str_delete_bang(1, &src, str);
7609  }
7610 
7611  cr = ENC_CODERANGE(str);
7612  e1 = rb_enc_check(str, src);
7613  e2 = rb_enc_check(str, repl);
7614  if (e1 == e2) {
7615  enc = e1;
7616  }
7617  else {
7618  enc = rb_enc_check(src, repl);
7619  }
7620  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7621  if (RSTRING_LEN(src) > 1 &&
7622  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7623  trsrc.p + l < trsrc.pend) {
7624  cflag = 1;
7625  trsrc.p += l;
7626  }
7627  trrepl.p = RSTRING_PTR(repl);
7628  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7629  trsrc.gen = trrepl.gen = 0;
7630  trsrc.now = trrepl.now = 0;
7631  trsrc.max = trrepl.max = 0;
7632 
7633  if (cflag) {
7634  for (i=0; i<256; i++) {
7635  trans[i] = 1;
7636  }
7637  while ((c = trnext(&trsrc, enc)) != errc) {
7638  if (c < 256) {
7639  trans[c] = errc;
7640  }
7641  else {
7642  if (!hash) hash = rb_hash_new();
7643  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7644  }
7645  }
7646  while ((c = trnext(&trrepl, enc)) != errc)
7647  /* retrieve last replacer */;
7648  last = trrepl.now;
7649  for (i=0; i<256; i++) {
7650  if (trans[i] != errc) {
7651  trans[i] = last;
7652  }
7653  }
7654  }
7655  else {
7656  unsigned int r;
7657 
7658  for (i=0; i<256; i++) {
7659  trans[i] = errc;
7660  }
7661  while ((c = trnext(&trsrc, enc)) != errc) {
7662  r = trnext(&trrepl, enc);
7663  if (r == errc) r = trrepl.now;
7664  if (c < 256) {
7665  trans[c] = r;
7666  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7667  }
7668  else {
7669  if (!hash) hash = rb_hash_new();
7670  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7671  }
7672  }
7673  }
7674 
7675  if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7676  cr = ENC_CODERANGE_7BIT;
7677  str_modify_keep_cr(str);
7678  s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7679  termlen = rb_enc_mbminlen(enc);
7680  if (sflag) {
7681  int clen, tlen;
7682  long offset, max = RSTRING_LEN(str);
7683  unsigned int save = -1;
7684  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7685 
7686  while (s < send) {
7687  int may_modify = 0;
7688 
7689  c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7690  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7691 
7692  s += clen;
7693  if (c < 256) {
7694  c = trans[c];
7695  }
7696  else if (hash) {
7697  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7698  if (NIL_P(tmp)) {
7699  if (cflag) c = last;
7700  else c = errc;
7701  }
7702  else if (cflag) c = errc;
7703  else c = NUM2INT(tmp);
7704  }
7705  else {
7706  c = errc;
7707  }
7708  if (c != (unsigned int)-1) {
7709  if (save == c) {
7710  CHECK_IF_ASCII(c);
7711  continue;
7712  }
7713  save = c;
7714  tlen = rb_enc_codelen(c, enc);
7715  modify = 1;
7716  }
7717  else {
7718  save = -1;
7719  c = c0;
7720  if (enc != e1) may_modify = 1;
7721  }
7722  if ((offset = t - buf) + tlen > max) {
7723  size_t MAYBE_UNUSED(old) = max + termlen;
7724  max = offset + tlen + (send - s);
7725  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7726  t = buf + offset;
7727  }
7728  rb_enc_mbcput(c, t, enc);
7729  if (may_modify && memcmp(s, t, tlen) != 0) {
7730  modify = 1;
7731  }
7732  CHECK_IF_ASCII(c);
7733  t += tlen;
7734  }
7735  if (!STR_EMBED_P(str)) {
7736  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7737  }
7738  TERM_FILL((char *)t, termlen);
7739  RSTRING(str)->as.heap.ptr = (char *)buf;
7740  RSTRING(str)->as.heap.len = t - buf;
7741  STR_SET_NOEMBED(str);
7742  RSTRING(str)->as.heap.aux.capa = max;
7743  }
7744  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7745  while (s < send) {
7746  c = (unsigned char)*s;
7747  if (trans[c] != errc) {
7748  if (!cflag) {
7749  c = trans[c];
7750  *s = c;
7751  modify = 1;
7752  }
7753  else {
7754  *s = last;
7755  modify = 1;
7756  }
7757  }
7758  CHECK_IF_ASCII(c);
7759  s++;
7760  }
7761  }
7762  else {
7763  int clen, tlen;
7764  long offset, max = (long)((send - s) * 1.2);
7765  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7766 
7767  while (s < send) {
7768  int may_modify = 0;
7769  c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7770  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7771 
7772  if (c < 256) {
7773  c = trans[c];
7774  }
7775  else if (hash) {
7776  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7777  if (NIL_P(tmp)) {
7778  if (cflag) c = last;
7779  else c = errc;
7780  }
7781  else if (cflag) c = errc;
7782  else c = NUM2INT(tmp);
7783  }
7784  else {
7785  c = cflag ? last : errc;
7786  }
7787  if (c != errc) {
7788  tlen = rb_enc_codelen(c, enc);
7789  modify = 1;
7790  }
7791  else {
7792  c = c0;
7793  if (enc != e1) may_modify = 1;
7794  }
7795  if ((offset = t - buf) + tlen > max) {
7796  size_t MAYBE_UNUSED(old) = max + termlen;
7797  max = offset + tlen + (long)((send - s) * 1.2);
7798  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7799  t = buf + offset;
7800  }
7801  if (s != t) {
7802  rb_enc_mbcput(c, t, enc);
7803  if (may_modify && memcmp(s, t, tlen) != 0) {
7804  modify = 1;
7805  }
7806  }
7807  CHECK_IF_ASCII(c);
7808  s += clen;
7809  t += tlen;
7810  }
7811  if (!STR_EMBED_P(str)) {
7812  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7813  }
7814  TERM_FILL((char *)t, termlen);
7815  RSTRING(str)->as.heap.ptr = (char *)buf;
7816  RSTRING(str)->as.heap.len = t - buf;
7817  STR_SET_NOEMBED(str);
7818  RSTRING(str)->as.heap.aux.capa = max;
7819  }
7820 
7821  if (modify) {
7822  if (cr != ENC_CODERANGE_BROKEN)
7823  ENC_CODERANGE_SET(str, cr);
7824  rb_enc_associate(str, enc);
7825  return str;
7826  }
7827  return Qnil;
7828 }
7829 
7830 
7831 /*
7832  * call-seq:
7833  * str.tr!(from_str, to_str) -> str or nil
7834  *
7835  * Translates <i>str</i> in place, using the same rules as
7836  * String#tr. Returns <i>str</i>, or <code>nil</code> if no changes
7837  * were made.
7838  */
7839 
7840 static VALUE
7841 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
7842 {
7843  return tr_trans(str, src, repl, 0);
7844 }
7845 
7846 
7847 /*
7848  * call-seq:
7849  * str.tr(from_str, to_str) => new_str
7850  *
7851  * Returns a copy of +str+ with the characters in +from_str+ replaced by the
7852  * corresponding characters in +to_str+. If +to_str+ is shorter than
7853  * +from_str+, it is padded with its last character in order to maintain the
7854  * correspondence.
7855  *
7856  * "hello".tr('el', 'ip') #=> "hippo"
7857  * "hello".tr('aeiou', '*') #=> "h*ll*"
7858  * "hello".tr('aeiou', 'AA*') #=> "hAll*"
7859  *
7860  * Both strings may use the <code>c1-c2</code> notation to denote ranges of
7861  * characters, and +from_str+ may start with a <code>^</code>, which denotes
7862  * all characters except those listed.
7863  *
7864  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
7865  * "hello".tr('^aeiou', '*') #=> "*e**o"
7866  *
7867  * The backslash character <code>\</code> can be used to escape
7868  * <code>^</code> or <code>-</code> and is otherwise ignored unless it
7869  * appears at the end of a range or the end of the +from_str+ or +to_str+:
7870  *
7871  * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
7872  * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
7873  *
7874  * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
7875  * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
7876  * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
7877  *
7878  * "X['\\b']".tr("X\\", "") #=> "['b']"
7879  * "X['\\b']".tr("X-\\]", "") #=> "'b'"
7880  */
7881 
7882 static VALUE
7883 rb_str_tr(VALUE str, VALUE src, VALUE repl)
7884 {
7885  str = str_duplicate(rb_cString, str);
7886  tr_trans(str, src, repl, 0);
7887  return str;
7888 }
7889 
7890 #define TR_TABLE_MAX (UCHAR_MAX+1)
7891 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
7892 static void
7893 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
7894  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
7895 {
7896  const unsigned int errc = -1;
7897  char buf[TR_TABLE_MAX];
7898  struct tr tr;
7899  unsigned int c;
7900  VALUE table = 0, ptable = 0;
7901  int i, l, cflag = 0;
7902 
7903  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
7904  tr.gen = tr.now = tr.max = 0;
7905 
7906  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
7907  cflag = 1;
7908  tr.p += l;
7909  }
7910  if (first) {
7911  for (i=0; i<TR_TABLE_MAX; i++) {
7912  stable[i] = 1;
7913  }
7914  stable[TR_TABLE_MAX] = cflag;
7915  }
7916  else if (stable[TR_TABLE_MAX] && !cflag) {
7917  stable[TR_TABLE_MAX] = 0;
7918  }
7919  for (i=0; i<TR_TABLE_MAX; i++) {
7920  buf[i] = cflag;
7921  }
7922 
7923  while ((c = trnext(&tr, enc)) != errc) {
7924  if (c < TR_TABLE_MAX) {
7925  buf[(unsigned char)c] = !cflag;
7926  }
7927  else {
7928  VALUE key = UINT2NUM(c);
7929 
7930  if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
7931  if (cflag) {
7932  ptable = *ctablep;
7933  table = ptable ? ptable : rb_hash_new();
7934  *ctablep = table;
7935  }
7936  else {
7937  table = rb_hash_new();
7938  ptable = *tablep;
7939  *tablep = table;
7940  }
7941  }
7942  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
7943  rb_hash_aset(table, key, Qtrue);
7944  }
7945  }
7946  }
7947  for (i=0; i<TR_TABLE_MAX; i++) {
7948  stable[i] = stable[i] && buf[i];
7949  }
7950  if (!table && !cflag) {
7951  *tablep = 0;
7952  }
7953 }
7954 
7955 
7956 static int
7957 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
7958 {
7959  if (c < TR_TABLE_MAX) {
7960  return table[c] != 0;
7961  }
7962  else {
7963  VALUE v = UINT2NUM(c);
7964 
7965  if (del) {
7966  if (!NIL_P(rb_hash_lookup(del, v)) &&
7967  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
7968  return TRUE;
7969  }
7970  }
7971  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
7972  return FALSE;
7973  }
7974  return table[TR_TABLE_MAX] ? TRUE : FALSE;
7975  }
7976 }
7977 
7978 /*
7979  * call-seq:
7980  * str.delete!([other_str]+) -> str or nil
7981  *
7982  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
7983  * <code>nil</code> if <i>str</i> was not modified.
7984  */
7985 
7986 static VALUE
7987 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
7988 {
7989  char squeez[TR_TABLE_SIZE];
7990  rb_encoding *enc = 0;
7991  char *s, *send, *t;
7992  VALUE del = 0, nodel = 0;
7993  int modify = 0;
7994  int i, ascompat, cr;
7995 
7996  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7998  for (i=0; i<argc; i++) {
7999  VALUE s = argv[i];
8000 
8001  StringValue(s);
8002  enc = rb_enc_check(str, s);
8003  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8004  }
8005 
8006  str_modify_keep_cr(str);
8007  ascompat = rb_enc_asciicompat(enc);
8008  s = t = RSTRING_PTR(str);
8009  send = RSTRING_END(str);
8010  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8011  while (s < send) {
8012  unsigned int c;
8013  int clen;
8014 
8015  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8016  if (squeez[c]) {
8017  modify = 1;
8018  }
8019  else {
8020  if (t != s) *t = c;
8021  t++;
8022  }
8023  s++;
8024  }
8025  else {
8026  c = rb_enc_codepoint_len(s, send, &clen, enc);
8027 
8028  if (tr_find(c, squeez, del, nodel)) {
8029  modify = 1;
8030  }
8031  else {
8032  if (t != s) rb_enc_mbcput(c, t, enc);
8033  t += clen;
8034  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8035  }
8036  s += clen;
8037  }
8038  }
8039  TERM_FILL(t, TERM_LEN(str));
8040  STR_SET_LEN(str, t - RSTRING_PTR(str));
8041  ENC_CODERANGE_SET(str, cr);
8042 
8043  if (modify) return str;
8044  return Qnil;
8045 }
8046 
8047 
8048 /*
8049  * call-seq:
8050  * str.delete([other_str]+) -> new_str
8051  *
8052  * Returns a copy of <i>str</i> with all characters in the intersection of its
8053  * arguments deleted. Uses the same rules for building the set of characters as
8054  * String#count.
8055  *
8056  * "hello".delete "l","lo" #=> "heo"
8057  * "hello".delete "lo" #=> "he"
8058  * "hello".delete "aeiou", "^e" #=> "hell"
8059  * "hello".delete "ej-m" #=> "ho"
8060  */
8061 
8062 static VALUE
8063 rb_str_delete(int argc, VALUE *argv, VALUE str)
8064 {
8065  str = str_duplicate(rb_cString, str);
8066  rb_str_delete_bang(argc, argv, str);
8067  return str;
8068 }
8069 
8070 
8071 /*
8072  * call-seq:
8073  * str.squeeze!([other_str]*) -> str or nil
8074  *
8075  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
8076  * <code>nil</code> if no changes were made.
8077  */
8078 
8079 static VALUE
8080 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8081 {
8082  char squeez[TR_TABLE_SIZE];
8083  rb_encoding *enc = 0;
8084  VALUE del = 0, nodel = 0;
8085  unsigned char *s, *send, *t;
8086  int i, modify = 0;
8087  int ascompat, singlebyte = single_byte_optimizable(str);
8088  unsigned int save;
8089 
8090  if (argc == 0) {
8091  enc = STR_ENC_GET(str);
8092  }
8093  else {
8094  for (i=0; i<argc; i++) {
8095  VALUE s = argv[i];
8096 
8097  StringValue(s);
8098  enc = rb_enc_check(str, s);
8099  if (singlebyte && !single_byte_optimizable(s))
8100  singlebyte = 0;
8101  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8102  }
8103  }
8104 
8105  str_modify_keep_cr(str);
8106  s = t = (unsigned char *)RSTRING_PTR(str);
8107  if (!s || RSTRING_LEN(str) == 0) return Qnil;
8108  send = (unsigned char *)RSTRING_END(str);
8109  save = -1;
8110  ascompat = rb_enc_asciicompat(enc);
8111 
8112  if (singlebyte) {
8113  while (s < send) {
8114  unsigned int c = *s++;
8115  if (c != save || (argc > 0 && !squeez[c])) {
8116  *t++ = save = c;
8117  }
8118  }
8119  }
8120  else {
8121  while (s < send) {
8122  unsigned int c;
8123  int clen;
8124 
8125  if (ascompat && (c = *s) < 0x80) {
8126  if (c != save || (argc > 0 && !squeez[c])) {
8127  *t++ = save = c;
8128  }
8129  s++;
8130  }
8131  else {
8132  c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8133 
8134  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8135  if (t != s) rb_enc_mbcput(c, t, enc);
8136  save = c;
8137  t += clen;
8138  }
8139  s += clen;
8140  }
8141  }
8142  }
8143 
8144  TERM_FILL((char *)t, TERM_LEN(str));
8145  if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8146  STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8147  modify = 1;
8148  }
8149 
8150  if (modify) return str;
8151  return Qnil;
8152 }
8153 
8154 
8155 /*
8156  * call-seq:
8157  * str.squeeze([other_str]*) -> new_str
8158  *
8159  * Builds a set of characters from the <i>other_str</i> parameter(s)
8160  * using the procedure described for String#count. Returns a new
8161  * string where runs of the same character that occur in this set are
8162  * replaced by a single character. If no arguments are given, all
8163  * runs of identical characters are replaced by a single character.
8164  *
8165  * "yellow moon".squeeze #=> "yelow mon"
8166  * " now is the".squeeze(" ") #=> " now is the"
8167  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8168  */
8169 
8170 static VALUE
8171 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8172 {
8173  str = str_duplicate(rb_cString, str);
8174  rb_str_squeeze_bang(argc, argv, str);
8175  return str;
8176 }
8177 
8178 
8179 /*
8180  * call-seq:
8181  * str.tr_s!(from_str, to_str) -> str or nil
8182  *
8183  * Performs String#tr_s processing on <i>str</i> in place,
8184  * returning <i>str</i>, or <code>nil</code> if no changes were made.
8185  */
8186 
8187 static VALUE
8188 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8189 {
8190  return tr_trans(str, src, repl, 1);
8191 }
8192 
8193 
8194 /*
8195  * call-seq:
8196  * str.tr_s(from_str, to_str) -> new_str
8197  *
8198  * Processes a copy of <i>str</i> as described under String#tr, then
8199  * removes duplicate characters in regions that were affected by the
8200  * translation.
8201  *
8202  * "hello".tr_s('l', 'r') #=> "hero"
8203  * "hello".tr_s('el', '*') #=> "h*o"
8204  * "hello".tr_s('el', 'hx') #=> "hhxo"
8205  */
8206 
8207 static VALUE
8208 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8209 {
8210  str = str_duplicate(rb_cString, str);
8211  tr_trans(str, src, repl, 1);
8212  return str;
8213 }
8214 
8215 
8216 /*
8217  * call-seq:
8218  * str.count([other_str]+) -> integer
8219  *
8220  * Each +other_str+ parameter defines a set of characters to count. The
8221  * intersection of these sets defines the characters to count in +str+. Any
8222  * +other_str+ that starts with a caret <code>^</code> is negated. The
8223  * sequence <code>c1-c2</code> means all characters between c1 and c2. The
8224  * backslash character <code>\</code> can be used to escape <code>^</code> or
8225  * <code>-</code> and is otherwise ignored unless it appears at the end of a
8226  * sequence or the end of a +other_str+.
8227  *
8228  * a = "hello world"
8229  * a.count "lo" #=> 5
8230  * a.count "lo", "o" #=> 2
8231  * a.count "hello", "^l" #=> 4
8232  * a.count "ej-m" #=> 4
8233  *
8234  * "hello^world".count "\\^aeiou" #=> 4
8235  * "hello-world".count "a\\-eo" #=> 4
8236  *
8237  * c = "hello world\\r\\n"
8238  * c.count "\\" #=> 2
8239  * c.count "\\A" #=> 0
8240  * c.count "X-\\w" #=> 3
8241  */
8242 
8243 static VALUE
8244 rb_str_count(int argc, VALUE *argv, VALUE str)
8245 {
8246  char table[TR_TABLE_SIZE];
8247  rb_encoding *enc = 0;
8248  VALUE del = 0, nodel = 0, tstr;
8249  char *s, *send;
8250  int i;
8251  int ascompat;
8252  size_t n = 0;
8253 
8255 
8256  tstr = argv[0];
8257  StringValue(tstr);
8258  enc = rb_enc_check(str, tstr);
8259  if (argc == 1) {
8260  const char *ptstr;
8261  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8262  (ptstr = RSTRING_PTR(tstr),
8263  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8264  !is_broken_string(str)) {
8265  int clen;
8266  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8267 
8268  s = RSTRING_PTR(str);
8269  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8270  send = RSTRING_END(str);
8271  while (s < send) {
8272  if (*(unsigned char*)s++ == c) n++;
8273  }
8274  return SIZET2NUM(n);
8275  }
8276  }
8277 
8278  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8279  for (i=1; i<argc; i++) {
8280  tstr = argv[i];
8281  StringValue(tstr);
8282  enc = rb_enc_check(str, tstr);
8283  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8284  }
8285 
8286  s = RSTRING_PTR(str);
8287  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8288  send = RSTRING_END(str);
8289  ascompat = rb_enc_asciicompat(enc);
8290  while (s < send) {
8291  unsigned int c;
8292 
8293  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8294  if (table[c]) {
8295  n++;
8296  }
8297  s++;
8298  }
8299  else {
8300  int clen;
8301  c = rb_enc_codepoint_len(s, send, &clen, enc);
8302  if (tr_find(c, table, del, nodel)) {
8303  n++;
8304  }
8305  s += clen;
8306  }
8307  }
8308 
8309  return SIZET2NUM(n);
8310 }
8311 
8312 static VALUE
8313 rb_fs_check(VALUE val)
8314 {
8315  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8316  val = rb_check_string_type(val);
8317  if (NIL_P(val)) return 0;
8318  }
8319  return val;
8320 }
8321 
/*
 * Byte-indexed lookup table for ASCII whitespace.  Only the six entries
 * listed below are set; every other entry is implicitly zero.
 */
static const char isspacetable[256] = {
    [0x09] = 1,  /* \t */
    [0x0a] = 1,  /* \n */
    [0x0b] = 1,  /* \v */
    [0x0c] = 1,  /* \f */
    [0x0d] = 1,  /* \r */
    [0x20] = 1   /* space */
};

/* The cast keeps the index in 0..255 even where plain char is signed. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8342 
8343 static long
8344 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8345 {
8346  if (empty_count >= 0 && len == 0) {
8347  return empty_count + 1;
8348  }
8349  if (empty_count > 0) {
8350  /* make different substrings */
8351  if (result) {
8352  do {
8353  rb_ary_push(result, str_new_empty_String(str));
8354  } while (--empty_count > 0);
8355  }
8356  else {
8357  do {
8358  rb_yield(str_new_empty_String(str));
8359  } while (--empty_count > 0);
8360  }
8361  }
8362  str = rb_str_subseq(str, beg, len);
8363  if (result) {
8364  rb_ary_push(result, str);
8365  }
8366  else {
8367  rb_yield(str);
8368  }
8369  return empty_count;
8370 }
8371 
/* Strategy rb_str_split_m uses to find field boundaries. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* literal String separator */
    SPLIT_TYPE_REGEXP,  /* Regexp search */
    SPLIT_TYPE_CHARS    /* empty pattern: one field per character */
} split_type_t;
8375 
8376 static split_type_t
8377 literal_split_pattern(VALUE spat, split_type_t default_type)
8378 {
8379  rb_encoding *enc = STR_ENC_GET(spat);
8380  const char *ptr;
8381  long len;
8382  RSTRING_GETMEM(spat, ptr, len);
8383  if (len == 0) {
8384  /* Special case - split into chars */
8385  return SPLIT_TYPE_CHARS;
8386  }
8387  else if (rb_enc_asciicompat(enc)) {
8388  if (len == 1 && ptr[0] == ' ') {
8389  return SPLIT_TYPE_AWK;
8390  }
8391  }
8392  else {
8393  int l;
8394  if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8395  return SPLIT_TYPE_AWK;
8396  }
8397  }
8398  return default_type;
8399 }
8400 
8401 /*
8402  * call-seq:
8403  * str.split(pattern=nil, [limit]) -> an_array
8404  * str.split(pattern=nil, [limit]) {|sub| block } -> str
8405  *
8406  * Divides <i>str</i> into substrings based on a delimiter, returning an array
8407  * of these substrings.
8408  *
8409  * If <i>pattern</i> is a String, then its contents are used as
8410  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
8411  * space, <i>str</i> is split on whitespace, with leading and trailing
8412  * whitespace and runs of contiguous whitespace characters ignored.
8413  *
8414  * If <i>pattern</i> is a Regexp, <i>str</i> is divided where the
8415  * pattern matches. Whenever the pattern matches a zero-length string,
8416  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
8417  * groups, the respective matches will be returned in the array as well.
8418  *
8419  * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
8420  * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
8421  * split on whitespace as if ' ' were specified.
8422  *
8423  * If the <i>limit</i> parameter is omitted, trailing null fields are
8424  * suppressed. If <i>limit</i> is a positive number, at most that number
8425  * of split substrings will be returned (captured groups will be returned
8426  * as well, but are not counted towards the limit).
8427  * If <i>limit</i> is <code>1</code>, the entire
8428  * string is returned as the only entry in an array. If negative, there is no
8429  * limit to the number of fields returned, and trailing null fields are not
8430  * suppressed.
8431  *
8432  * When the input +str+ is empty an empty Array is returned as the string is
8433  * considered to have no fields to split.
8434  *
8435  * " now's the time ".split #=> ["now's", "the", "time"]
8436  * " now's the time ".split(' ') #=> ["now's", "the", "time"]
8437  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
8438  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
8439  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
8440  * "hello".split(//, 3) #=> ["h", "e", "llo"]
8441  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
8442  *
8443  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
8444  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
8445  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
8446  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
8447  *
8448  * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
8449  *
8450  * "".split(',', -1) #=> []
8451  *
8452  * If a block is given, invoke the block with each split substring.
8453  *
8454  */
8455 
8456 static VALUE
8457 rb_str_split_m(int argc, VALUE *argv, VALUE str)
8458 {
8459  rb_encoding *enc;
8460  VALUE spat;
8461  VALUE limit;
8462  split_type_t split_type;
8463  long beg, end, i = 0, empty_count = -1;
8464  int lim = 0;
8465  VALUE result, tmp;
8466 
8467  result = rb_block_given_p() ? Qfalse : Qnil;
8468  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8469  lim = NUM2INT(limit);
8470  if (lim <= 0) limit = Qnil;
8471  else if (lim == 1) {
8472  if (RSTRING_LEN(str) == 0)
8473  return result ? rb_ary_new2(0) : str;
8474  tmp = str_duplicate(rb_cString, str);
8475  if (!result) {
8476  rb_yield(tmp);
8477  return str;
8478  }
8479  return rb_ary_new3(1, tmp);
8480  }
8481  i = 1;
8482  }
8483  if (NIL_P(limit) && !lim) empty_count = 0;
8484 
8485  enc = STR_ENC_GET(str);
8486  split_type = SPLIT_TYPE_REGEXP;
8487  if (!NIL_P(spat)) {
8488  spat = get_pat_quoted(spat, 0);
8489  }
8490  else if (NIL_P(spat = rb_fs)) {
8491  split_type = SPLIT_TYPE_AWK;
8492  }
8493  else if (!(spat = rb_fs_check(spat))) {
8494  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8495  }
8496  else {
8497  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8498  }
8499  if (split_type != SPLIT_TYPE_AWK) {
8500  switch (BUILTIN_TYPE(spat)) {
8501  case T_REGEXP:
8502  rb_reg_options(spat); /* check if uninitialized */
8503  tmp = RREGEXP_SRC(spat);
8504  split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8505  if (split_type == SPLIT_TYPE_AWK) {
8506  spat = tmp;
8507  split_type = SPLIT_TYPE_STRING;
8508  }
8509  break;
8510 
8511  case T_STRING:
8512  mustnot_broken(spat);
8513  split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8514  break;
8515 
8516  default:
8518  }
8519  }
8520 
8521 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8522 
8523  if (result) result = rb_ary_new();
8524  beg = 0;
8525  char *ptr = RSTRING_PTR(str);
8526  char *eptr = RSTRING_END(str);
8527  if (split_type == SPLIT_TYPE_AWK) {
8528  char *bptr = ptr;
8529  int skip = 1;
8530  unsigned int c;
8531 
8532  end = beg;
8533  if (is_ascii_string(str)) {
8534  while (ptr < eptr) {
8535  c = (unsigned char)*ptr++;
8536  if (skip) {
8537  if (ascii_isspace(c)) {
8538  beg = ptr - bptr;
8539  }
8540  else {
8541  end = ptr - bptr;
8542  skip = 0;
8543  if (!NIL_P(limit) && lim <= i) break;
8544  }
8545  }
8546  else if (ascii_isspace(c)) {
8547  SPLIT_STR(beg, end-beg);
8548  skip = 1;
8549  beg = ptr - bptr;
8550  if (!NIL_P(limit)) ++i;
8551  }
8552  else {
8553  end = ptr - bptr;
8554  }
8555  }
8556  }
8557  else {
8558  while (ptr < eptr) {
8559  int n;
8560 
8561  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8562  ptr += n;
8563  if (skip) {
8564  if (rb_isspace(c)) {
8565  beg = ptr - bptr;
8566  }
8567  else {
8568  end = ptr - bptr;
8569  skip = 0;
8570  if (!NIL_P(limit) && lim <= i) break;
8571  }
8572  }
8573  else if (rb_isspace(c)) {
8574  SPLIT_STR(beg, end-beg);
8575  skip = 1;
8576  beg = ptr - bptr;
8577  if (!NIL_P(limit)) ++i;
8578  }
8579  else {
8580  end = ptr - bptr;
8581  }
8582  }
8583  }
8584  }
8585  else if (split_type == SPLIT_TYPE_STRING) {
8586  char *str_start = ptr;
8587  char *substr_start = ptr;
8588  char *sptr = RSTRING_PTR(spat);
8589  long slen = RSTRING_LEN(spat);
8590 
8591  mustnot_broken(str);
8592  enc = rb_enc_check(str, spat);
8593  while (ptr < eptr &&
8594  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8595  /* Check we are at the start of a char */
8596  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8597  if (t != ptr + end) {
8598  ptr = t;
8599  continue;
8600  }
8601  SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8602  ptr += end + slen;
8603  substr_start = ptr;
8604  if (!NIL_P(limit) && lim <= ++i) break;
8605  }
8606  beg = ptr - str_start;
8607  }
8608  else if (split_type == SPLIT_TYPE_CHARS) {
8609  char *str_start = ptr;
8610  int n;
8611 
8612  mustnot_broken(str);
8613  enc = rb_enc_get(str);
8614  while (ptr < eptr &&
8615  (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8616  SPLIT_STR(ptr - str_start, n);
8617  ptr += n;
8618  if (!NIL_P(limit) && lim <= ++i) break;
8619  }
8620  beg = ptr - str_start;
8621  }
8622  else {
8623  long len = RSTRING_LEN(str);
8624  long start = beg;
8625  long idx;
8626  int last_null = 0;
8627  struct re_registers *regs;
8628  VALUE match = 0;
8629 
8630  for (; rb_reg_search(spat, str, start, 0) >= 0;
8631  (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8632  match = rb_backref_get();
8633  if (!result) rb_match_busy(match);
8634  regs = RMATCH_REGS(match);
8635  end = BEG(0);
8636  if (start == end && BEG(0) == END(0)) {
8637  if (!ptr) {
8638  SPLIT_STR(0, 0);
8639  break;
8640  }
8641  else if (last_null == 1) {
8642  SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8643  beg = start;
8644  }
8645  else {
8646  if (start == len)
8647  start++;
8648  else
8649  start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8650  last_null = 1;
8651  continue;
8652  }
8653  }
8654  else {
8655  SPLIT_STR(beg, end-beg);
8656  beg = start = END(0);
8657  }
8658  last_null = 0;
8659 
8660  for (idx=1; idx < regs->num_regs; idx++) {
8661  if (BEG(idx) == -1) continue;
8662  SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8663  }
8664  if (!NIL_P(limit) && lim <= ++i) break;
8665  }
8666  if (match) rb_match_unbusy(match);
8667  }
8668  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8669  SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8670  }
8671 
8672  return result ? result : str;
8673 }
8674 
8675 VALUE
8676 rb_str_split(VALUE str, const char *sep0)
8677 {
8678  VALUE sep;
8679 
8680  StringValue(str);
8681  sep = rb_str_new_cstr(sep0);
8682  return rb_str_split_m(1, &sep, str);
8683 }
8684 
/* Evaluates to a new array with capacity `size` when no block was given,
 * and to 0 when a block is present.  The `m` argument is not used. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8686 
8687 static inline int
8688 enumerator_element(VALUE ary, VALUE e)
8689 {
8690  if (ary) {
8691  rb_ary_push(ary, e);
8692  return 0;
8693  }
8694  else {
8695  rb_yield(e);
8696  return 1;
8697  }
8698 }
8699 
8700 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8701 
8702 static const char *
8703 chomp_newline(const char *p, const char *e, rb_encoding *enc)
8704 {
8705  const char *prev = rb_enc_prev_char(p, e, e, enc);
8706  if (rb_enc_is_newline(prev, e, enc)) {
8707  e = prev;
8708  prev = rb_enc_prev_char(p, e, e, enc);
8709  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8710  e = prev;
8711  }
8712  return e;
8713 }
8714 
8715 static VALUE
8716 get_rs(void)
8717 {
8718  VALUE rs = rb_rs;
8719  if (!NIL_P(rs) &&
8720  (!RB_TYPE_P(rs, T_STRING) ||
8721  RSTRING_LEN(rs) != 1 ||
8722  RSTRING_PTR(rs)[0] != '\n')) {
8723  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8724  }
8725  return rs;
8726 }
8727 
8728 #define rb_rs get_rs()
8729 
/*
 * Shared worker called by rb_str_each_line and rb_str_lines.
 *
 * argc/argv carry an optional separator (rb_rs is used when it is omitted)
 * plus an optional +chomp+ keyword.  +ary+ is either an array that receives
 * every line, or 0, in which case every line is yielded (see ENUM_ELEM).
 *
 * Returns +ary+ when collecting, otherwise the original receiver.
 */
8730 static VALUE
8731 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8732 {
8733  rb_encoding *enc;
8734  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8735  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8736  long pos, len, rslen;
8737  int rsnewline = 0;
8738 
 /* no positional separator given: fall back to rb_rs */
8739  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8740  rs = rb_rs;
8741  if (!NIL_P(opts)) {
8742  static ID keywords[1];
8743  if (!keywords[0]) {
8744  keywords[0] = rb_intern_const("chomp");
8745  }
8746  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
 /* from here on chomp is a plain C truth value, not a Ruby object */
8747  chomp = (chomp != Qundef && RTEST(chomp));
8748  }
8749 
 /* nil separator: the whole string is the single element */
8750  if (NIL_P(rs)) {
8751  if (!ENUM_ELEM(ary, str)) {
8752  return ary;
8753  }
8754  else {
8755  return orig;
8756  }
8757  }
8758 
8759  if (!RSTRING_LEN(str)) goto end;
 /* iterate over the result of rb_str_new_frozen(); ptr and len are kept
  * so they can be passed to str_mod_check() after each yield */
8760  str = rb_str_new_frozen(str);
8761  ptr = subptr = RSTRING_PTR(str);
8762  pend = RSTRING_END(str);
8763  len = RSTRING_LEN(str);
8764  StringValue(rs);
8765  rslen = RSTRING_LEN(rs);
8766 
 /* the default separator skips the compatibility check and simply uses
  * the receiver's encoding */
8767  if (rs == rb_default_rs)
8768  enc = rb_enc_get(str);
8769  else
8770  enc = rb_enc_check(str, rs);
8771 
8772  if (rslen == 0) {
8773  /* paragraph mode */
8774  int n;
8775  const char *eol = NULL;
8776  subend = subptr;
8777  while (subend < pend) {
 /* inner loop: advance subend one character at a time (a '\r' directly
  * before the examined character is consumed with it); it stops at end
  * of input or at a newline found exactly at eol.
  * NOTE(review): eol appears to mark the position right after the previous
  * newline, so this break fires on a second consecutive newline -- confirm. */
8778  do {
8779  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8780  n = 0;
8781  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8782  if (rb_enc_is_newline(subend + n, pend, enc)) {
8783  if (eol == subend) break;
8784  subend += rslen;
8785  if (subptr) eol = subend;
8786  }
8787  else {
8788  if (!subptr) subptr = subend;
8789  subend += rslen;
8790  }
8791  rslen = 0;
8792  } while (subend < pend);
 /* subptr is NULL when no non-newline character was seen since the
  * previous paragraph: nothing left to emit */
8793  if (!subptr) break;
 /* rslen is non-zero only when the loop above was left via break, i.e.
  * it is the length of the newline that ended the paragraph */
8794  line = rb_str_subseq(str, subptr - ptr,
8795  subend - subptr + (chomp ? 0 : rslen));
8796  if (ENUM_ELEM(ary, line)) {
8797  str_mod_check(str, ptr, len);
8798  }
8799  subptr = eol = NULL;
8800  }
8801  goto end;
8802  }
8803  else {
8804  rsptr = RSTRING_PTR(rs);
 /* rsnewline: the separator is exactly one minimum-length character and
  * that character is a newline in enc */
8805  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8806  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8807  rsnewline = 1;
8808  }
8809  }
8810 
 /* default separator with an encoding that is not ASCII-compatible:
  * search for a copy of the separator re-encoded into enc */
8811  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
8812  rs = rb_str_new(rsptr, rslen);
8813  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
8814  rsptr = RSTRING_PTR(rs);
8815  rslen = RSTRING_LEN(rs);
8816  }
8817 
 /* main loop: find the next separator occurrence, emit the text up to and
  * including it (or without it when chomp is set) */
8818  while (subptr < pend) {
8819  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
8820  if (pos < 0) break;
8821  hit = subptr + pos;
 /* a byte-level hit that does not start on a character boundary is not a
  * real match: resume the search from the next character head */
8822  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
8823  if (hit != adjusted) {
8824  subptr = adjusted;
8825  continue;
8826  }
8827  subend = hit += rslen;
8828  if (chomp) {
8829  if (rsnewline) {
8830  subend = chomp_newline(subptr, subend, enc);
8831  }
8832  else {
8833  subend -= rslen;
8834  }
8835  }
8836  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
8837  if (ENUM_ELEM(ary, line)) {
8838  str_mod_check(str, ptr, len);
8839  }
8840  subptr = hit;
8841  }
8842 
 /* text after the last separator (if any) forms the final line */
8843  if (subptr != pend) {
8844  if (chomp) {
8845  if (rsnewline) {
8846  pend = chomp_newline(subptr, pend, enc);
8847  }
8848  else if (pend - subptr >= rslen &&
8849  memcmp(pend - rslen, rsptr, rslen) == 0) {
8850  pend -= rslen;
8851  }
8852  }
8853  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
8854  ENUM_ELEM(ary, line);
8855  RB_GC_GUARD(str);
8856  }
8857 
8858  end:
8859  if (ary)
8860  return ary;
8861  else
8862  return orig;
8863 }
8864 
8865 /*
8866  * call-seq:
8867  * str.each_line(separator=$/, chomp: false) {|substr| block } -> str
8868  * str.each_line(separator=$/, chomp: false) -> an_enumerator
8869  *
8870  * Splits <i>str</i> using the supplied parameter as the record
8871  * separator (<code>$/</code> by default), passing each substring in
8872  * turn to the supplied block. If a zero-length record separator is
8873  * supplied, the string is split into paragraphs delimited by
8874  * multiple successive newlines.
8875  *
8876  * If +chomp+ is +true+, +separator+ will be removed from the end of each
8877  * line.
8878  *
8879  * If no block is given, an enumerator is returned instead.
8880  *
8881  * "hello\nworld".each_line {|s| p s}
8882  * # prints:
8883  * # "hello\n"
8884  * # "world"
8885  *
8886  * "hello\nworld".each_line('l') {|s| p s}
8887  * # prints:
8888  * # "hel"
8889  * # "l"
8890  * # "o\nworl"
8891  * # "d"
8892  *
8893  * "hello\n\n\nworld".each_line('') {|s| p s}
8894  * # prints
8895  * # "hello\n\n"
8896  * # "world"
8897  *
8898  * "hello\nworld".each_line(chomp: true) {|s| p s}
8899  * # prints:
8900  * # "hello"
8901  * # "world"
8902  *
8903  * "hello\nworld".each_line('l', chomp: true) {|s| p s}
8904  * # prints:
8905  * # "he"
8906  * # ""
8907  * # "o\nwor"
8908  * # "d"
8909  *
8910  */
8911 
8912 static VALUE
8913 rb_str_each_line(int argc, VALUE *argv, VALUE str)
8914 {
8915  RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
8916  return rb_str_enumerate_lines(argc, argv, str, 0);
8917 }
8918 
8919 /*
8920  * call-seq:
8921  * str.lines(separator=$/, chomp: false) -> an_array
8922  *
8923  * Returns an array of lines in <i>str</i> split using the supplied
8924  * record separator (<code>$/</code> by default). This is a
8925  * shorthand for <code>str.each_line(separator, getline_args).to_a</code>.
8926  *
8927  * If +chomp+ is +true+, +separator+ will be removed from the end of each
8928  * line.
8929  *
8930  * "hello\nworld\n".lines #=> ["hello\n", "world\n"]
8931  * "hello world".lines(' ') #=> ["hello ", " ", "world"]
8932  * "hello\nworld\n".lines(chomp: true) #=> ["hello", "world"]
8933  *
8934  * If a block is given, which is a deprecated form, works the same as
8935  * <code>each_line</code>.
8936  */
8937 
8938 static VALUE
8939 rb_str_lines(int argc, VALUE *argv, VALUE str)
8940 {
8941  VALUE ary = WANTARRAY("lines", 0);
8942  return rb_str_enumerate_lines(argc, argv, str, ary);
8943 }
8944 
8945 static VALUE
8946 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
8947 {
8948  return LONG2FIX(RSTRING_LEN(str));
8949 }
8950 
8951 static VALUE
8952 rb_str_enumerate_bytes(VALUE str, VALUE ary)
8953 {
8954  long i;
8955 
8956  for (i=0; i<RSTRING_LEN(str); i++) {
8957  ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
8958  }
8959  if (ary)
8960  return ary;
8961  else
8962  return str;
8963 }
8964 
8965 /*
8966  * call-seq:
8967  * str.each_byte {|integer| block } -> str
8968  * str.each_byte -> an_enumerator
8969  *
8970  * Passes each byte in <i>str</i> to the given block, or returns an
8971  * enumerator if no block is given.
8972  *
8973  * "hello".each_byte {|c| print c, ' ' }
8974  *
8975  * <em>produces:</em>
8976  *
8977  * 104 101 108 108 111
8978  */
8979 
8980 static VALUE
8981 rb_str_each_byte(VALUE str)
8982 {
8983  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
8984  return rb_str_enumerate_bytes(str, 0);
8985 }
8986 
8987 /*
8988  * call-seq:
8989  * str.bytes -> an_array
8990  *
8991  * Returns an array of bytes in <i>str</i>. This is a shorthand for
8992  * <code>str.each_byte.to_a</code>.
8993  *
8994  * If a block is given, which is a deprecated form, works the same as
8995  * <code>each_byte</code>.
8996  */
8997 
8998 static VALUE
8999 rb_str_bytes(VALUE str)
9000 {
9001  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9002  return rb_str_enumerate_bytes(str, ary);
9003 }
9004 
9005 static VALUE
9006 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9007 {
9008  return rb_str_length(str);
9009 }
9010 
9011 static VALUE
9012 rb_str_enumerate_chars(VALUE str, VALUE ary)
9013 {
9014  VALUE orig = str;
9015  long i, len, n;
9016  const char *ptr;
9017  rb_encoding *enc;
9018 
9019  str = rb_str_new_frozen(str);
9020  ptr = RSTRING_PTR(str);
9021  len = RSTRING_LEN(str);
9022  enc = rb_enc_get(str);
9023 
9025  for (i = 0; i < len; i += n) {
9026  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9027  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9028  }
9029  }
9030  else {
9031  for (i = 0; i < len; i += n) {
9032  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9033  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9034  }
9035  }
9036  RB_GC_GUARD(str);
9037  if (ary)
9038  return ary;
9039  else
9040  return orig;
9041 }
9042 
9043 /*
9044  * call-seq:
9045  * str.each_char {|cstr| block } -> str
9046  * str.each_char -> an_enumerator
9047  *
9048  * Passes each character in <i>str</i> to the given block, or returns
9049  * an enumerator if no block is given.
9050  *
9051  * "hello".each_char {|c| print c, ' ' }
9052  *
9053  * <em>produces:</em>
9054  *
9055  * h e l l o
9056  */
9057 
9058 static VALUE
9059 rb_str_each_char(VALUE str)
9060 {
9061  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9062  return rb_str_enumerate_chars(str, 0);
9063 }
9064 
9065 /*
9066  * call-seq:
9067  * str.chars -> an_array
9068  *
9069  * Returns an array of characters in <i>str</i>. This is a shorthand
9070  * for <code>str.each_char.to_a</code>.
9071  *
9072  * If a block is given, which is a deprecated form, works the same as
9073  * <code>each_char</code>.
9074  */
9075 
9076 static VALUE
9077 rb_str_chars(VALUE str)
9078 {
9079  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9080  return rb_str_enumerate_chars(str, ary);
9081 }
9082 
9083 static VALUE
9084 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9085 {
9086  VALUE orig = str;
9087  int n;
9088  unsigned int c;
9089  const char *ptr, *end;
9090  rb_encoding *enc;
9091 
9092  if (single_byte_optimizable(str))
9093  return rb_str_enumerate_bytes(str, ary);
9094 
9095  str = rb_str_new_frozen(str);
9096  ptr = RSTRING_PTR(str);
9097  end = RSTRING_END(str);
9098  enc = STR_ENC_GET(str);
9099 
9100  while (ptr < end) {
9101  c = rb_enc_codepoint_len(ptr, end, &n, enc);
9102  ENUM_ELEM(ary, UINT2NUM(c));
9103  ptr += n;
9104  }
9105  RB_GC_GUARD(str);
9106  if (ary)
9107  return ary;
9108  else
9109  return orig;
9110 }
9111 
9112 /*
9113  * call-seq:
9114  * str.each_codepoint {|integer| block } -> str
9115  * str.each_codepoint -> an_enumerator
9116  *
9117  * Passes the Integer ordinal of each character in <i>str</i>
9118  * (also known as a <i>codepoint</i> when applied to Unicode strings) to the
9119  * given block. For encodings other than UTF-8/UTF-16(BE|LE)/UTF-32(BE|LE),
9120  * values are directly derived from the binary representation
9121  * of each character.
9122  *
9123  * If no block is given, an enumerator is returned instead.
9124  *
9125  * "hello\u0639".each_codepoint {|c| print c, ' ' }
9126  *
9127  * <em>produces:</em>
9128  *
9129  * 104 101 108 108 111 1593
9130  */
9131 
9132 static VALUE
9133 rb_str_each_codepoint(VALUE str)
9134 {
9135  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9136  return rb_str_enumerate_codepoints(str, 0);
9137 }
9138 
9139 /*
9140  * call-seq:
9141  * str.codepoints -> an_array
9142  *
9143  * Returns an array of the Integer ordinals of the
9144  * characters in <i>str</i>. This is a shorthand for
9145  * <code>str.each_codepoint.to_a</code>.
9146  *
9147  * If a block is given, which is a deprecated form, works the same as
9148  * <code>each_codepoint</code>.
9149  */
9150 
9151 static VALUE
9152 rb_str_codepoints(VALUE str)
9153 {
9154  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9155  return rb_str_enumerate_codepoints(str, ary);
9156 }
9157 
9158 static regex_t *
9159 get_reg_grapheme_cluster(rb_encoding *enc)
9160 {
9161  int encidx = rb_enc_to_index(enc);
9162  regex_t *reg_grapheme_cluster = NULL;
9163  static regex_t *reg_grapheme_cluster_utf8 = NULL;
9164 
9165  /* synchronize */
9166  if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9167  reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9168  }
9169  if (!reg_grapheme_cluster) {
9170  const OnigUChar source_ascii[] = "\\X";
9171  OnigErrorInfo einfo;
9172  const OnigUChar *source = source_ascii;
9173  size_t source_len = sizeof(source_ascii) - 1;
9174  switch (encidx) {
9175 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9176 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9177 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9178 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9179 #define CASE_UTF(e) \
9180  case ENCINDEX_UTF_##e: { \
9181  static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9182  source = source_UTF_##e; \
9183  source_len = sizeof(source_UTF_##e); \
9184  break; \
9185  }
9186  CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9187 #undef CASE_UTF
9188 #undef CHARS_16BE
9189 #undef CHARS_16LE
9190 #undef CHARS_32BE
9191 #undef CHARS_32LE
9192  }
9193  int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9194  ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9195  if (r) {
9196  UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9197  onig_error_code_to_str(message, r, &einfo);
9198  rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9199  }
9200  if (encidx == rb_utf8_encindex()) {
9201  reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9202  }
9203  }
9204  return reg_grapheme_cluster;
9205 }
9206 
9207 static VALUE
9208 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9209 {
9210  size_t grapheme_cluster_count = 0;
9211  regex_t *reg_grapheme_cluster = NULL;
9213  const char *ptr, *end;
9214 
9215  if (!rb_enc_unicode_p(enc)) {
9216  return rb_str_length(str);
9217  }
9218 
9219  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9220  ptr = RSTRING_PTR(str);
9221  end = RSTRING_END(str);
9222 
9223  while (ptr < end) {
9224  OnigPosition len = onig_match(reg_grapheme_cluster,
9225  (const OnigUChar *)ptr, (const OnigUChar *)end,
9226  (const OnigUChar *)ptr, NULL, 0);
9227  if (len <= 0) break;
9228  grapheme_cluster_count++;
9229  ptr += len;
9230  }
9231 
9232  return SIZET2NUM(grapheme_cluster_count);
9233 }
9234 
9235 static VALUE
9236 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9237 {
9238  VALUE orig = str;
9239  regex_t *reg_grapheme_cluster = NULL;
9241  const char *ptr0, *ptr, *end;
9242 
9243  if (!rb_enc_unicode_p(enc)) {
9244  return rb_str_enumerate_chars(str, ary);
9245  }
9246 
9247  if (!ary) str = rb_str_new_frozen(str);
9248  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9249  ptr0 = ptr = RSTRING_PTR(str);
9250  end = RSTRING_END(str);
9251 
9252  while (ptr < end) {
9253  OnigPosition len = onig_match(reg_grapheme_cluster,
9254  (const OnigUChar *)ptr, (const OnigUChar *)end,
9255  (const OnigUChar *)ptr, NULL, 0);
9256  if (len <= 0) break;
9257  ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9258  ptr += len;
9259  }
9260  RB_GC_GUARD(str);
9261  if (ary)
9262  return ary;
9263  else
9264  return orig;
9265 }
9266 
9267 /*
9268  * call-seq:
9269  * str.each_grapheme_cluster {|cstr| block } -> str
9270  * str.each_grapheme_cluster -> an_enumerator
9271  *
9272  * Passes each grapheme cluster in <i>str</i> to the given block, or returns
9273  * an enumerator if no block is given.
9274  * Unlike String#each_char, this enumerates by grapheme clusters defined by
9275  * Unicode Standard Annex #29 http://unicode.org/reports/tr29/
9276  *
9277  * "a\u0300".each_char.to_a.size #=> 2
9278  * "a\u0300".each_grapheme_cluster.to_a.size #=> 1
9279  *
9280  */
9281 
9282 static VALUE
9283 rb_str_each_grapheme_cluster(VALUE str)
9284 {
9285  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9286  return rb_str_enumerate_grapheme_clusters(str, 0);
9287 }
9288 
9289 /*
9290  * call-seq:
9291  * str.grapheme_clusters -> an_array
9292  *
9293  * Returns an array of grapheme clusters in <i>str</i>. This is a shorthand
9294  * for <code>str.each_grapheme_cluster.to_a</code>.
9295  *
9296  * If a block is given, which is a deprecated form, works the same as
9297  * <code>each_grapheme_cluster</code>.
9298  */
9299 
9300 static VALUE
9301 rb_str_grapheme_clusters(VALUE str)
9302 {
9303  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9304  return rb_str_enumerate_grapheme_clusters(str, ary);
9305 }
9306 
9307 static long
9308 chopped_length(VALUE str)
9309 {
9310  rb_encoding *enc = STR_ENC_GET(str);
9311  const char *p, *p2, *beg, *end;
9312 
9313  beg = RSTRING_PTR(str);
9314  end = beg + RSTRING_LEN(str);
9315  if (beg >= end) return 0;
9316  p = rb_enc_prev_char(beg, end, end, enc);
9317  if (!p) return 0;
9318  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9319  p2 = rb_enc_prev_char(beg, p, end, enc);
9320  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9321  }
9322  return p - beg;
9323 }
9324 
9325 /*
9326  * call-seq:
9327  * str.chop! -> str or nil
9328  *
9329  * Processes <i>str</i> as for String#chop, returning <i>str</i>, or
9330  * <code>nil</code> if <i>str</i> is the empty string. See also
9331  * String#chomp!.
9332  */
9333 
9334 static VALUE
9335 rb_str_chop_bang(VALUE str)
9336 {
9337  str_modify_keep_cr(str);
9338  if (RSTRING_LEN(str) > 0) {
9339  long len;
9340  len = chopped_length(str);
9341  STR_SET_LEN(str, len);
9342  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9343  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9344  ENC_CODERANGE_CLEAR(str);
9345  }
9346  return str;
9347  }
9348  return Qnil;
9349 }
9350 
9351 
9352 /*
9353  * call-seq:
9354  * str.chop -> new_str
9355  *
9356  * Returns a new String with the last character removed. If the
9357  * string ends with <code>\r\n</code>, both characters are
9358  * removed. Applying <code>chop</code> to an empty string returns an
9359  * empty string. String#chomp is often a safer alternative, as it
9360  * leaves the string unchanged if it doesn't end in a record
9361  * separator.
9362  *
9363  * "string\r\n".chop #=> "string"
9364  * "string\n\r".chop #=> "string\n"
9365  * "string\n".chop #=> "string"
9366  * "string".chop #=> "strin"
9367  * "x".chop.chop #=> ""
9368  */
9369 
9370 static VALUE
9371 rb_str_chop(VALUE str)
9372 {
9373  return rb_str_subseq(str, 0, chopped_length(str));
9374 }
9375 
9376 static long
9377 smart_chomp(VALUE str, const char *e, const char *p)
9378 {
9379  rb_encoding *enc = rb_enc_get(str);
9380  if (rb_enc_mbminlen(enc) > 1) {
9381  const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9382  if (rb_enc_is_newline(pp, e, enc)) {
9383  e = pp;
9384  }
9385  pp = e - rb_enc_mbminlen(enc);
9386  if (pp >= p) {
9387  pp = rb_enc_left_char_head(p, pp, e, enc);
9388  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9389  e = pp;
9390  }
9391  }
9392  }
9393  else {
9394  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9395  case '\n':
9396  if (--e > p && *(e-1) == '\r') {
9397  --e;
9398  }
9399  break;
9400  case '\r':
9401  --e;
9402  break;
9403  }
9404  }
9405  return e - p;
9406 }
9407 
9408 static long
9409 chompped_length(VALUE str, VALUE rs)
9410 {
9411  rb_encoding *enc;
9412  int newline;
9413  char *pp, *e, *rsptr;
9414  long rslen;
9415  char *const p = RSTRING_PTR(str);
9416  long len = RSTRING_LEN(str);
9417 
9418  if (len == 0) return 0;
9419  e = p + len;
9420  if (rs == rb_default_rs) {
9421  return smart_chomp(str, e, p);
9422  }
9423 
9424  enc = rb_enc_get(str);
9425  RSTRING_GETMEM(rs, rsptr, rslen);
9426  if (rslen == 0) {
9427  if (rb_enc_mbminlen(enc) > 1) {
9428  while (e > p) {
9429  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9430  if (!rb_enc_is_newline(pp, e, enc)) break;
9431  e = pp;
9432  pp -= rb_enc_mbminlen(enc);
9433  if (pp >= p) {
9434  pp = rb_enc_left_char_head(p, pp, e, enc);
9435  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9436  e = pp;
9437  }
9438  }
9439  }
9440  }
9441  else {
9442  while (e > p && *(e-1) == '\n') {
9443  --e;
9444  if (e > p && *(e-1) == '\r')
9445  --e;
9446  }
9447  }
9448  return e - p;
9449  }
9450  if (rslen > len) return len;
9451 
9452  enc = rb_enc_get(rs);
9453  newline = rsptr[rslen-1];
9454  if (rslen == rb_enc_mbminlen(enc)) {
9455  if (rslen == 1) {
9456  if (newline == '\n')
9457  return smart_chomp(str, e, p);
9458  }
9459  else {
9460  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9461  return smart_chomp(str, e, p);
9462  }
9463  }
9464 
9465  enc = rb_enc_check(str, rs);
9466  if (is_broken_string(rs)) {
9467  return len;
9468  }
9469  pp = e - rslen;
9470  if (p[len-1] == newline &&
9471  (rslen <= 1 ||
9472  memcmp(rsptr, pp, rslen) == 0)) {
9473  if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9474  return len - rslen;
9475  RB_GC_GUARD(rs);
9476  }
9477  return len;
9478 }
9479 
9485 static VALUE
9486 chomp_rs(int argc, const VALUE *argv)
9487 {
9488  rb_check_arity(argc, 0, 1);
9489  if (argc > 0) {
9490  VALUE rs = argv[0];
9491  if (!NIL_P(rs)) StringValue(rs);
9492  return rs;
9493  }
9494  else {
9495  return rb_rs;
9496  }
9497 }
9498 
9499 VALUE
9500 rb_str_chomp_string(VALUE str, VALUE rs)
9501 {
9502  long olen = RSTRING_LEN(str);
9503  long len = chompped_length(str, rs);
9504  if (len >= olen) return Qnil;
9505  str_modify_keep_cr(str);
9506  STR_SET_LEN(str, len);
9507  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9508  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9509  ENC_CODERANGE_CLEAR(str);
9510  }
9511  return str;
9512 }
9513 
9514 /*
9515  * call-seq:
9516  * str.chomp!(separator=$/) -> str or nil
9517  *
9518  * Modifies <i>str</i> in place as described for String#chomp,
9519  * returning <i>str</i>, or <code>nil</code> if no modifications were
9520  * made.
9521  */
9522 
9523 static VALUE
9524 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9525 {
9526  VALUE rs;
9527  str_modifiable(str);
9528  if (RSTRING_LEN(str) == 0) return Qnil;
9529  rs = chomp_rs(argc, argv);
9530  if (NIL_P(rs)) return Qnil;
9531  return rb_str_chomp_string(str, rs);
9532 }
9533 
9534 
9535 /*
9536  * call-seq:
9537  * str.chomp(separator=$/) -> new_str
9538  *
9539  * Returns a new String with the given record separator removed
9540  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
9541  * changed from the default Ruby record separator, then <code>chomp</code> also
9542  * removes carriage return characters (that is, it will remove <code>\n</code>,
9543  * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
9544  * it will remove all trailing newlines from the string.
9545  *
9546  * "hello".chomp #=> "hello"
9547  * "hello\n".chomp #=> "hello"
9548  * "hello\r\n".chomp #=> "hello"
9549  * "hello\n\r".chomp #=> "hello\n"
9550  * "hello\r".chomp #=> "hello"
9551  * "hello \n there".chomp #=> "hello \n there"
9552  * "hello".chomp("llo") #=> "he"
9553  * "hello\r\n\r\n".chomp('') #=> "hello"
9554  * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
9555  */
9556 
9557 static VALUE
9558 rb_str_chomp(int argc, VALUE *argv, VALUE str)
9559 {
9560  VALUE rs = chomp_rs(argc, argv);
9561  if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9562  return rb_str_subseq(str, 0, chompped_length(str, rs));
9563 }
9564 
9565 static long
9566 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9567 {
9568  const char *const start = s;
9569 
9570  if (!s || s >= e) return 0;
9571 
9572  /* remove spaces at head */
9573  if (single_byte_optimizable(str)) {
9574  while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9575  }
9576  else {
9577  while (s < e) {
9578  int n;
9579  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9580 
9581  if (cc && !rb_isspace(cc)) break;
9582  s += n;
9583  }
9584  }
9585  return s - start;
9586 }
9587 
9588 /*
9589  * call-seq:
9590  * str.lstrip! -> self or nil
9591  *
9592  * Removes leading whitespace from the receiver.
9593  * Returns the altered receiver, or +nil+ if no change was made.
9594  * See also String#rstrip! and String#strip!.
9595  *
9596  * Refer to String#strip for the definition of whitespace.
9597  *
9598  * " hello ".lstrip! #=> "hello "
9599  * "hello ".lstrip! #=> nil
9600  * "hello".lstrip! #=> nil
9601  */
9602 
9603 static VALUE
9604 rb_str_lstrip_bang(VALUE str)
9605 {
9606  rb_encoding *enc;
9607  char *start, *s;
9608  long olen, loffset;
9609 
9610  str_modify_keep_cr(str);
9611  enc = STR_ENC_GET(str);
9612  RSTRING_GETMEM(str, start, olen);
9613  loffset = lstrip_offset(str, start, start+olen, enc);
9614  if (loffset > 0) {
9615  long len = olen-loffset;
9616  s = start + loffset;
9617  memmove(start, s, len);
9618  STR_SET_LEN(str, len);
9619  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9620  return str;
9621  }
9622  return Qnil;
9623 }
9624 
9625 
9626 /*
9627  * call-seq:
9628  * str.lstrip -> new_str
9629  *
9630  * Returns a copy of the receiver with leading whitespace removed.
9631  * See also String#rstrip and String#strip.
9632  *
9633  * Refer to String#strip for the definition of whitespace.
9634  *
9635  * " hello ".lstrip #=> "hello "
9636  * "hello".lstrip #=> "hello"
9637  */
9638 
9639 static VALUE
9640 rb_str_lstrip(VALUE str)
9641 {
9642  char *start;
9643  long len, loffset;
9644  RSTRING_GETMEM(str, start, len);
9645  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9646  if (loffset <= 0) return str_duplicate(rb_cString, str);
9647  return rb_str_subseq(str, loffset, len - loffset);
9648 }
9649 
9650 static long
9651 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9652 {
9653  const char *t;
9654 
9655  rb_str_check_dummy_enc(enc);
9656  if (!s || s >= e) return 0;
9657  t = e;
9658 
9659  /* remove trailing spaces or '\0's */
9660  if (single_byte_optimizable(str)) {
9661  unsigned char c;
9662  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9663  }
9664  else {
9665  char *tp;
9666 
9667  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9668  unsigned int c = rb_enc_codepoint(tp, e, enc);
9669  if (c && !rb_isspace(c)) break;
9670  t = tp;
9671  }
9672  }
9673  return e - t;
9674 }
9675 
9676 /*
9677  * call-seq:
9678  * str.rstrip! -> self or nil
9679  *
9680  * Removes trailing whitespace from the receiver.
9681  * Returns the altered receiver, or +nil+ if no change was made.
9682  * See also String#lstrip! and String#strip!.
9683  *
9684  * Refer to String#strip for the definition of whitespace.
9685  *
9686  * " hello ".rstrip! #=> " hello"
9687  * " hello".rstrip! #=> nil
9688  * "hello".rstrip! #=> nil
9689  */
9690 
9691 static VALUE
9692 rb_str_rstrip_bang(VALUE str)
9693 {
9694  rb_encoding *enc;
9695  char *start;
9696  long olen, roffset;
9697 
9698  str_modify_keep_cr(str);
9699  enc = STR_ENC_GET(str);
9700  RSTRING_GETMEM(str, start, olen);
9701  roffset = rstrip_offset(str, start, start+olen, enc);
9702  if (roffset > 0) {
9703  long len = olen - roffset;
9704 
9705  STR_SET_LEN(str, len);
9706  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9707  return str;
9708  }
9709  return Qnil;
9710 }
9711 
9712 
9713 /*
9714  * call-seq:
9715  * str.rstrip -> new_str
9716  *
9717  * Returns a copy of the receiver with trailing whitespace removed.
9718  * See also String#lstrip and String#strip.
9719  *
9720  * Refer to String#strip for the definition of whitespace.
9721  *
9722  * " hello ".rstrip #=> " hello"
9723  * "hello".rstrip #=> "hello"
9724  */
9725 
9726 static VALUE
9727 rb_str_rstrip(VALUE str)
9728 {
9729  rb_encoding *enc;
9730  char *start;
9731  long olen, roffset;
9732 
9733  enc = STR_ENC_GET(str);
9734  RSTRING_GETMEM(str, start, olen);
9735  roffset = rstrip_offset(str, start, start+olen, enc);
9736 
9737  if (roffset <= 0) return str_duplicate(rb_cString, str);
9738  return rb_str_subseq(str, 0, olen-roffset);
9739 }
9740 
9741 
9742 /*
9743  * call-seq:
9744  * str.strip! -> self or nil
9745  *
9746  * Removes leading and trailing whitespace from the receiver.
9747  * Returns the altered receiver, or +nil+ if there was no change.
9748  *
9749  * Refer to String#strip for the definition of whitespace.
9750  *
9751  * " hello ".strip! #=> "hello"
9752  * "hello".strip! #=> nil
9753  */
9754 
9755 static VALUE
9756 rb_str_strip_bang(VALUE str)
9757 {
9758  char *start;
9759  long olen, loffset, roffset;
9760  rb_encoding *enc;
9761 
9762  str_modify_keep_cr(str);
9763  enc = STR_ENC_GET(str);
9764  RSTRING_GETMEM(str, start, olen);
9765  loffset = lstrip_offset(str, start, start+olen, enc);
9766  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9767 
9768  if (loffset > 0 || roffset > 0) {
9769  long len = olen-roffset;
9770  if (loffset > 0) {
9771  len -= loffset;
9772  memmove(start, start + loffset, len);
9773  }
9774  STR_SET_LEN(str, len);
9775  TERM_FILL(start+len, rb_enc_mbminlen(enc));
9776  return str;
9777  }
9778  return Qnil;
9779 }
9780 
9781 
9782 /*
9783  * call-seq:
9784  * str.strip -> new_str
9785  *
9786  * Returns a copy of the receiver with leading and trailing whitespace removed.
9787  *
9788  * Whitespace is defined as any of the following characters:
9789  * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
9790  *
9791  * " hello ".strip #=> "hello"
9792  * "\tgoodbye\r\n".strip #=> "goodbye"
9793  * "\x00\t\n\v\f\r ".strip #=> ""
9794  * "hello".strip #=> "hello"
9795  */
9796 
9797 static VALUE
9798 rb_str_strip(VALUE str)
9799 {
9800  char *start;
9801  long olen, loffset, roffset;
9802  rb_encoding *enc = STR_ENC_GET(str);
9803 
9804  RSTRING_GETMEM(str, start, olen);
9805  loffset = lstrip_offset(str, start, start+olen, enc);
9806  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9807 
9808  if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9809  return rb_str_subseq(str, loffset, olen-loffset-roffset);
9810 }
9811 
9812 static VALUE
9813 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9814 {
9815  VALUE result, match;
9816  struct re_registers *regs;
9817  int i;
9818  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9819  if (pos >= 0) {
9820  if (BUILTIN_TYPE(pat) == T_STRING) {
9821  regs = NULL;
9822  end = pos + RSTRING_LEN(pat);
9823  }
9824  else {
9825  match = rb_backref_get();
9826  regs = RMATCH_REGS(match);
9827  pos = BEG(0);
9828  end = END(0);
9829  }
9830  if (pos == end) {
9831  rb_encoding *enc = STR_ENC_GET(str);
9832  /*
9833  * Always consume at least one character of the input string
9834  */
9835  if (RSTRING_LEN(str) > end)
9836  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9837  RSTRING_END(str), enc);
9838  else
9839  *start = end + 1;
9840  }
9841  else {
9842  *start = end;
9843  }
9844  if (!regs || regs->num_regs == 1) {
9845  result = rb_str_subseq(str, pos, end - pos);
9846  return result;
9847  }
9848  result = rb_ary_new2(regs->num_regs);
9849  for (i=1; i < regs->num_regs; i++) {
9850  VALUE s = Qnil;
9851  if (BEG(i) >= 0) {
9852  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9853  }
9854  rb_ary_push(result, s);
9855  }
9856 
9857  return result;
9858  }
9859  return Qnil;
9860 }
9861 
9862 
9863 /*
9864  * call-seq:
9865  * str.scan(pattern) -> array
9866  * str.scan(pattern) {|match, ...| block } -> str
9867  *
9868  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
9869  * Regexp or a String). For each match, a result is
9870  * generated and either added to the result array or passed to the block. If
9871  * the pattern contains no groups, each individual result consists of the
9872  * matched string, <code>$&</code>. If the pattern contains groups, each
9873  * individual result is itself an array containing one entry per group.
9874  *
9875  * a = "cruel world"
9876  * a.scan(/\w+/) #=> ["cruel", "world"]
9877  * a.scan(/.../) #=> ["cru", "el ", "wor"]
9878  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
9879  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
9880  *
9881  * And the block form:
9882  *
9883  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
9884  * print "\n"
9885  * a.scan(/(.)(.)/) {|x,y| print y, x }
9886  * print "\n"
9887  *
9888  * <em>produces:</em>
9889  *
9890  * <<cruel>> <<world>>
9891  * rceu lowlr
9892  */
9893 
9894 static VALUE
9895 rb_str_scan(VALUE str, VALUE pat)
9896 {
9897  VALUE result;
9898  long start = 0;
9899  long last = -1, prev = 0;
9900  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9901 
9902  pat = get_pat_quoted(pat, 1);
9903  mustnot_broken(str);
9904  if (!rb_block_given_p()) {
9905  VALUE ary = rb_ary_new();
9906 
9907  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9908  last = prev;
9909  prev = start;
9910  rb_ary_push(ary, result);
9911  }
9912  if (last >= 0) rb_pat_search(pat, str, last, 1);
9913  else rb_backref_set(Qnil);
9914  return ary;
9915  }
9916 
9917  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9918  last = prev;
9919  prev = start;
9920  rb_yield(result);
9921  str_mod_check(str, p, len);
9922  }
9923  if (last >= 0) rb_pat_search(pat, str, last, 1);
9924  return str;
9925 }
9926 
9927 
9928 /*
9929  * call-seq:
9930  * str.hex -> integer
9931  *
9932  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
9933  * (with an optional sign and an optional <code>0x</code>) and returns the
9934  * corresponding number. Zero is returned on error.
9935  *
9936  * "0x0a".hex #=> 10
9937  * "-1234".hex #=> -4660
9938  * "0".hex #=> 0
9939  * "wombat".hex #=> 0
9940  */
9941 
9942 static VALUE
9943 rb_str_hex(VALUE str)
9944 {
9945  return rb_str_to_inum(str, 16, FALSE);
9946 }
9947 
9948 
9949 /*
9950  * call-seq:
9951  * str.oct -> integer
9952  *
9953  * Treats leading characters of <i>str</i> as a string of octal digits (with an
9954  * optional sign) and returns the corresponding number. Returns 0 if the
9955  * conversion fails.
9956  *
9957  * "123".oct #=> 83
9958  * "-377".oct #=> -255
9959  * "bad".oct #=> 0
9960  * "0377bad".oct #=> 255
9961  *
9962  * If +str+ starts with <code>0</code>, radix indicators are honored.
9963  * See Kernel#Integer.
9964  */
9965 
9966 static VALUE
9967 rb_str_oct(VALUE str)
9968 {
9969  return rb_str_to_inum(str, -8, FALSE);
9970 }
9971 
9972 #ifndef HAVE_CRYPT_R
9973 # include "ruby/thread_native.h"
9974 # include "ruby/atomic.h"
9975 
9976 static struct {
9977  rb_atomic_t initialized;
9978  rb_nativethread_lock_t lock;
9979 } crypt_mutex;
9980 
9981 static void
9982 crypt_mutex_destroy(void)
9983 {
9984  RUBY_ASSERT_ALWAYS(crypt_mutex.initialized == 1);
9985  rb_nativethread_lock_destroy(&crypt_mutex.lock);
9986  crypt_mutex.initialized = 0;
9987 }
9988 
9989 static void
9990 crypt_mutex_initialize(void)
9991 {
9992  rb_atomic_t i;
9993  while ((i = RUBY_ATOMIC_CAS(crypt_mutex.initialized, 0, 2)) == 2);
9994  switch (i) {
9995  case 0:
9996  rb_nativethread_lock_initialize(&crypt_mutex.lock);
9997  atexit(crypt_mutex_destroy);
9998  RUBY_ASSERT(crypt_mutex.initialized == 2);
9999  RUBY_ATOMIC_CAS(crypt_mutex.initialized, 2, 1);
10000  break;
10001  case 1:
10002  break;
10003  default:
10004  rb_bug("crypt_mutex.initialized: %d->%d", i, crypt_mutex.initialized);
10005  }
10006 }
10007 #endif
10008 
10009 /*
10010  * call-seq:
10011  * str.crypt(salt_str) -> new_str
10012  *
10013  * Returns the string generated by calling <code>crypt(3)</code>
10014  * standard library function with <code>str</code> and
10015  * <code>salt_str</code>, in this order, as its arguments. Please do
10016  * not use this method any longer. It is legacy; provided only for
10017  * backward compatibility with ruby scripts in earlier days. It is
10018  * bad to use in contemporary programs for several reasons:
10019  *
10020  * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10021  * run. The generated string lacks data portability.
10022  *
10023  * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10024  * (i.e. silently ends up in unexpected results).
10025  *
10026  * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10027  * thread safe.
10028  *
10029  * * So-called "traditional" usage of <code>crypt(3)</code> is very
10030  * very very weak. According to its manpage, Linux's traditional
10031  * <code>crypt(3)</code> output has only 2**56 variations; too
10032  * easy to brute force today. And this is the default behaviour.
10033  *
10034  * * In order to make things robust some OSes implement so-called
10035  * "modular" usage. To go through, you have to do a complex
10036  * build-up of the <code>salt_str</code> parameter, by hand.
10037  * Failure in generation of a proper salt string tends not to
10038  * yield any errors; typos in parameters are normally not
10039  * detectable.
10040  *
10041  * * For instance, in the following example, the second invocation
10042  * of String#crypt is wrong; it has a typo in "round=" (lacks
10043  * "s"). However the call does not fail and something unexpected
10044  * is generated.
10045  *
10046  * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10047  * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10048  *
10049  * * Even in the "modular" mode, some hash functions are considered
10050  * archaic and no longer recommended at all; for instance module
10051  * <code>$1$</code> is officially abandoned by its author: see
10052  * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10053  * instance module <code>$3$</code> is considered completely
10054  * broken: see the manpage of FreeBSD.
10055  *
10056  * * On some OS such as Mac OS, there is no modular mode. Yet, as
10057  * written above, <code>crypt(3)</code> on Mac OS never fails.
10058  * This means even if you build up a proper salt string it
10059  * generates a traditional DES hash anyways, and there is no way
10060  * for you to be aware of.
10061  *
10062  * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10063  *
10064  * If for some reason you cannot migrate to other secure contemporary
10065  * password hashing algorithms, install the string-crypt gem and
10066  * <code>require 'string/crypt'</code> to continue using it.
10067  */
10068 
10069 static VALUE
10070 rb_str_crypt(VALUE str, VALUE salt)
10071 {
10072 #ifdef HAVE_CRYPT_R
10073  VALUE databuf;
10074  struct crypt_data *data;
10075 # define CRYPT_END() ALLOCV_END(databuf)
10076 #else
10077  extern char *crypt(const char *, const char *);
10078 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10079 #endif
10080  VALUE result;
10081  const char *s, *saltp;
10082  char *res;
10083 #ifdef BROKEN_CRYPT
10084  char salt_8bit_clean[3];
10085 #endif
10086 
10087  StringValue(salt);
10088  mustnot_wchar(str);
10089  mustnot_wchar(salt);
10090  s = StringValueCStr(str);
10091  saltp = RSTRING_PTR(salt);
10092  if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10093  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10094  }
10095 
10096 #ifdef BROKEN_CRYPT
10097  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10098  salt_8bit_clean[0] = saltp[0] & 0x7f;
10099  salt_8bit_clean[1] = saltp[1] & 0x7f;
10100  salt_8bit_clean[2] = '\0';
10101  saltp = salt_8bit_clean;
10102  }
10103 #endif
10104 #ifdef HAVE_CRYPT_R
10105  data = ALLOCV(databuf, sizeof(struct crypt_data));
10106 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10107  data->initialized = 0;
10108 # endif
10109  res = crypt_r(s, saltp, data);
10110 #else
10111  crypt_mutex_initialize();
10112  rb_nativethread_lock_lock(&crypt_mutex.lock);
10113  res = crypt(s, saltp);
10114 #endif
10115  if (!res) {
10116  int err = errno;
10117  CRYPT_END();
10118  rb_syserr_fail(err, "crypt");
10119  }
10120  result = rb_str_new_cstr(res);
10121  CRYPT_END();
10122  return result;
10123 }
10124 
10125 
10126 /*
10127  * call-seq:
10128  * str.ord -> integer
10129  *
10130  * Returns the Integer ordinal of a one-character string.
10131  *
10132  * "a".ord #=> 97
10133  */
10134 
10135 static VALUE
10136 rb_str_ord(VALUE s)
10137 {
10138  unsigned int c;
10139 
10140  c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10141  return UINT2NUM(c);
10142 }
10143 /*
10144  * call-seq:
10145  * str.sum(n=16) -> integer
10146  *
10147  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
10148  * where <em>n</em> is the optional Integer parameter, defaulting
 * to 16. The result is simply the sum of the binary value of each byte in
 * <i>str</i>, keeping only the low <em>n</em> bits (that is, modulo
 * <code>2**n</code>); if <em>n</em> is zero or negative the full sum is
 * returned. This is not a particularly good checksum.
10152  */
10153 
10154 static VALUE
10155 rb_str_sum(int argc, VALUE *argv, VALUE str)
10156 {
10157  int bits = 16;
10158  char *ptr, *p, *pend;
10159  long len;
10160  VALUE sum = INT2FIX(0);
10161  unsigned long sum0 = 0;
10162 
10163  if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10164  bits = 0;
10165  }
10166  ptr = p = RSTRING_PTR(str);
10167  len = RSTRING_LEN(str);
10168  pend = p + len;
10169 
10170  while (p < pend) {
10171  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10172  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10173  str_mod_check(str, ptr, len);
10174  sum0 = 0;
10175  }
10176  sum0 += (unsigned char)*p;
10177  p++;
10178  }
10179 
10180  if (bits == 0) {
10181  if (sum0) {
10182  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10183  }
10184  }
10185  else {
10186  if (sum == INT2FIX(0)) {
10187  if (bits < (int)sizeof(long)*CHAR_BIT) {
10188  sum0 &= (((unsigned long)1)<<bits)-1;
10189  }
10190  sum = LONG2FIX(sum0);
10191  }
10192  else {
10193  VALUE mod;
10194 
10195  if (sum0) {
10196  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10197  }
10198 
10199  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10200  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10201  sum = rb_funcall(sum, '&', 1, mod);
10202  }
10203  }
10204  return sum;
10205 }
10206 
10207 static VALUE
10208 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10209 {
10210  rb_encoding *enc;
10211  VALUE w;
10212  long width, len, flen = 1, fclen = 1;
10213  VALUE res;
10214  char *p;
10215  const char *f = " ";
10216  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10217  VALUE pad;
10218  int singlebyte = 1, cr;
10219  int termlen;
10220 
10221  rb_scan_args(argc, argv, "11", &w, &pad);
10222  enc = STR_ENC_GET(str);
10223  termlen = rb_enc_mbminlen(enc);
10224  width = NUM2LONG(w);
10225  if (argc == 2) {
10226  StringValue(pad);
10227  enc = rb_enc_check(str, pad);
10228  f = RSTRING_PTR(pad);
10229  flen = RSTRING_LEN(pad);
10230  fclen = str_strlen(pad, enc); /* rb_enc_check */
10231  singlebyte = single_byte_optimizable(pad);
10232  if (flen == 0 || fclen == 0) {
10233  rb_raise(rb_eArgError, "zero width padding");
10234  }
10235  }
10236  len = str_strlen(str, enc); /* rb_enc_check */
10237  if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10238  n = width - len;
10239  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10240  rlen = n - llen;
10241  cr = ENC_CODERANGE(str);
10242  if (flen > 1) {
10243  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10244  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10245  }
10246  size = RSTRING_LEN(str);
10247  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10248  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10249  (len += llen2 + rlen2) >= LONG_MAX - size) {
10250  rb_raise(rb_eArgError, "argument too big");
10251  }
10252  len += size;
10253  res = str_new0(rb_cString, 0, len, termlen);
10254  p = RSTRING_PTR(res);
10255  if (flen <= 1) {
10256  memset(p, *f, llen);
10257  p += llen;
10258  }
10259  else {
10260  while (llen >= fclen) {
10261  memcpy(p,f,flen);
10262  p += flen;
10263  llen -= fclen;
10264  }
10265  if (llen > 0) {
10266  memcpy(p, f, llen2);
10267  p += llen2;
10268  }
10269  }
10270  memcpy(p, RSTRING_PTR(str), size);
10271  p += size;
10272  if (flen <= 1) {
10273  memset(p, *f, rlen);
10274  p += rlen;
10275  }
10276  else {
10277  while (rlen >= fclen) {
10278  memcpy(p,f,flen);
10279  p += flen;
10280  rlen -= fclen;
10281  }
10282  if (rlen > 0) {
10283  memcpy(p, f, rlen2);
10284  p += rlen2;
10285  }
10286  }
10287  TERM_FILL(p, termlen);
10288  STR_SET_LEN(res, p-RSTRING_PTR(res));
10289  rb_enc_associate(res, enc);
10290  if (argc == 2)
10291  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10292  if (cr != ENC_CODERANGE_BROKEN)
10293  ENC_CODERANGE_SET(res, cr);
10294 
10295  RB_GC_GUARD(pad);
10296  return res;
10297 }
10298 
10299 
10300 /*
10301  * call-seq:
10302  * str.ljust(integer, padstr=' ') -> new_str
10303  *
10304  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10305  * String of length <i>integer</i> with <i>str</i> left justified
10306  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10307  *
10308  * "hello".ljust(4) #=> "hello"
 * "hello".ljust(20) #=> "hello               "
10310  * "hello".ljust(20, '1234') #=> "hello123412341234123"
10311  */
10312 
10313 static VALUE
10314 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10315 {
10316  return rb_str_justify(argc, argv, str, 'l');
10317 }
10318 
10319 
10320 /*
10321  * call-seq:
10322  * str.rjust(integer, padstr=' ') -> new_str
10323  *
10324  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
10325  * String of length <i>integer</i> with <i>str</i> right justified
10326  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
10327  *
10328  * "hello".rjust(4) #=> "hello"
 * "hello".rjust(20) #=> "               hello"
10330  * "hello".rjust(20, '1234') #=> "123412341234123hello"
10331  */
10332 
10333 static VALUE
10334 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10335 {
10336  return rb_str_justify(argc, argv, str, 'r');
10337 }
10338 
10339 
10340 /*
10341  * call-seq:
10342  * str.center(width, padstr=' ') -> new_str
10343  *
10344  * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
10345  * returns a new String of length +width+ with +str+ centered and padded with
10346  * +padstr+; otherwise, returns +str+.
10347  *
10348  * "hello".center(4) #=> "hello"
 * "hello".center(20) #=> "       hello        "
10350  * "hello".center(20, '123') #=> "1231231hello12312312"
10351  */
10352 
10353 static VALUE
10354 rb_str_center(int argc, VALUE *argv, VALUE str)
10355 {
10356  return rb_str_justify(argc, argv, str, 'c');
10357 }
10358 
10359 /*
10360  * call-seq:
10361  * str.partition(sep) -> [head, sep, tail]
10362  * str.partition(regexp) -> [head, match, tail]
10363  *
10364  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
10365  * and returns the part before it, the match, and the part
10366  * after it.
 * If it is not found, returns <i>str</i> followed by two empty strings.
10368  *
10369  * "hello".partition("l") #=> ["he", "l", "lo"]
10370  * "hello".partition("x") #=> ["hello", "", ""]
10371  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
10372  */
10373 
10374 static VALUE
10375 rb_str_partition(VALUE str, VALUE sep)
10376 {
10377  long pos;
10378 
10379  sep = get_pat_quoted(sep, 0);
10380  if (RB_TYPE_P(sep, T_REGEXP)) {
10381  if (rb_reg_search(sep, str, 0, 0) < 0) {
10382  goto failed;
10383  }
10384  VALUE match = rb_backref_get();
10385  struct re_registers *regs = RMATCH_REGS(match);
10386 
10387  pos = BEG(0);
10388  sep = rb_str_subseq(str, pos, END(0) - pos);
10389  }
10390  else {
10391  pos = rb_str_index(str, sep, 0);
10392  if (pos < 0) goto failed;
10393  }
10394  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10395  sep,
10396  rb_str_subseq(str, pos+RSTRING_LEN(sep),
10397  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10398 
10399  failed:
10400  return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10401 }
10402 
10403 /*
10404  * call-seq:
10405  * str.rpartition(sep) -> [head, sep, tail]
10406  * str.rpartition(regexp) -> [head, match, tail]
10407  *
10408  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
10409  * of the string, and returns the part before it, the match, and the part
10410  * after it.
10411  * If it is not found, returns two empty strings and <i>str</i>.
10412  *
10413  * "hello".rpartition("l") #=> ["hel", "l", "o"]
10414  * "hello".rpartition("x") #=> ["", "", "hello"]
10415  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
10416  *
10417  * The match from the end means starting at the possible last position, not
10418  * the last of longest matches.
10419  *
10420  * "hello".rpartition(/l+/) #=> ["hel", "l", "o"]
10421  *
10422  * To partition at the last longest match, needs to combine with
10423  * negative lookbehind.
10424  *
10425  * "hello".rpartition(/(?<!l)l+/) #=> ["he", "ll", "o"]
10426  *
 * Or String#partition with negative lookahead.
10428  *
10429  * "hello".partition(/l+(?!.*l)/) #=> ["he", "ll", "o"]
10430  */
10431 
10432 static VALUE
10433 rb_str_rpartition(VALUE str, VALUE sep)
10434 {
10435  long pos = RSTRING_LEN(str);
10436 
10437  sep = get_pat_quoted(sep, 0);
10438  if (RB_TYPE_P(sep, T_REGEXP)) {
10439  if (rb_reg_search(sep, str, pos, 1) < 0) {
10440  goto failed;
10441  }
10442  VALUE match = rb_backref_get();
10443  struct re_registers *regs = RMATCH_REGS(match);
10444 
10445  pos = BEG(0);
10446  sep = rb_str_subseq(str, pos, END(0) - pos);
10447  }
10448  else {
10449  pos = rb_str_sublen(str, pos);
10450  pos = rb_str_rindex(str, sep, pos);
10451  if (pos < 0) {
10452  goto failed;
10453  }
10454  pos = rb_str_offset(str, pos);
10455  }
10456 
10457  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10458  sep,
10459  rb_str_subseq(str, pos+RSTRING_LEN(sep),
10460  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10461  failed:
10462  return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10463 }
10464 
10465 /*
10466  * call-seq:
10467  * str.start_with?([prefixes]+) -> true or false
10468  *
10469  * Returns true if +str+ starts with one of the +prefixes+ given.
10470  * Each of the +prefixes+ should be a String or a Regexp.
10471  *
10472  * "hello".start_with?("hell") #=> true
10473  * "hello".start_with?(/H/i) #=> true
10474  *
10475  * # returns true if one of the prefixes matches.
10476  * "hello".start_with?("heaven", "hell") #=> true
10477  * "hello".start_with?("heaven", "paradise") #=> false
10478  */
10479 
10480 static VALUE
10481 rb_str_start_with(int argc, VALUE *argv, VALUE str)
10482 {
10483  int i;
10484 
10485  for (i=0; i<argc; i++) {
10486  VALUE tmp = argv[i];
10487  if (RB_TYPE_P(tmp, T_REGEXP)) {
10488  if (rb_reg_start_with_p(tmp, str))
10489  return Qtrue;
10490  }
10491  else {
10492  StringValue(tmp);
10493  rb_enc_check(str, tmp);
10494  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10495  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10496  return Qtrue;
10497  }
10498  }
10499  return Qfalse;
10500 }
10501 
10502 /*
10503  * call-seq:
10504  * str.end_with?([suffixes]+) -> true or false
10505  *
10506  * Returns true if +str+ ends with one of the +suffixes+ given.
10507  *
10508  * "hello".end_with?("ello") #=> true
10509  *
10510  * # returns true if one of the +suffixes+ matches.
10511  * "hello".end_with?("heaven", "ello") #=> true
10512  * "hello".end_with?("heaven", "paradise") #=> false
10513  */
10514 
10515 static VALUE
10516 rb_str_end_with(int argc, VALUE *argv, VALUE str)
10517 {
10518  int i;
10519  char *p, *s, *e;
10520  rb_encoding *enc;
10521 
10522  for (i=0; i<argc; i++) {
10523  VALUE tmp = argv[i];
10524  long slen, tlen;
10525  StringValue(tmp);
10526  enc = rb_enc_check(str, tmp);
10527  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10528  if ((slen = RSTRING_LEN(str)) < tlen) continue;
10529  p = RSTRING_PTR(str);
10530  e = p + slen;
10531  s = e - tlen;
10532  if (rb_enc_left_char_head(p, s, e, enc) != s)
10533  continue;
10534  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10535  return Qtrue;
10536  }
10537  return Qfalse;
10538 }
10539 
10549 static long
10550 deleted_prefix_length(VALUE str, VALUE prefix)
10551 {
10552  char *strptr, *prefixptr;
10553  long olen, prefixlen;
10554 
10555  StringValue(prefix);
10556  if (is_broken_string(prefix)) return 0;
10557  rb_enc_check(str, prefix);
10558 
10559  /* return 0 if not start with prefix */
10560  prefixlen = RSTRING_LEN(prefix);
10561  if (prefixlen <= 0) return 0;
10562  olen = RSTRING_LEN(str);
10563  if (olen < prefixlen) return 0;
10564  strptr = RSTRING_PTR(str);
10565  prefixptr = RSTRING_PTR(prefix);
10566  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10567 
10568  return prefixlen;
10569 }
10570 
10571 /*
10572  * call-seq:
10573  * str.delete_prefix!(prefix) -> self or nil
10574  *
10575  * Deletes leading <code>prefix</code> from <i>str</i>, returning
10576  * <code>nil</code> if no change was made.
10577  *
10578  * "hello".delete_prefix!("hel") #=> "lo"
10579  * "hello".delete_prefix!("llo") #=> nil
10580  */
10581 
10582 static VALUE
10583 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10584 {
10585  long prefixlen;
10586  str_modify_keep_cr(str);
10587 
10588  prefixlen = deleted_prefix_length(str, prefix);
10589  if (prefixlen <= 0) return Qnil;
10590 
10591  return rb_str_drop_bytes(str, prefixlen);
10592 }
10593 
10594 /*
10595  * call-seq:
10596  * str.delete_prefix(prefix) -> new_str
10597  *
10598  * Returns a copy of <i>str</i> with leading <code>prefix</code> deleted.
10599  *
10600  * "hello".delete_prefix("hel") #=> "lo"
10601  * "hello".delete_prefix("llo") #=> "hello"
10602  */
10603 
10604 static VALUE
10605 rb_str_delete_prefix(VALUE str, VALUE prefix)
10606 {
10607  long prefixlen;
10608 
10609  prefixlen = deleted_prefix_length(str, prefix);
10610  if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10611 
10612  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10613 }
10614 
10624 static long
10625 deleted_suffix_length(VALUE str, VALUE suffix)
10626 {
10627  char *strptr, *suffixptr, *s;
10628  long olen, suffixlen;
10629  rb_encoding *enc;
10630 
10631  StringValue(suffix);
10632  if (is_broken_string(suffix)) return 0;
10633  enc = rb_enc_check(str, suffix);
10634 
10635  /* return 0 if not start with suffix */
10636  suffixlen = RSTRING_LEN(suffix);
10637  if (suffixlen <= 0) return 0;
10638  olen = RSTRING_LEN(str);
10639  if (olen < suffixlen) return 0;
10640  strptr = RSTRING_PTR(str);
10641  suffixptr = RSTRING_PTR(suffix);
10642  s = strptr + olen - suffixlen;
10643  if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10644  if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10645 
10646  return suffixlen;
10647 }
10648 
10649 /*
10650  * call-seq:
10651  * str.delete_suffix!(suffix) -> self or nil
10652  *
10653  * Deletes trailing <code>suffix</code> from <i>str</i>, returning
10654  * <code>nil</code> if no change was made.
10655  *
10656  * "hello".delete_suffix!("llo") #=> "he"
10657  * "hello".delete_suffix!("hel") #=> nil
10658  */
10659 
10660 static VALUE
10661 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10662 {
10663  long olen, suffixlen, len;
10664  str_modifiable(str);
10665 
10666  suffixlen = deleted_suffix_length(str, suffix);
10667  if (suffixlen <= 0) return Qnil;
10668 
10669  olen = RSTRING_LEN(str);
10670  str_modify_keep_cr(str);
10671  len = olen - suffixlen;
10672  STR_SET_LEN(str, len);
10673  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10674  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10675  ENC_CODERANGE_CLEAR(str);
10676  }
10677  return str;
10678 }
10679 
10680 /*
10681  * call-seq:
10682  * str.delete_suffix(suffix) -> new_str
10683  *
10684  * Returns a copy of <i>str</i> with trailing <code>suffix</code> deleted.
10685  *
10686  * "hello".delete_suffix("llo") #=> "he"
10687  * "hello".delete_suffix("hel") #=> "hello"
10688  */
10689 
10690 static VALUE
10691 rb_str_delete_suffix(VALUE str, VALUE suffix)
10692 {
10693  long suffixlen;
10694 
10695  suffixlen = deleted_suffix_length(str, suffix);
10696  if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10697 
10698  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10699 }
10700 
10701 void
10702 rb_str_setter(VALUE val, ID id, VALUE *var)
10703 {
10704  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10705  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10706  }
10707  *var = val;
10708 }
10709 
10710 static void
10711 rb_fs_setter(VALUE val, ID id, VALUE *var)
10712 {
10713  val = rb_fs_check(val);
10714  if (!val) {
10716  "value of %"PRIsVALUE" must be String or Regexp",
10717  rb_id2str(id));
10718  }
10719  if (!NIL_P(val)) {
10720  rb_warn_deprecated("`$;'", NULL);
10721  }
10722  *var = val;
10723 }
10724 
10725 
10726 /*
10727  * call-seq:
10728  * str.force_encoding(encoding) -> str
10729  *
10730  * Changes the encoding to +encoding+ and returns self.
10731  */
10732 
10733 static VALUE
10734 rb_str_force_encoding(VALUE str, VALUE enc)
10735 {
10736  str_modifiable(str);
10737  rb_enc_associate(str, rb_to_encoding(enc));
10738  ENC_CODERANGE_CLEAR(str);
10739  return str;
10740 }
10741 
10742 /*
10743  * call-seq:
10744  * str.b -> str
10745  *
10746  * Returns a copied string whose encoding is ASCII-8BIT.
10747  */
10748 
10749 static VALUE
10750 rb_str_b(VALUE str)
10751 {
10752  VALUE str2;
10753  if (FL_TEST(str, STR_NOEMBED)) {
10754  str2 = str_alloc_heap(rb_cString);
10755  }
10756  else {
10757  str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10758  }
10759  str_replace_shared_without_enc(str2, str);
10760  ENC_CODERANGE_CLEAR(str2);
10761  return str2;
10762 }
10763 
10764 /*
10765  * call-seq:
10766  * str.valid_encoding? -> true or false
10767  *
10768  * Returns true for a string which is encoded correctly.
10769  *
10770  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
10771  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
10772  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
10773  */
10774 
10775 static VALUE
10776 rb_str_valid_encoding_p(VALUE str)
10777 {
10778  int cr = rb_enc_str_coderange(str);
10779 
10780  return RBOOL(cr != ENC_CODERANGE_BROKEN);
10781 }
10782 
10783 /*
10784  * call-seq:
10785  * str.ascii_only? -> true or false
10786  *
10787  * Returns true for a string which has only ASCII characters.
10788  *
10789  * "abc".force_encoding("UTF-8").ascii_only? #=> true
10790  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
10791  */
10792 
10793 static VALUE
10794 rb_str_is_ascii_only_p(VALUE str)
10795 {
10796  int cr = rb_enc_str_coderange(str);
10797 
10798  return RBOOL(cr == ENC_CODERANGE_7BIT);
10799 }
10800 
10801 VALUE
10802 rb_str_ellipsize(VALUE str, long len)
10803 {
10804  static const char ellipsis[] = "...";
10805  const long ellipsislen = sizeof(ellipsis) - 1;
10806  rb_encoding *const enc = rb_enc_get(str);
10807  const long blen = RSTRING_LEN(str);
10808  const char *const p = RSTRING_PTR(str), *e = p + blen;
10809  VALUE estr, ret = 0;
10810 
10811  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10812  if (len * rb_enc_mbminlen(enc) >= blen ||
10813  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10814  ret = str;
10815  }
10816  else if (len <= ellipsislen ||
10817  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10818  if (rb_enc_asciicompat(enc)) {
10819  ret = rb_str_new(ellipsis, len);
10820  rb_enc_associate(ret, enc);
10821  }
10822  else {
10823  estr = rb_usascii_str_new(ellipsis, len);
10824  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10825  }
10826  }
10827  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10828  rb_str_cat(ret, ellipsis, ellipsislen);
10829  }
10830  else {
10831  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10832  rb_enc_from_encoding(enc), 0, Qnil);
10833  rb_str_append(ret, estr);
10834  }
10835  return ret;
10836 }
10837 
10838 static VALUE
10839 str_compat_and_valid(VALUE str, rb_encoding *enc)
10840 {
10841  int cr;
10842  str = StringValue(str);
10843  cr = rb_enc_str_coderange(str);
10844  if (cr == ENC_CODERANGE_BROKEN) {
10845  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10846  }
10847  else {
10848  rb_encoding *e = STR_ENC_GET(str);
10849  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10850  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10851  rb_enc_name(enc), rb_enc_name(e));
10852  }
10853  }
10854  return str;
10855 }
10856 
10857 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10858 
10859 VALUE
10861 {
10862  rb_encoding *enc = STR_ENC_GET(str);
10863  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10864 }
10865 
10866 VALUE
10867 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10868 {
10869  int cr = ENC_CODERANGE_UNKNOWN;
10870  if (enc == STR_ENC_GET(str)) {
10871  /* cached coderange makes sense only when enc equals the
10872  * actual encoding of str */
10873  cr = ENC_CODERANGE(str);
10874  }
10875  return enc_str_scrub(enc, str, repl, cr);
10876 }
10877 
10878 static VALUE
10879 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10880 {
10881  int encidx;
10882  VALUE buf = Qnil;
10883  const char *rep, *p, *e, *p1, *sp;
10884  long replen = -1;
10885  long slen;
10886 
10887  if (rb_block_given_p()) {
10888  if (!NIL_P(repl))
10889  rb_raise(rb_eArgError, "both of block and replacement given");
10890  replen = 0;
10891  }
10892 
10893  if (ENC_CODERANGE_CLEAN_P(cr))
10894  return Qnil;
10895 
10896  if (!NIL_P(repl)) {
10897  repl = str_compat_and_valid(repl, enc);
10898  }
10899 
10900  if (rb_enc_dummy_p(enc)) {
10901  return Qnil;
10902  }
10903  encidx = rb_enc_to_index(enc);
10904 
10905 #define DEFAULT_REPLACE_CHAR(str) do { \
10906  static const char replace[sizeof(str)-1] = str; \
10907  rep = replace; replen = (int)sizeof(replace); \
10908  } while (0)
10909 
10910  slen = RSTRING_LEN(str);
10911  p = RSTRING_PTR(str);
10912  e = RSTRING_END(str);
10913  p1 = p;
10914  sp = p;
10915 
10916  if (rb_enc_asciicompat(enc)) {
10917  int rep7bit_p;
10918  if (!replen) {
10919  rep = NULL;
10920  rep7bit_p = FALSE;
10921  }
10922  else if (!NIL_P(repl)) {
10923  rep = RSTRING_PTR(repl);
10924  replen = RSTRING_LEN(repl);
10925  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10926  }
10927  else if (encidx == rb_utf8_encindex()) {
10928  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10929  rep7bit_p = FALSE;
10930  }
10931  else {
10932  DEFAULT_REPLACE_CHAR("?");
10933  rep7bit_p = TRUE;
10934  }
10935  cr = ENC_CODERANGE_7BIT;
10936 
10937  p = search_nonascii(p, e);
10938  if (!p) {
10939  p = e;
10940  }
10941  while (p < e) {
10942  int ret = rb_enc_precise_mbclen(p, e, enc);
10943  if (MBCLEN_NEEDMORE_P(ret)) {
10944  break;
10945  }
10946  else if (MBCLEN_CHARFOUND_P(ret)) {
10947  cr = ENC_CODERANGE_VALID;
10948  p += MBCLEN_CHARFOUND_LEN(ret);
10949  }
10950  else if (MBCLEN_INVALID_P(ret)) {
10951  /*
10952  * p1~p: valid ascii/multibyte chars
10953  * p ~e: invalid bytes + unknown bytes
10954  */
10955  long clen = rb_enc_mbmaxlen(enc);
10956  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10957  if (p > p1) {
10958  rb_str_buf_cat(buf, p1, p - p1);
10959  }
10960 
10961  if (e - p < clen) clen = e - p;
10962  if (clen <= 2) {
10963  clen = 1;
10964  }
10965  else {
10966  const char *q = p;
10967  clen--;
10968  for (; clen > 1; clen--) {
10969  ret = rb_enc_precise_mbclen(q, q + clen, enc);
10970  if (MBCLEN_NEEDMORE_P(ret)) break;
10971  if (MBCLEN_INVALID_P(ret)) continue;
10972  UNREACHABLE;
10973  }
10974  }
10975  if (rep) {
10976  rb_str_buf_cat(buf, rep, replen);
10977  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
10978  }
10979  else {
10980  repl = rb_yield(rb_enc_str_new(p, clen, enc));
10981  str_mod_check(str, sp, slen);
10982  repl = str_compat_and_valid(repl, enc);
10983  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
10984  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
10985  cr = ENC_CODERANGE_VALID;
10986  }
10987  p += clen;
10988  p1 = p;
10989  p = search_nonascii(p, e);
10990  if (!p) {
10991  p = e;
10992  break;
10993  }
10994  }
10995  else {
10996  UNREACHABLE;
10997  }
10998  }
10999  if (NIL_P(buf)) {
11000  if (p == e) {
11001  ENC_CODERANGE_SET(str, cr);
11002  return Qnil;
11003  }
11004  buf = rb_str_buf_new(RSTRING_LEN(str));
11005  }
11006  if (p1 < p) {
11007  rb_str_buf_cat(buf, p1, p - p1);
11008  }
11009  if (p < e) {
11010  if (rep) {
11011  rb_str_buf_cat(buf, rep, replen);
11012  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11013  }
11014  else {
11015  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11016  str_mod_check(str, sp, slen);
11017  repl = str_compat_and_valid(repl, enc);
11018  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11019  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11020  cr = ENC_CODERANGE_VALID;
11021  }
11022  }
11023  }
11024  else {
11025  /* ASCII incompatible */
11026  long mbminlen = rb_enc_mbminlen(enc);
11027  if (!replen) {
11028  rep = NULL;
11029  }
11030  else if (!NIL_P(repl)) {
11031  rep = RSTRING_PTR(repl);
11032  replen = RSTRING_LEN(repl);
11033  }
11034  else if (encidx == ENCINDEX_UTF_16BE) {
11035  DEFAULT_REPLACE_CHAR("\xFF\xFD");
11036  }
11037  else if (encidx == ENCINDEX_UTF_16LE) {
11038  DEFAULT_REPLACE_CHAR("\xFD\xFF");
11039  }
11040  else if (encidx == ENCINDEX_UTF_32BE) {
11041  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11042  }
11043  else if (encidx == ENCINDEX_UTF_32LE) {
11044  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11045  }
11046  else {
11047  DEFAULT_REPLACE_CHAR("?");
11048  }
11049 
11050  while (p < e) {
11051  int ret = rb_enc_precise_mbclen(p, e, enc);
11052  if (MBCLEN_NEEDMORE_P(ret)) {
11053  break;
11054  }
11055  else if (MBCLEN_CHARFOUND_P(ret)) {
11056  p += MBCLEN_CHARFOUND_LEN(ret);
11057  }
11058  else if (MBCLEN_INVALID_P(ret)) {
11059  const char *q = p;
11060  long clen = rb_enc_mbmaxlen(enc);
11061  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11062  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11063 
11064  if (e - p < clen) clen = e - p;
11065  if (clen <= mbminlen * 2) {
11066  clen = mbminlen;
11067  }
11068  else {
11069  clen -= mbminlen;
11070  for (; clen > mbminlen; clen-=mbminlen) {
11071  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11072  if (MBCLEN_NEEDMORE_P(ret)) break;
11073  if (MBCLEN_INVALID_P(ret)) continue;
11074  UNREACHABLE;
11075  }
11076  }
11077  if (rep) {
11078  rb_str_buf_cat(buf, rep, replen);
11079  }
11080  else {
11081  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11082  str_mod_check(str, sp, slen);
11083  repl = str_compat_and_valid(repl, enc);
11084  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11085  }
11086  p += clen;
11087  p1 = p;
11088  }
11089  else {
11090  UNREACHABLE;
11091  }
11092  }
11093  if (NIL_P(buf)) {
11094  if (p == e) {
11096  return Qnil;
11097  }
11098  buf = rb_str_buf_new(RSTRING_LEN(str));
11099  }
11100  if (p1 < p) {
11101  rb_str_buf_cat(buf, p1, p - p1);
11102  }
11103  if (p < e) {
11104  if (rep) {
11105  rb_str_buf_cat(buf, rep, replen);
11106  }
11107  else {
11108  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11109  str_mod_check(str, sp, slen);
11110  repl = str_compat_and_valid(repl, enc);
11111  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11112  }
11113  }
11114  cr = ENC_CODERANGE_VALID;
11115  }
11116  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11117  return buf;
11118 }
11119 
11120 /*
11121  * call-seq:
11122  * str.scrub -> new_str
11123  * str.scrub(repl) -> new_str
11124  * str.scrub{|bytes|} -> new_str
11125  *
11126  * If the string contains invalid byte sequences, returns a copy in which they are
11127  * replaced with the given replacement character; otherwise returns a copy of the string.
11128  * If block is given, replace invalid bytes with returned value of the block.
11129  *
11130  * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
11131  * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
11132  * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11133  */
11134 static VALUE
11135 str_scrub(int argc, VALUE *argv, VALUE str)
11136 {
11137  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11138  VALUE new = rb_str_scrub(str, repl);
11139  return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11140 }
11141 
11142 /*
11143  * call-seq:
11144  * str.scrub! -> str
11145  * str.scrub!(repl) -> str
11146  * str.scrub!{|bytes|} -> str
11147  *
11148  * If the string contains invalid byte sequences, replaces them in place with the given
11149  * replacement character. Returns self in either case.
11150  * If block is given, replace invalid bytes with returned value of the block.
11151  *
11152  * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
11153  * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
11154  * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack1('H*')+'>' } #=> "abc\u3042<e380>"
11155  */
11156 static VALUE
11157 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11158 {
11159  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11160  VALUE new = rb_str_scrub(str, repl);
11161  if (!NIL_P(new)) rb_str_replace(str, new);
11162  return str;
11163 }
11164 
11165 static ID id_normalize;
11166 static ID id_normalized_p;
11167 static VALUE mUnicodeNormalize;
11168 
11169 static VALUE
11170 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11171 {
11172  static int UnicodeNormalizeRequired = 0;
11173  VALUE argv2[2];
11174 
11175  if (!UnicodeNormalizeRequired) {
11176  rb_require("unicode_normalize/normalize.rb");
11177  UnicodeNormalizeRequired = 1;
11178  }
11179  argv2[0] = str;
11180  if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11181  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11182 }
11183 
11184 /*
11185  * call-seq:
11186  * str.unicode_normalize(form=:nfc)
11187  *
11188  * Unicode Normalization---Returns a normalized form of +str+,
11189  * using Unicode normalizations NFC, NFD, NFKC, or NFKD.
11190  * The normalization form used is determined by +form+, which can
11191  * be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11192  * The default is +:nfc+.
11193  *
11194  * If the string is not in a Unicode Encoding, then an Exception is raised.
11195  * In this context, 'Unicode Encoding' means any of UTF-8, UTF-16BE/LE,
11196  * and UTF-32BE/LE, as well as GB18030, UCS_2BE, and UCS_4BE.
11197  * Anything other than UTF-8 is implemented by converting to UTF-8,
11198  * which makes it slower than UTF-8.
11199  *
11200  * "a\u0300".unicode_normalize #=> "\u00E0"
11201  * "a\u0300".unicode_normalize(:nfc) #=> "\u00E0"
11202  * "\u00E0".unicode_normalize(:nfd) #=> "a\u0300"
11203  * "\xE0".force_encoding('ISO-8859-1').unicode_normalize(:nfd)
11204  * #=> Encoding::CompatibilityError raised
11205  */
11206 static VALUE
11207 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11208 {
11209  return unicode_normalize_common(argc, argv, str, id_normalize);
11210 }
11211 
11212 /*
11213  * call-seq:
11214  * str.unicode_normalize!(form=:nfc)
11215  *
11216  * Destructive version of String#unicode_normalize, doing Unicode
11217  * normalization in place.
11218  */
11219 static VALUE
11220 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11221 {
11222  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11223 }
11224 
11225 /* call-seq:
11226  * str.unicode_normalized?(form=:nfc)
11227  *
11228  * Checks whether +str+ is in Unicode normalization form +form+,
11229  * which can be any of the four values +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11230  * The default is +:nfc+.
11231  *
11232  * If the string is not in a Unicode Encoding, then an Exception is raised.
11233  * For details, see String#unicode_normalize.
11234  *
11235  * "a\u0300".unicode_normalized? #=> false
11236  * "a\u0300".unicode_normalized?(:nfd) #=> true
11237  * "\u00E0".unicode_normalized? #=> true
11238  * "\u00E0".unicode_normalized?(:nfd) #=> false
11239  * "\xE0".force_encoding('ISO-8859-1').unicode_normalized?
11240  * #=> Encoding::CompatibilityError raised
11241  */
11242 static VALUE
11243 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11244 {
11245  return unicode_normalize_common(argc, argv, str, id_normalized_p);
11246 }
11247 
11248 /**********************************************************************
11249  * Document-class: Symbol
11250  *
11251  * Symbol objects represent named identifiers inside the Ruby interpreter.
11252  *
11253  * You can create a \Symbol object explicitly with:
11254  *
11255  * - A {symbol literal}[doc/syntax/literals_rdoc.html#label-Symbol+Literals].
11256  *
11257  * The same Symbol object will be
11258  * created for a given name or string for the duration of a program's
11259  * execution, regardless of the context or meaning of that name. Thus
11260  * if <code>Fred</code> is a constant in one context, a method in
11261  * another, and a class in a third, the Symbol <code>:Fred</code>
11262  * will be the same object in all three contexts.
11263  *
11264  * module One
11265  * class Fred
11266  * end
11267  * $f1 = :Fred
11268  * end
11269  * module Two
11270  * Fred = 1
11271  * $f2 = :Fred
11272  * end
11273  * def Fred()
11274  * end
11275  * $f3 = :Fred
11276  * $f1.object_id #=> 2514190
11277  * $f2.object_id #=> 2514190
11278  * $f3.object_id #=> 2514190
11279  *
11280  * Constant, method, and variable names are returned as symbols:
11281  *
11282  * module One
11283  * Two = 2
11284  * def three; 3 end
11285  * @four = 4
11286  * @@five = 5
11287  * $six = 6
11288  * end
11289  * seven = 7
11290  *
11291  * One.constants
11292  * # => [:Two]
11293  * One.instance_methods(true)
11294  * # => [:three]
11295  * One.instance_variables
11296  * # => [:@four]
11297  * One.class_variables
11298  * # => [:@@five]
11299  * global_variables.grep(/six/)
11300  * # => [:$six]
11301  * local_variables
11302  * # => [:seven]
11303  *
11304  * Symbol objects are different from String objects in that
11305  * Symbol objects represent identifiers, while String objects
11306  * represent text or data.
11307  *
11308  * == What's Here
11309  *
11310  * First, what's elsewhere. \Class \Symbol:
11311  *
11312  * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
11313  * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
11314  *
11315  * Here, class \Symbol provides methods that are useful for:
11316  *
11317  * - {Querying}[#class-Symbol-label-Methods+for+Querying]
11318  * - {Comparing}[#class-Symbol-label-Methods+for+Comparing]
11319  * - {Converting}[#class-Symbol-label-Methods+for+Converting]
11320  *
11321  * === Methods for Querying
11322  *
11323  * - ::all_symbols:: Returns an array of the symbols currently in Ruby's symbol table.
11324  * - {#=~}[#method-i-3D~]:: Returns the index of the first substring
11325  * in symbol that matches a given Regexp
11326  * or other object; returns +nil+ if no match is found.
11327  * - #[], #slice :: Returns a substring of symbol
11328  * determined by a given index, start/length, or range, or string.
11329  * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11330  * - #encoding:: Returns the Encoding object that represents the encoding
11331  * of symbol.
11332  * - #end_with?:: Returns +true+ if symbol ends with
11333  * any of the given strings.
11334  * - #match:: Returns a MatchData object if symbol
11335  * matches a given Regexp; +nil+ otherwise.
11336  * - #match?:: Returns +true+ if symbol
11337  * matches a given Regexp; +false+ otherwise.
11338  * - #length, #size:: Returns the number of characters in symbol.
11339  * - #start_with?:: Returns +true+ if symbol starts with
11340  * any of the given strings.
11341  *
11342  * === Methods for Comparing
11343  *
11344  * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as a given symbol is smaller than, equal to, or larger than symbol.
11345  * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given symbol
11346  * has the same content and encoding.
11347  * - #casecmp:: Ignoring case, returns -1, 0, or 1 as a given
11348  * symbol is smaller than, equal to, or larger than symbol.
11349  * - #casecmp?:: Returns +true+ if symbol is equal to a given symbol
11350  * after Unicode case folding; +false+ otherwise.
11351  *
11352  * === Methods for Converting
11353  *
11354  * - #capitalize:: Returns symbol with the first character upcased
11355  * and all other characters downcased.
11356  * - #downcase:: Returns symbol with all characters downcased.
11357  * - #inspect:: Returns the string representation of +self+ as a symbol literal.
11358  * - #name:: Returns the frozen string corresponding to symbol.
11359  * - #succ, #next:: Returns the symbol that is the successor to symbol.
11360  * - #swapcase:: Returns symbol with all upcase characters downcased
11361  * and all downcase characters upcased.
11362  * - #to_proc:: Returns a Proc object which responds to the method named by symbol.
11363  * - #to_s, #id2name:: Returns the string corresponding to +self+.
11364  * - #to_sym, #intern:: Returns +self+.
11365  * - #upcase:: Returns symbol with all characters upcased.
11366  *
11367  */
11368 
11369 
11370 /*
11371  * call-seq:
11372  * sym == obj -> true or false
11373  *
11374  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
11375  * symbol, returns <code>true</code>.
11376  */
11377 
11378 #define sym_equal rb_obj_equal
11379 
11380 static int
11381 sym_printable(const char *s, const char *send, rb_encoding *enc)
11382 {
11383  while (s < send) {
11384  int n;
11385  int c = rb_enc_precise_mbclen(s, send, enc);
11386 
11387  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11388  n = MBCLEN_CHARFOUND_LEN(c);
11389  c = rb_enc_mbc_to_codepoint(s, send, enc);
11390  if (!rb_enc_isprint(c, enc)) return FALSE;
11391  s += n;
11392  }
11393  return TRUE;
11394 }
11395 
11396 int
11397 rb_str_symname_p(VALUE sym)
11398 {
11399  rb_encoding *enc;
11400  const char *ptr;
11401  long len;
11403 
11404  if (resenc == NULL) resenc = rb_default_external_encoding();
11405  enc = STR_ENC_GET(sym);
11406  ptr = RSTRING_PTR(sym);
11407  len = RSTRING_LEN(sym);
11408  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11409  !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11410  return FALSE;
11411  }
11412  return TRUE;
11413 }
11414 
11415 VALUE
11416 rb_str_quote_unprintable(VALUE str)
11417 {
11418  rb_encoding *enc;
11419  const char *ptr;
11420  long len;
11421  rb_encoding *resenc;
11422 
11423  Check_Type(str, T_STRING);
11424  resenc = rb_default_internal_encoding();
11425  if (resenc == NULL) resenc = rb_default_external_encoding();
11426  enc = STR_ENC_GET(str);
11427  ptr = RSTRING_PTR(str);
11428  len = RSTRING_LEN(str);
11429  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11430  !sym_printable(ptr, ptr + len, enc)) {
11431  return rb_str_escape(str);
11432  }
11433  return str;
11434 }
11435 
11436 MJIT_FUNC_EXPORTED VALUE
11437 rb_id_quote_unprintable(ID id)
11438 {
11439  VALUE str = rb_id2str(id);
11440  if (!rb_str_symname_p(str)) {
11441  return rb_str_escape(str);
11442  }
11443  return str;
11444 }
11445 
11446 /*
11447  * call-seq:
11448  * sym.inspect -> string
11449  *
11450  * Returns the representation of <i>sym</i> as a symbol literal.
11451  *
11452  * :fred.inspect #=> ":fred"
11453  */
11454 
11455 static VALUE
11456 sym_inspect(VALUE sym)
11457 {
11458  VALUE str = rb_sym2str(sym);
11459  const char *ptr;
11460  long len;
11461  char *dest;
11462 
11463  if (!rb_str_symname_p(str)) {
11464  str = rb_str_inspect(str);
11465  len = RSTRING_LEN(str);
11466  rb_str_resize(str, len + 1);
11467  dest = RSTRING_PTR(str);
11468  memmove(dest + 1, dest, len);
11469  }
11470  else {
11471  rb_encoding *enc = STR_ENC_GET(str);
11472  RSTRING_GETMEM(str, ptr, len);
11473  str = rb_enc_str_new(0, len + 1, enc);
11474  dest = RSTRING_PTR(str);
11475  memcpy(dest + 1, ptr, len);
11476  }
11477  dest[0] = ':';
11478  return str;
11479 }
11480 
11481 #if 0 /* for RDoc */
11482 /*
11483  * call-seq:
11484  * sym.name -> string
11485  *
11486  * Returns the name or string corresponding to <i>sym</i>. Unlike #to_s, the
11487  * returned string is frozen.
11488  *
11489  * :fred.name #=> "fred"
11490  * :fred.name.frozen? #=> true
11491  * :fred.to_s #=> "fred"
11492  * :fred.to_s.frozen? #=> false
11493  */
11494 VALUE
11495 rb_sym2str(VALUE sym)
11496 {
11497 
11498 }
11499 #endif
11500 
11501 
11502 /*
11503  * call-seq:
11504  * sym.id2name -> string
11505  * sym.to_s -> string
11506  *
11507  * Returns the name or string corresponding to <i>sym</i>.
11508  *
11509  * :fred.id2name #=> "fred"
11510  * :ginger.to_s #=> "ginger"
11511  *
11512  * Note that this string is not frozen (unlike the symbol itself).
11513  * To get a frozen string, use #name.
11514  */
11515 
11516 
11517 VALUE
11519 {
11520  return str_new_shared(rb_cString, rb_sym2str(sym));
11521 }
11522 
11523 
11524 /*
11525  * call-seq:
11526  * sym.to_sym -> sym
11527  * sym.intern -> sym
11528  *
11529  * In general, <code>to_sym</code> returns the Symbol corresponding
11530  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
11531  * in this case.
11532  */
11533 
11534 static VALUE
11535 sym_to_sym(VALUE sym)
11536 {
11537  return sym;
11538 }
11539 
11540 MJIT_FUNC_EXPORTED VALUE
11541 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11542 {
11543  VALUE obj;
11544 
11545  if (argc < 1) {
11546  rb_raise(rb_eArgError, "no receiver given");
11547  }
11548  obj = argv[0];
11549  return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11550 }
11551 
11552 #if 0
11553 /*
11554  * call-seq:
11555  * sym.to_proc
11556  *
11557  * Returns a _Proc_ object which responds to the given method by _sym_.
11558  *
11559  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
11560  */
11561 
11562 VALUE
11563 rb_sym_to_proc(VALUE sym)
11564 {
11565 }
11566 #endif
11567 
11568 /*
11569  * call-seq:
11570  *
11571  * sym.succ
11572  *
11573  * Same as <code>sym.to_s.succ.intern</code>.
11574  */
11575 
11576 static VALUE
11577 sym_succ(VALUE sym)
11578 {
11579  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11580 }
11581 
11582 /*
11583  * call-seq:
11584  *
11585  * symbol <=> other_symbol -> -1, 0, +1, or nil
11586  *
11587  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
11588  * symbols. Returns -1, 0, +1, or +nil+ depending on whether +symbol+ is
11589  * less than, equal to, or greater than +other_symbol+.
11590  *
11591  * +nil+ is returned if the two values are incomparable.
11592  *
11593  * See String#<=> for more information.
11594  */
11595 
11596 static VALUE
11597 sym_cmp(VALUE sym, VALUE other)
11598 {
11599  if (!SYMBOL_P(other)) {
11600  return Qnil;
11601  }
11602  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11603 }
11604 
11605 /*
11606  * call-seq:
11607  * casecmp(other_symbol) -> -1, 0, 1, or nil
11608  *
11609  * Case-insensitive version of {Symbol#<=>}[#method-i-3C-3D-3E]:
11610  *
11611  * :aBcDeF.casecmp(:abcde) # => 1
11612  * :aBcDeF.casecmp(:abcdef) # => 0
11613  * :aBcDeF.casecmp(:abcdefg) # => -1
11614  * :abcdef.casecmp(:ABCDEF) # => 0
11615  *
11616  * Returns +nil+ if the two symbols have incompatible encodings,
11617  * or if +other_symbol+ is not a symbol:
11618  *
11619  * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11620  * other_sym = :"\u{c4 d6 dc}"
11621  * sym.casecmp(other_sym) # => nil
11622  * :foo.casecmp(2) # => nil
11623  *
11624  * Currently, case-insensitivity only works on characters A-Z/a-z,
11625  * not all of Unicode. This is different from Symbol#casecmp?.
11626  *
11627  * Related: Symbol#casecmp?.
11628  *
11629  */
11630 
11631 static VALUE
11632 sym_casecmp(VALUE sym, VALUE other)
11633 {
11634  if (!SYMBOL_P(other)) {
11635  return Qnil;
11636  }
11637  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11638 }
11639 
11640 /*
11641  * call-seq:
11642  * casecmp?(other_symbol) -> true, false, or nil
11643  *
11644  * Returns +true+ if +sym+ and +other_symbol+ are equal after
11645  * Unicode case folding, +false+ if they are not equal:
11646  *
11647  * :aBcDeF.casecmp?(:abcde) # => false
11648  * :aBcDeF.casecmp?(:abcdef) # => true
11649  * :aBcDeF.casecmp?(:abcdefg) # => false
11650  * :abcdef.casecmp?(:ABCDEF) # => true
11651  * :"\u{e4 f6 fc}".casecmp?(:"\u{c4 d6 dc}") #=> true
11652  *
11653  * Returns +nil+ if the two symbols have incompatible encodings,
11654  * or if +other_symbol+ is not a symbol:
11655  *
11656  * sym = "\u{e4 f6 fc}".encode("ISO-8859-1").to_sym
11657  * other_sym = :"\u{c4 d6 dc}"
11658  * sym.casecmp?(other_sym) # => nil
11659  * :foo.casecmp?(2) # => nil
11660  *
11661  * See {Case Mapping}[doc/case_mapping_rdoc.html].
11662  *
11663  * Related: Symbol#casecmp.
11664  *
11665  */
11666 
11667 static VALUE
11668 sym_casecmp_p(VALUE sym, VALUE other)
11669 {
11670  if (!SYMBOL_P(other)) {
11671  return Qnil;
11672  }
11673  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11674 }
11675 
11676 /*
11677  * call-seq:
11678  * sym =~ obj -> integer or nil
11679  *
11680  * Returns <code>sym.to_s =~ obj</code>.
11681  */
11682 
11683 static VALUE
11684 sym_match(VALUE sym, VALUE other)
11685 {
11686  return rb_str_match(rb_sym2str(sym), other);
11687 }
11688 
11689 /*
11690  * call-seq:
11691  * sym.match(pattern) -> matchdata or nil
11692  * sym.match(pattern, pos) -> matchdata or nil
11693  *
11694  * Returns <code>sym.to_s.match</code>.
11695  */
11696 
11697 static VALUE
11698 sym_match_m(int argc, VALUE *argv, VALUE sym)
11699 {
11700  return rb_str_match_m(argc, argv, rb_sym2str(sym));
11701 }
11702 
11703 /*
11704  * call-seq:
11705  * sym.match?(pattern) -> true or false
11706  * sym.match?(pattern, pos) -> true or false
11707  *
11708  * Returns <code>sym.to_s.match?</code>.
11709  */
11710 
11711 static VALUE
11712 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11713 {
11714  return rb_str_match_m_p(argc, argv, sym);
11715 }
11716 
11717 /*
11718  * call-seq:
11719  * sym[idx] -> char
11720  * sym[b, n] -> string
11721  * sym.slice(idx) -> char
11722  * sym.slice(b, n) -> string
11723  *
11724  * Returns <code>sym.to_s[]</code>.
11725  */
11726 
11727 static VALUE
11728 sym_aref(int argc, VALUE *argv, VALUE sym)
11729 {
11730  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11731 }
11732 
11733 /*
11734  * call-seq:
11735  * sym.length -> integer
11736  * sym.size -> integer
11737  *
11738  * Same as <code>sym.to_s.length</code>.
11739  */
11740 
11741 static VALUE
11742 sym_length(VALUE sym)
11743 {
11744  return rb_str_length(rb_sym2str(sym));
11745 }
11746 
11747 /*
11748  * call-seq:
11749  * sym.empty? -> true or false
11750  *
11751  * Returns whether _sym_ is :"" or not.
11752  */
11753 
11754 static VALUE
11755 sym_empty(VALUE sym)
11756 {
11757  return rb_str_empty(rb_sym2str(sym));
11758 }
11759 
11760 /*
11761  * call-seq:
11762  * upcase(*options) -> symbol
11763  *
11764  * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11765  *
11766  * See String#upcase.
11767  *
11768  */
11769 
11770 static VALUE
11771 sym_upcase(int argc, VALUE *argv, VALUE sym)
11772 {
11773  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11774 }
11775 
11776 /*
11777  * call-seq:
11778  * downcase(*options) -> symbol
11779  *
11780  * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11781  *
11782  * See String#downcase.
11783  *
11784  * Related: Symbol#upcase.
11785  *
11786  */
11787 
11788 static VALUE
11789 sym_downcase(int argc, VALUE *argv, VALUE sym)
11790 {
11791  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11792 }
11793 
11794 /*
11795  * call-seq:
11796  * capitalize(*options) -> symbol
11797  *
11798  * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11799  *
11800  * See String#capitalize.
11801  *
11802  */
11803 
11804 static VALUE
11805 sym_capitalize(int argc, VALUE *argv, VALUE sym)
11806 {
11807  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11808 }
11809 
11810 /*
11811  * call-seq:
11812  * swapcase(*options) -> symbol
11813  *
11814  * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11815  *
11816  * See String#swapcase.
11817  *
11818  */
11819 
11820 static VALUE
11821 sym_swapcase(int argc, VALUE *argv, VALUE sym)
11822 {
11823  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11824 }
11825 
11826 /*
11827  * call-seq:
11828  * sym.start_with?([prefixes]+) -> true or false
11829  *
11830  * Returns true if +sym+ starts with one of the +prefixes+ given.
11831  * Each of the +prefixes+ should be a String or a Regexp.
11832  *
11833  * :hello.start_with?("hell") #=> true
11834  * :hello.start_with?(/H/i) #=> true
11835  *
11836  * # returns true if one of the prefixes matches.
11837  * :hello.start_with?("heaven", "hell") #=> true
11838  * :hello.start_with?("heaven", "paradise") #=> false
11839  */
11840 
11841 static VALUE
11842 sym_start_with(int argc, VALUE *argv, VALUE sym)
11843 {
11844  return rb_str_start_with(argc, argv, rb_sym2str(sym));
11845 }
11846 
11847 /*
11848  * call-seq:
11849  * sym.end_with?([suffixes]+) -> true or false
11850  *
11851  * Returns true if +sym+ ends with one of the +suffixes+ given.
11852  *
11853  * :hello.end_with?("ello") #=> true
11854  *
11855  * # returns true if one of the +suffixes+ matches.
11856  * :hello.end_with?("heaven", "ello") #=> true
11857  * :hello.end_with?("heaven", "paradise") #=> false
11858  */
11859 
11860 static VALUE
11861 sym_end_with(int argc, VALUE *argv, VALUE sym)
11862 {
11863  return rb_str_end_with(argc, argv, rb_sym2str(sym));
11864 }
11865 
11866 /*
11867  * call-seq:
11868  * sym.encoding -> encoding
11869  *
11870  * Returns the Encoding object that represents the encoding of _sym_.
11871  */
11872 
11873 static VALUE
11874 sym_encoding(VALUE sym)
11875 {
11876  return rb_obj_encoding(rb_sym2str(sym));
11877 }
11878 
11879 static VALUE
11880 string_for_symbol(VALUE name)
11881 {
11882  if (!RB_TYPE_P(name, T_STRING)) {
11883  VALUE tmp = rb_check_string_type(name);
11884  if (NIL_P(tmp)) {
11885  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11886  name);
11887  }
11888  name = tmp;
11889  }
11890  return name;
11891 }
11892 
11893 ID
11895 {
11896  if (SYMBOL_P(name)) {
11897  return SYM2ID(name);
11898  }
11899  name = string_for_symbol(name);
11900  return rb_intern_str(name);
11901 }
11902 
11903 VALUE
11905 {
11906  if (SYMBOL_P(name)) {
11907  return name;
11908  }
11909  name = string_for_symbol(name);
11910  return rb_str_intern(name);
11911 }
11912 
11913 /*
11914  * call-seq:
11915  * Symbol.all_symbols => array
11916  *
11917  * Returns an array of all the symbols currently in Ruby's symbol
11918  * table.
11919  *
11920  * Symbol.all_symbols.size #=> 903
11921  * Symbol.all_symbols[1,20] #=> [:floor, :ARGV, :Binding, :symlink,
11922  * :chown, :EOFError, :$;, :String,
11923  * :LOCK_SH, :"setuid?", :$<,
11924  * :default_proc, :compact, :extend,
11925  * :Tms, :getwd, :$=, :ThreadGroup,
11926  * :wait2, :$>]
11927  */
11928 
11929 static VALUE
11930 sym_all_symbols(VALUE _)
11931 {
11932  return rb_sym_all_symbols();
11933 }
11934 
11935 VALUE
11937 {
11938  return rb_fstring(str);
11939 }
11940 
11941 VALUE
11942 rb_interned_str(const char *ptr, long len)
11943 {
11944  struct RString fake_str;
11945  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11946 }
11947 
11948 VALUE
11950 {
11951  return rb_interned_str(ptr, strlen(ptr));
11952 }
11953 
11954 VALUE
11955 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11956 {
11957  if (UNLIKELY(rb_enc_autoload_p(enc))) {
11958  rb_enc_autoload(enc);
11959  }
11960 
11961  struct RString fake_str;
11962  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11963 }
11964 
11965 VALUE
11967 {
11968  return rb_enc_interned_str(ptr, strlen(ptr), enc);
11969 }
11970 
11971 /*
11972  * A \String object has an arbitrary sequence of bytes,
11973  * typically representing text or binary data.
11974  * A \String object may be created using String::new or as literals.
11975  *
11976  * String objects differ from Symbol objects in that Symbol objects are
11977  * designed to be used as identifiers, instead of text or data.
11978  *
11979  * You can create a \String object explicitly with:
11980  *
11981  * - A {string literal}[doc/syntax/literals_rdoc.html#label-String+Literals].
11982  * - A {heredoc literal}[doc/syntax/literals_rdoc.html#label-Here+Document+Literals].
11983  *
11984  * You can convert certain objects to Strings with:
11985  *
11986  * - \Method {String}[Kernel.html#method-i-String].
11987  *
11988  * Some \String methods modify +self+.
11989  * Typically, a method whose name ends with <tt>!</tt> modifies +self+
11990  * and returns +self+;
11991  * often a similarly named method (without the <tt>!</tt>)
11992  * returns a new string.
11993  *
11994  * In general, if there exist both bang and non-bang versions of a method,
11995  * the bang version mutates +self+ and the non-bang version does not.
11996  * However, a method without a bang can also mutate, such as String#replace.
11997  *
11998  * == Substitution Methods
11999  *
12000  * These methods perform substitutions:
12001  *
12002  * - String#sub: One substitution (or none); returns a new string.
12003  * - String#sub!: One substitution (or none); returns +self+.
12004  * - String#gsub: Zero or more substitutions; returns a new string.
12005  * - String#gsub!: Zero or more substitutions; returns +self+.
12006  *
12007  * Each of these methods takes:
12008  *
12009  * - A first argument, +pattern+ (string or regexp),
12010  * that specifies the substring(s) to be replaced.
12011  *
12012  * - Either of these:
12013  *
12014  * - A second argument, +replacement+ (string or hash),
12015  * that determines the replacing string.
12016  * - A block that will determine the replacing string.
12017  *
12018  * The examples in this section mostly use methods String#sub and String#gsub;
12019  * the principles illustrated apply to all four substitution methods.
12020  *
12021  * <b>Argument +pattern+</b>
12022  *
12023  * Argument +pattern+ is commonly a regular expression:
12024  *
12025  * s = 'hello'
12026  * s.sub(/[aeiou]/, '*') # => "h*llo"
12027  * s.gsub(/[aeiou]/, '*') # => "h*ll*"
12028  * s.gsub(/[aeiou]/, '') # => "hll"
12029  * s.sub(/ell/, 'al') # => "halo"
12030  * s.gsub(/xyzzy/, '*') # => "hello"
12031  * 'THX1138'.gsub(/\d+/, '00') # => "THX00"
12032  *
12033  * When +pattern+ is a string, all its characters are treated
12034  * as ordinary characters (not as regexp special characters):
12035  *
12036  * 'THX1138'.gsub('\d+', '00') # => "THX1138"
12037  *
12038  * <b>\String +replacement+</b>
12039  *
12040  * If +replacement+ is a string, that string will determine
12041  * the replacing string that is to be substituted for the matched text.
12042  *
12043  * Each of the examples above uses a simple string as the replacing string.
12044  *
12045  * \String +replacement+ may contain back-references to the pattern's captures:
12046  *
12047  * - <tt>\n</tt> (_n_ a non-negative integer) refers to <tt>$n</tt>.
12048  * - <tt>\k<name></tt> refers to the named capture +name+.
12049  *
12050  * See rdoc-ref:regexp.rdoc for details.
12051  *
12052  * Note that within the string +replacement+, a character combination
12053  * such as <tt>$&</tt> is treated as ordinary text, and not as
12054  * a special match variable.
12055  * However, you may refer to some special match variables using these
12056  * combinations:
12057  *
12058  * - <tt>\&</tt> and <tt>\0</tt> correspond to <tt>$&</tt>,
12059  * which contains the complete matched text.
12060  * - <tt>\'</tt> corresponds to <tt>$'</tt>,
12061  * which contains string after match.
12062  * - <tt>\`</tt> corresponds to <tt>$`</tt>,
12063  * which contains string before match.
12064  * - <tt>\+</tt> corresponds to <tt>$+</tt>,
12065  * which contains last capture group.
12066  *
12067  * See rdoc-ref:regexp.rdoc for details.
12068  *
12069  * Note that <tt>\\\</tt> is interpreted as an escape, i.e., a single backslash.
12070  *
12071  * Note also that a string literal consumes backslashes.
12072  * See {String Literals}[doc/syntax/literals_rdoc.html#label-String+Literals] for details about string literals.
12073  *
12074  * A back-reference is typically preceded by an additional backslash.
12075  * For example, if you want to write a back-reference <tt>\&</tt> in
12076  * +replacement+ with a double-quoted string literal, you need to write
12077  * <tt>"..\\\\&.."</tt>.
12078  *
12079  * If you want to write a non-back-reference string <tt>\&</tt> in
12080  * +replacement+, you need first to escape the backslash to prevent
12081  * this method from interpreting it as a back-reference, and then you
12082  * need to escape the backslashes again to prevent a string literal from
12083  * consuming them: <tt>"..\\\\\\\\&.."</tt>.
12084  *
12085  * You may want to use the block form to avoid a lot of backslashes.
12086  *
12087  * <b>\Hash +replacement+</b>
12088  *
12089  * If argument +replacement+ is a hash, and +pattern+ matches one of its keys,
12090  * the replacing string is the value for that key:
12091  *
12092  * h = {'foo' => 'bar', 'baz' => 'bat'}
12093  * 'food'.sub('foo', h) # => "bard"
12094  *
12095  * Note that a symbol key does not match:
12096  *
12097  * h = {foo: 'bar', baz: 'bat'}
12098  * 'food'.sub('foo', h) # => "d"
12099  *
12100  * <b>Block</b>
12101  *
12102  * In the block form, the current match string is passed to the block;
12103  * the block's return value becomes the replacing string:
12104  *
12105  * s = '@'
12106  * '1234'.gsub(/\d/) {|match| s.succ! } # => "ABCD"
12107  *
12108  * Special match variables such as <tt>$1</tt>, <tt>$2</tt>, <tt>$`</tt>,
12109  * <tt>$&</tt>, and <tt>$'</tt> are set appropriately.
12110  *
12111  *
12112  * == What's Here
12113  *
12114  * First, what's elsewhere. \Class \String:
12115  *
12116  * - Inherits from {class Object}[Object.html#class-Object-label-What-27s+Here].
12117  * - Includes {module Comparable}[Comparable.html#module-Comparable-label-What-27s+Here].
12118  *
12119  * Here, class \String provides methods that are useful for:
12120  *
12121  * - {Creating a String}[#class-String-label-Methods+for+Creating+a+String]
12122  * - {Frozen/Unfrozen Strings}[#class-String-label-Methods+for+a+Frozen-2FUnfrozen+String]
12123  * - {Querying}[#class-String-label-Methods+for+Querying]
12124  * - {Comparing}[#class-String-label-Methods+for+Comparing]
12125  * - {Modifying a String}[#class-String-label-Methods+for+Modifying+a+String]
12126  * - {Converting to New String}[#class-String-label-Methods+for+Converting+to+New+String]
12127  * - {Converting to Non-String}[#class-String-label-Methods+for+Converting+to+Non--5CString]
12128  * - {Iterating}[#class-String-label-Methods+for+Iterating]
12129  *
12130  * === Methods for Creating a \String
12131  *
12132  * - ::new:: Returns a new string.
12133  * - ::try_convert:: Returns a new string created from a given object.
12134  *
12135  * === Methods for a Frozen/Unfrozen String
12136  *
12137  * - {#+string}[#method-i-2B-40]:: Returns a string that is not frozen:
12138  * +self+, if not frozen; +self.dup+ otherwise.
12139  * - {#-string}[#method-i-2D-40]:: Returns a string that is frozen:
12140  * +self+, if already frozen; +self.freeze+ otherwise.
12141  * - #freeze:: Freezes +self+, if not already frozen; returns +self+.
12142  *
12143  * === Methods for Querying
12144  *
12145  * _Counts_
12146  *
12147  * - #length, #size:: Returns the count of characters (not bytes).
12148  * - #empty?:: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12149  * - #bytesize:: Returns the count of bytes.
12150  * - #count:: Returns the count of substrings matching given strings.
12151  *
12152  * _Substrings_
12153  *
12154  * - {#=~}[#method-i-3D~]:: Returns the index of the first substring that matches a given Regexp or other object;
12155  * returns +nil+ if no match is found.
12156  * - #index:: Returns the index of the _first_ occurrence of a given substring;
12157  * returns +nil+ if none found.
12158  * - #rindex:: Returns the index of the _last_ occurrence of a given substring;
12159  * returns +nil+ if none found.
12160  * - #include?:: Returns +true+ if the string contains a given substring; +false+ otherwise.
12161  * - #match:: Returns a MatchData object if the string matches a given Regexp; +nil+ otherwise.
12162  * - #match?:: Returns +true+ if the string matches a given Regexp; +false+ otherwise.
12163  * - #start_with?:: Returns +true+ if the string begins with any of the given substrings.
12164  * - #end_with?:: Returns +true+ if the string ends with any of the given substrings.
12165  *
12166  * _Encodings_
12167  *
12168  * - #encoding:: Returns the Encoding object that represents the encoding of the string.
12169  * - #unicode_normalized?:: Returns +true+ if the string is in Unicode normalized form; +false+ otherwise.
12170  * - #valid_encoding?:: Returns +true+ if the string contains only characters that are valid
12171  * for its encoding.
12172  * - #ascii_only?:: Returns +true+ if the string has only ASCII characters; +false+ otherwise.
12173  *
12174  * _Other_
12175  *
12176  * - #sum:: Returns a basic checksum for the string: the sum of each byte.
12177  * - #hash:: Returns the integer hash code.
12178  *
12179  * === Methods for Comparing
12180  *
12181  * - {#==, #===}[#method-i-3D-3D]:: Returns +true+ if a given other string has the same content as +self+.
12182  * - #eql?:: Returns +true+ if the content is the same as the given other string.
12183  * - {#<=>}[#method-i-3C-3D-3E]:: Returns -1, 0, or 1 as +self+ is smaller than, equal to, or larger than a given other string.
12184  * - #casecmp:: Ignoring case, returns -1, 0, or 1 as +self+
12185  * is smaller than, equal to, or larger than a given other string.
12186  * - #casecmp?:: Returns +true+ if the string is equal to a given string after Unicode case folding;
12187  * +false+ otherwise.
12188  *
12189  * === Methods for Modifying a \String
12190  *
12191  * Each of these methods modifies +self+.
12192  *
12193  * _Insertion_
12194  *
12195  * - #insert:: Returns +self+ with a given string inserted at a given offset.
12196  * - #<<:: Returns +self+ concatenated with a given string or integer.
12197  *
12198  * _Substitution_
12199  *
12200  * - #sub!:: Replaces the first substring that matches a given pattern with a given replacement string;
12201  * returns +self+ if any changes, +nil+ otherwise.
12202  * - #gsub!:: Replaces each substring that matches a given pattern with a given replacement string;
12203  * returns +self+ if any changes, +nil+ otherwise.
12204  * - #succ!, #next!:: Returns +self+ modified to become its own successor.
12205  * - #replace:: Returns +self+ with its entire content replaced by a given string.
12206  * - #reverse!:: Returns +self+ with its characters in reverse order.
12207  * - #setbyte:: Sets the byte at a given integer offset to a given value; returns the argument.
12208  * - #tr!:: Replaces specified characters in +self+ with specified replacement characters;
12209  * returns +self+ if any changes, +nil+ otherwise.
12210  * - #tr_s!:: Replaces specified characters in +self+ with specified replacement characters,
12211  * removing duplicates from the substrings that were modified;
12212  * returns +self+ if any changes, +nil+ otherwise.
12213  *
12214  * _Casing_
12215  *
12216  * - #capitalize!:: Upcases the initial character and downcases all others;
12217  * returns +self+ if any changes, +nil+ otherwise.
12218  * - #downcase!:: Downcases all characters; returns +self+ if any changes, +nil+ otherwise.
12219  * - #upcase!:: Upcases all characters; returns +self+ if any changes, +nil+ otherwise.
12220  * - #swapcase!:: Upcases each downcase character and downcases each upcase character;
12221  * returns +self+ if any changes, +nil+ otherwise.
12222  *
12223  * _Encoding_
12224  *
12225  * - #encode!:: Returns +self+ with all characters transcoded from one given encoding into another.
12226  * - #unicode_normalize!:: Unicode-normalizes +self+; returns +self+.
12227  * - #scrub!:: Replaces each invalid byte with a given character; returns +self+.
12228  * - #force_encoding:: Changes the encoding to a given encoding; returns +self+.
12229  *
12230  * _Deletion_
12231  *
12232  * - #clear:: Removes all content, so that +self+ is empty; returns +self+.
12233  * - #slice!, #[]=:: Removes a substring determined by a given index, start/length, range, regexp, or substring.
12234  * - #squeeze!:: Removes contiguous duplicate characters; returns +self+.
12235  * - #delete!:: Removes characters as determined by the intersection of substring arguments.
12236  * - #lstrip!:: Removes leading whitespace; returns +self+ if any changes, +nil+ otherwise.
12237  * - #rstrip!:: Removes trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12238  * - #strip!:: Removes leading and trailing whitespace; returns +self+ if any changes, +nil+ otherwise.
12239  * - #chomp!:: Removes trailing record separator, if found; returns +self+ if any changes, +nil+ otherwise.
12240  * - #chop!:: Removes trailing whitespace if found, otherwise removes the last character;
12241  * returns +self+ if any changes, +nil+ otherwise.
12242  *
12243  * === Methods for Converting to New \String
12244  *
12245  * Each of these methods returns a new \String based on +self+,
12246  * often just a modified copy of +self+.
12247  *
12248  * _Extension_
12249  *
12250  * - #*:: Returns the concatenation of multiple copies of +self+.
12251  * - #+:: Returns the concatenation of +self+ and a given other string.
12252  * - #center:: Returns a copy of +self+ centered between pad substring.
12253  * - #concat:: Returns the concatenation of +self+ with given other strings.
12254  * - #prepend:: Returns the concatenation of a given other string with +self+.
12255  * - #ljust:: Returns a copy of +self+ of a given length, right-padded with a given other string.
12256  * - #rjust:: Returns a copy of +self+ of a given length, left-padded with a given other string.
12257  *
12258  * _Encoding_
12259  *
12260  * - #b:: Returns a copy of +self+ with ASCII-8BIT encoding.
12261  * - #scrub:: Returns a copy of +self+ with each invalid byte replaced with a given character.
12262  * - #unicode_normalize:: Returns a copy of +self+ with each character Unicode-normalized.
12263  * - #encode:: Returns a copy of +self+ with all characters transcoded from one given encoding into another.
12264  *
12265  * _Substitution_
12266  *
12267  * - #dump:: Returns a copy of +self+ with all non-printing characters replaced by \xHH notation
12268  * and all special characters escaped.
12269  * - #undump:: Returns a copy of +self+ with all <tt>\xNN</tt> notation replaced by <tt>\uNNNN</tt> notation
12270  * and all escaped characters unescaped.
12271  * - #sub:: Returns a copy of +self+ with the first substring matching a given pattern
12272  * replaced with a given replacement string.
12273  * - #gsub:: Returns a copy of +self+ with each substring that matches a given pattern
12274  * replaced with a given replacement string.
12275  * - #succ, #next:: Returns the string that is the successor to +self+.
12276  * - #reverse:: Returns a copy of +self+ with its characters in reverse order.
12277  * - #tr:: Returns a copy of +self+ with specified characters replaced with specified replacement characters.
12278  * - #tr_s:: Returns a copy of +self+ with specified characters replaced with specified replacement characters,
12279  * removing duplicates from the substrings that were modified.
12280  * - #%:: Returns the string resulting from formatting a given object into +self+.
12281  *
12282  * _Casing_
12283  *
12284  * - #capitalize:: Returns a copy of +self+ with the first character upcased
12285  * and all other characters downcased.
12286  * - #downcase:: Returns a copy of +self+ with all characters downcased.
12287  * - #upcase:: Returns a copy of +self+ with all characters upcased.
12288  * - #swapcase:: Returns a copy of +self+ with all upcase characters downcased
12289  * and all downcase characters upcased.
12290  *
12291  * _Deletion_
12292  *
12293  * - #delete:: Returns a copy of +self+ with characters removed.
12294  * - #delete_prefix:: Returns a copy of +self+ with a given prefix removed.
12295  * - #delete_suffix:: Returns a copy of +self+ with a given suffix removed.
12296  * - #lstrip:: Returns a copy of +self+ with leading whitespace removed.
12297  * - #rstrip:: Returns a copy of +self+ with trailing whitespace removed.
12298  * - #strip:: Returns a copy of +self+ with leading and trailing whitespace removed.
12299  * - #chomp:: Returns a copy of +self+ with a trailing record separator removed, if found.
12300  * - #chop:: Returns a copy of +self+ with trailing whitespace or the last character removed.
12301  * - #squeeze:: Returns a copy of +self+ with contiguous duplicate characters removed.
12302  * - #[], #slice:: Returns a substring determined by a given index, start/length, range, or string.
12303  * - #byteslice:: Returns a substring determined by a given index, start/length, or range.
12304  * - #chr:: Returns the first character.
12305  *
12306  * _Duplication_
12307  *
12308  * - #to_s, #to_str:: If +self+ is a subclass of \String, returns +self+ copied into a \String;
12309  * otherwise, returns +self+.
12310  *
12311  * === Methods for Converting to Non-\String
12312  *
12313  * Each of these methods converts the contents of +self+ to a non-\String.
12314  *
12315  * <em>Characters, Bytes, and Clusters</em>
12316  *
12317  * - #bytes:: Returns an array of the bytes in +self+.
12318  * - #chars:: Returns an array of the characters in +self+.
12319  * - #codepoints:: Returns an array of the integer ordinals in +self+.
12320  * - #getbyte:: Returns an integer byte as determined by a given index.
12321  * - #grapheme_clusters:: Returns an array of the grapheme clusters in +self+.
12322  *
12323  * _Splitting_
12324  *
12325  * - #lines:: Returns an array of the lines in +self+, as determined by a given record separator.
12326  * - #partition:: Returns a 3-element array determined by the first substring that matches
12327  * a given substring or regexp.
12328  * - #rpartition:: Returns a 3-element array determined by the last substring that matches
12329  * a given substring or regexp.
12330  * - #split:: Returns an array of substrings determined by a given delimiter -- regexp or string --
12331  * or, if a block given, passes those substrings to the block.
12332  *
12333  * _Matching_
12334  *
12335  * - #scan:: Returns an array of substrings matching a given regexp or string, or,
12336  * if a block given, passes each matching substring to the block.
12337  * - #unpack:: Returns an array of substrings extracted from +self+ according to a given format.
12338  * - #unpack1:: Returns the first substring extracted from +self+ according to a given format.
12339  *
12340  * _Numerics_
12341  *
12342  * - #hex:: Returns the integer value of the leading characters, interpreted as hexadecimal digits.
12343  * - #oct:: Returns the integer value of the leading characters, interpreted as octal digits.
12344  * - #ord:: Returns the integer ordinal of the first character in +self+.
12345  * - #to_i:: Returns the integer value of leading characters, interpreted as an integer.
12346  * - #to_f:: Returns the floating-point value of leading characters, interpreted as a floating-point number.
12347  *
12348  * <em>Strings and Symbols</em>
12349  *
12350  * - #inspect:: Returns a copy of +self+, enclosed in double-quotes, with special characters escaped.
12351  * - #to_sym, #intern:: Returns the symbol corresponding to +self+.
12352  *
12353  * === Methods for Iterating
12354  *
12355  * - #each_byte:: Calls the given block with each successive byte in +self+.
12356  * - #each_char:: Calls the given block with each successive character in +self+.
12357  * - #each_codepoint:: Calls the given block with each successive integer codepoint in +self+.
12358  * - #each_grapheme_cluster:: Calls the given block with each successive grapheme cluster in +self+.
12359  * - #each_line:: Calls the given block with each successive line in +self+,
12360  * as determined by a given record separator.
12361  * - #upto:: Calls the given block with each string value returned by successive calls to #succ.
12362  */
12363 
12364 void
12365 Init_String(void)
12366 {
       /*
        * Registers the String and Symbol classes and their methods, along
        * with the $; / $-F hooked variables and the UnicodeNormalize module.
        *
        * Each rb_define_method() call binds a Ruby-visible method name to a
        * C function; the trailing integer is the arity it is registered with.
        *
        * NOTE(review): the embedded line numbering in this listing skips some
        * values (e.g. 12370, 12376-12377, 12382-12383, 12389, 12395, 12397,
        * 12419, 12448, 12528, 12531-12533, 12539) -- confirm against the
        * upstream source that no statements were dropped.
        */

       /* Create String as a subclass of Object. */
12367  rb_cString = rb_define_class("String", rb_cObject);
       /*
        * Walk the VM fstring table with fstring_set_class_i, passing the new
        * class (presumably to fix up strings created before String existed --
        * TODO confirm against fstring_set_class_i).
        */
12368  assert(rb_vm_fstring_table());
12369  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12371  rb_define_alloc_func(rb_cString, empty_str_alloc);
12372  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
       /* "initialize_copy" shares rb_str_replace with "replace" below. */
12373  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12374  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12375  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12378  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12379  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12380  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12381  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12384  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12385  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12386  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12387  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12388  rb_define_method(rb_cString, "length", rb_str_length, 0);
12390  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12391  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12392  rb_define_method(rb_cString, "=~", rb_str_match, 1);
12393  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12394  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
       /* "succ!" and "next!" are bound to the same C function. */
12396  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12398  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12399  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12400  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12401  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12402  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12403  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12404  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12405  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12406  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12407  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12408  rb_define_method(rb_cString, "scrub", str_scrub, -1);
12409  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12410  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12411  rb_define_method(rb_cString, "+@", str_uplus, 0);
12412  rb_define_method(rb_cString, "-@", str_uminus, 0);
12413 
       /* Conversions; "to_s" and "to_str" are bound to the same C function. */
12414  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12415  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12416  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12417  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12418  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12420  rb_define_method(rb_cString, "undump", str_undump, 0);
12421 
       /*
        * Cache symbols in file-level variables (presumably the option
        * symbols accepted by the case-mapping methods registered right
        * below -- TODO confirm at their use sites).
        */
12422  sym_ascii = ID2SYM(rb_intern_const("ascii"));
12423  sym_turkic = ID2SYM(rb_intern_const("turkic"));
12424  sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12425  sym_fold = ID2SYM(rb_intern_const("fold"));
12426 
12427  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12428  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12429  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12430  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12431 
12432  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12433  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12434  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12435  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12436 
12437  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12438  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12439  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12440  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12441  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12442  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12443  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12444  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12445  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12446  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12447  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12449  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12450  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12451  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12452  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12453  rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12454 
12455  rb_define_method(rb_cString, "include?", rb_str_include, 1);
12456  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12457  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12458 
12459  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12460 
12461  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12462  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12463  rb_define_method(rb_cString, "center", rb_str_center, -1);
12464 
       /* Non-bang substitution/trimming methods, followed by their bang counterparts. */
12465  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12466  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12467  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12468  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12469  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12470  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12471  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12472  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12473  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12474 
12475  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12476  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12477  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12478  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12479  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12480  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12481  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12482  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12483  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12484 
12485  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12486  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12487  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12488  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12489  rb_define_method(rb_cString, "count", rb_str_count, -1);
12490 
12491  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12492  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12493  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12494  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12495 
12496  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12497  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12498  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12499  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12500  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12501 
12502  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12503 
       /* "slice" is bound to the same C function as "[]" above. */
12504  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12505  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12506 
12507  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12508  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12509 
12510  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12511  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12512  rb_define_method(rb_cString, "b", rb_str_b, 0);
12513  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12514  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12515 
12516  /* define UnicodeNormalize module here so that we don't have to look it up */
12517  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12518  id_normalize = rb_intern_const("normalize");
12519  id_normalized_p = rb_intern_const("normalized?");
12520 
12521  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12522  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12523  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12524 
       /*
        * rb_fs starts as nil; both $; and $-F are hooked to the same
        * variable, with rb_fs_setter as the setter and 0 passed as the getter.
        */
12525  rb_fs = Qnil;
12526  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12527  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12529 
       /* Create Symbol as a subclass of Object and register its methods. */
12530  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12534  rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12535 
       /* "==" and "===" are bound to the same C function. */
12536  rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12537  rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12538  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12540  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12541  rb_define_method(rb_cSymbol, "name", rb_sym2str, 0);
12542  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
12543  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
12544  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
12545  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12546  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12547 
12548  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12549  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12550  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12551  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12552 
       /* "[]"/"slice" and "length"/"size" are alias pairs sharing one C function each. */
12553  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12554  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12555  rb_define_method(rb_cSymbol, "length", sym_length, 0);
12556  rb_define_method(rb_cSymbol, "size", sym_length, 0);
12557  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12558  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12559  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12560 
12561  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12562  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12563  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12564  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12565 
12566  rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12567  rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12568 
12569  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12570 }
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition: assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition: assert.h:167
Atomic operations.
#define RUBY_ATOMIC_CAS(var, oldval, newval)
Atomic compare-and-swap.
Definition: atomic.h:138
std::atomic< unsigned > rb_atomic_t
Type that is eligible for atomic operations.
Definition: atomic.h:69
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition: ctype.h:395
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition: ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition: ctype.h:166
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition: ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1182
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition: fl_type.h:912
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition: fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition: class.c:1043
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:837
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition: class.c:948
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:1938
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition: class.c:2406
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:1914
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:854
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition: class.c:2195
#define TYPE(_)
Old name of rb_type.
Definition: value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition: value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition: fl_type.h:142
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition: fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition: memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition: ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition: coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition: coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:145
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition: assume.h:30
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition: fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition: value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:31
#define SYM2ID
Old name of RB_SYM2ID.
Definition: symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition: coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition: size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition: fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition: encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition: long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition: coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition: memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition: fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition: fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition: array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition: long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition: ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition: encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition: st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition: encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition: fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition: long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition: util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition: fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition: double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition: ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition: encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition: fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition: fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition: fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition: int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition: encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition: coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition: fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition: encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
Definition: error.c:428
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3025
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:675
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition: error.c:3137
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:802
VALUE rb_eRangeError
RangeError exception.
Definition: error.c:1103
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1099
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition: error.c:3076
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1106
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1097
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1100
VALUE rb_eIndexError
IndexError exception.
Definition: error.c:1101
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:983
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition: error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition: object.c:553
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition: object.c:1909
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition: object.c:1173
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition: object.c:3325
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:188
VALUE rb_cSymbol
Symbol class.
Definition: string.c:81
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition: object.c:120
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1161
VALUE rb_mComparable
Comparable module.
Definition: compar.c:19
VALUE rb_cString
String class.
Definition: string.c:80
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:2998
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition: rgengc.h:220
Encoding relates APIs.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1573
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1637
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1539
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1284
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:697
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1192
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1533
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1216
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1521
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1176
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1270
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:689
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1724
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:197
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:718
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:1030
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition: encoding.c:1515
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1097
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:676
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:657
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:463
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1222
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.
Definition: encoding.c:1038
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:740
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:448
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:635
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1246
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition: encoding.c:1545
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1592
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1182
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition: string.c:2735
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:776
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition: string.c:1034
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition: string.c:1067
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition: string.c:11955
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition: re.c:247
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition: string.c:2071
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition: string.c:3271
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:980
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:940
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition: string.c:1287
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition: string.c:1188
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:790
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1206
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition: string.c:11966
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:668
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition: symbol.c:406
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1449
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2929
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2614
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1705
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition: vm_eval.c:1102
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1061
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition: vm_eval.c:1189
void rb_gc_register_address(VALUE *valptr)
Inform the garbage collector that valptr points to a live Ruby object that should not be moved.
Definition: gc.c:8708
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:989
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:750
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
Definition: array.c:1308
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Identical to rb_cstr2inum(), except it takes Ruby's strings instead of C's.
Definition: bignum.c:4280
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition: enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition: enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition: error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition: error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:294
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1896
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2082
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2903
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Identical to rb_hash_aref(), except it always returns RUBY_Qnil for misses.
Definition: hash.c:2108
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1529
VALUE rb_rs
The record separator character for inputs, or the $/.
Definition: io.c:202
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition: string.c:553
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition: io.c:204
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition: vm.c:1580
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition: symbol.c:991
void rb_backref_set(VALUE md)
Updates $~.
Definition: vm.c:1586
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition: range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition: re.c:1197
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition: re.c:3659
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition: re.c:3260
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition: re.c:1377
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition: re.c:1793
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition: string.c:11936
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition: string.c:1546
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition: string.c:1350
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition: string.c:2232
VALUE rb_utf8_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "UTF-8" encoding.
Definition: string.c:932
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition: string.c:3317
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition: string.c:1263
VALUE rb_utf8_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "UTF-8" encoding.
Definition: string.c:972
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition: string.c:11518
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition: string.c:2304
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition: string.c:1239
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1540
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition: string.c:2763
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition: string.c:4564
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition: string.c:3526
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition: string.c:2821
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:10802
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition: random.c:1720
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1593
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition: string.c:1016
VALUE rb_str_buf_cat(VALUE, const char *, long)
Just another name of rb_str_cat.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:828
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1356
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1808
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2459
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition: string.c:3516
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition: string.c:3161
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition: string.c:2160
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition: string.c:1814
VALUE rb_usascii_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "US ASCII" encoding.
Definition: string.c:924
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition: string.c:5841
VALUE rb_usascii_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "US ASCII" encoding.
Definition: string.c:964
VALUE rb_str_buf_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition: string.c:2844
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition: string.h:1177
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition: string.c:11949
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition: string.c:1269
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition: string.c:3302
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition: string.c:2810
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition: string.c:3628
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3039
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition: string.c:6456
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition: string.c:2511
VALUE rb_str_buf_new_cstr(const char *ptr)
This is a rb_str_buf_new() + rb_str_buf_cat() combo.
Definition: string.c:1528
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition: string.c:11942
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition: string.c:3582
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition: string.c:3418
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:1040
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition: string.c:3557
#define rb_strlen_lit(str)
Length of a string literal.
Definition: string.h:1756
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL-terminated ...
Definition: string.c:3278
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition: string.c:2963
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition: string.c:5145
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:918
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition: string.c:10860
VALUE rb_str_dup_frozen(VALUE)
Just another name of rb_str_new_frozen.
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition: string.c:1487
VALUE rb_locale_str_new_cstr(const char *ptr)
Identical to rb_locale_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1257
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2659
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition: string.c:2931
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition: string.c:3022
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:952
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3056
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition: string.c:1028
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2467
VALUE rb_tainted_str_new_cstr(const char *ptr)
Definition: string.c:1047
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:6567
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition: string.c:1251
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1506
VALUE rb_external_str_new_cstr(const char *ptr)
Identical to rb_external_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1245
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition: string.c:2180
VALUE rb_str_cat_cstr(VALUE dst, const char *src)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:3171
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition: string.c:5071
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition: string.c:8676
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition: string.c:1022
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver is an instance of RString.
Definition: symbol.c:837
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition: string.c:1657
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2765
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition: vm_method.c:1117
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:782
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:924
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition: string.c:11904
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: string.c:11894
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: symbol.c:788
VALUE rb_id2str(ID id)
Identical to rb_id2name(), except it returns a Ruby's String instead of C's.
Definition: symbol.c:935
void rb_define_hooked_variable(const char *name, VALUE *var, rb_gvar_getter_t *getter, rb_gvar_setter_t *setter)
Identical to rb_define_virtual_variable(), but can also specify a storage.
Definition: variable.c:563
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition: re.c:1697
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition: re.c:3053
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition: re.c:3856
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition: sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition: vm_eval.c:1357
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition: memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition: rarray.h:324
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition: rarray.h:69
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition: rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition: rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition: rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition: rmatch.h:139
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition: rregexp.h:103
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition: rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition: string.c:1281
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:527
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:497
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:423
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition: rstring.h:553
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition: rstring.h:573
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition: string.c:2531
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a...
Definition: string.c:2636
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition: string.c:2520
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:483
#define RSTRING(obj)
Convenient casting macro.
Definition: rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition: string.c:1275
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition: string.c:1584
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C stri...
Definition: rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition: load.c:1270
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition: stdarg.h:35
VALUE flags
Per-object flags.
Definition: rbasic.h:77
Ruby's String.
Definition: rstring.h:231
struct RString::@47::@48 heap
Strings that use separated memory region for contents use this pattern.
union RString::@47 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition: rstring.h:234
long capa
Capacity of *ptr.
Definition: rstring.h:268
struct RString::@47::@49 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition: rstring.h:250
VALUE shared
Parent of the string.
Definition: rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition: rstring.h:258
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:190
Definition: st.h:79
Definition: string.c:7522
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition: thread.c:440
void rb_nativethread_lock_initialize(rb_nativethread_lock_t *lock)
Fills the passed lock with an initial value.
Definition: thread.c:428
void rb_nativethread_lock_destroy(rb_nativethread_lock_t *lock)
Destroys the passed mutex.
Definition: thread.c:434
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition: value_type.h:432
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:375
void ruby_xfree(void *ptr)
Deallocates a storage instance.
Definition: gc.c:11775