Ruby  3.1.4p223 (2023-03-30 revision HEAD)
encoding.c
1 /**********************************************************************
2 
3  encoding.c -
4 
5  $Author$
6  created at: Thu May 24 17:23:27 JST 2007
7 
8  Copyright (C) 2007 Yukihiro Matsumoto
9 
10 **********************************************************************/
11 
12 #include "ruby/internal/config.h"
13 
14 #include <ctype.h>
15 
16 #include "encindex.h"
17 #include "internal.h"
18 #include "internal/enc.h"
19 #include "internal/encoding.h"
20 #include "internal/inits.h"
21 #include "internal/load.h"
22 #include "internal/object.h"
23 #include "internal/string.h"
24 #include "internal/vm.h"
25 #include "regenc.h"
26 #include "ruby/encoding.h"
27 #include "ruby/util.h"
28 #include "ruby_assert.h"
29 #include "vm_sync.h"
30 
31 #ifndef ENC_DEBUG
32 #define ENC_DEBUG 0
33 #endif
34 #define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)
35 #define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
36 
37 #undef rb_ascii8bit_encindex
38 #undef rb_utf8_encindex
39 #undef rb_usascii_encindex
40 
42 
43 #if defined __GNUC__ && __GNUC__ >= 4
44 #pragma GCC visibility push(default)
45 int rb_enc_register(const char *name, rb_encoding *encoding);
46 void rb_enc_set_base(const char *name, const char *orig);
47 int rb_enc_set_dummy(int index);
48 void rb_encdb_declare(const char *name);
49 int rb_encdb_replicate(const char *name, const char *orig);
50 int rb_encdb_dummy(const char *name);
51 int rb_encdb_alias(const char *alias, const char *orig);
52 void rb_encdb_set_unicode(int index);
53 #pragma GCC visibility pop
54 #endif
55 
56 static ID id_encoding;
58 
59 #define DEFAULT_ENCODING_LIST_CAPA 128
60 static VALUE rb_default_encoding_list;
61 static VALUE rb_additional_encoding_list;
62 
64  const char *name;
65  rb_encoding *enc;
66  rb_encoding *base;
67 };
68 
69 static struct enc_table {
70  struct rb_encoding_entry *list;
71  int count;
72  int size;
73  st_table *names;
74 } global_enc_table;
75 
76 static rb_encoding *global_enc_ascii,
77  *global_enc_utf_8,
78  *global_enc_us_ascii;
79 
80 #define GLOBAL_ENC_TABLE_ENTER(enc_table) struct enc_table *enc_table = &global_enc_table; RB_VM_LOCK_ENTER()
81 #define GLOBAL_ENC_TABLE_LEAVE() RB_VM_LOCK_LEAVE()
82 #define GLOBAL_ENC_TABLE_EVAL(enc_table, expr) do { \
83  GLOBAL_ENC_TABLE_ENTER(enc_table); \
84  { \
85  expr; \
86  } \
87  GLOBAL_ENC_TABLE_LEAVE(); \
88 } while (0)
89 
90 
91 #define ENC_DUMMY_FLAG (1<<24)
92 #define ENC_INDEX_MASK (~(~0U<<24))
93 
94 #define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
95 #define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
96 #define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)
97 
98 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
99 #define UNSPECIFIED_ENCODING INT_MAX
100 
101 #define ENCODING_NAMELEN_MAX 63
102 #define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
103 
104 static const rb_data_type_t encoding_data_type = {
105  "encoding",
106  {0, 0, 0,},
107  0, 0, RUBY_TYPED_FREE_IMMEDIATELY
108 };
109 
110 #define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
111 #define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
112 
113 int
114 rb_data_is_encoding(VALUE obj)
115 {
116  return is_data_encoding(obj);
117 }
118 
119 static VALUE
120 enc_new(rb_encoding *encoding)
121 {
122  VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding);
123  rb_obj_freeze(enc);
125  return enc;
126 }
127 
128 static void
129 enc_list_update(int index, rb_raw_encoding *encoding)
130 {
131  if (index < DEFAULT_ENCODING_LIST_CAPA) {
132  VALUE list = rb_default_encoding_list;
133  if (list && NIL_P(rb_ary_entry(list, index))) {
134  /* initialize encoding data */
135  rb_ary_store(list, index, enc_new(encoding));
136  }
137  }
138  else {
139  RB_VM_LOCK_ENTER();
140  {
141  VALUE list = rb_additional_encoding_list;
142  if (list && NIL_P(rb_ary_entry(list, index))) {
143  /* initialize encoding data */
144  rb_ary_store(list, index - DEFAULT_ENCODING_LIST_CAPA, enc_new(encoding));
145  }
146  }
147  RB_VM_LOCK_LEAVE();
148  }
149 }
150 
151 static VALUE
152 enc_list_lookup(int idx)
153 {
154  VALUE list, enc;
155 
156  if (idx < DEFAULT_ENCODING_LIST_CAPA) {
157  if (!(list = rb_default_encoding_list)) {
158  rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx);
159  }
160  enc = rb_ary_entry(list, idx);
161  }
162  else {
163  RB_VM_LOCK_ENTER();
164  {
165  if (!(list = rb_additional_encoding_list)) {
166  rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx);
167  }
168  enc = rb_ary_entry(list, idx - DEFAULT_ENCODING_LIST_CAPA);
169  }
170  RB_VM_LOCK_LEAVE();
171  }
172 
173  if (NIL_P(enc)) {
174  rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
175  }
176  else {
177  return enc;
178  }
179 }
180 
181 static VALUE
182 rb_enc_from_encoding_index(int idx)
183 {
184  return enc_list_lookup(idx);
185 }
186 
187 VALUE
189 {
190  int idx;
191  if (!encoding) return Qnil;
192  idx = ENC_TO_ENCINDEX(encoding);
193  return rb_enc_from_encoding_index(idx);
194 }
195 
196 int
198 {
199  return enc ? ENC_TO_ENCINDEX(enc) : 0;
200 }
201 
202 int
204 {
205  return ENC_DUMMY_P(enc) != 0;
206 }
207 
208 static int
209 check_encoding(rb_encoding *enc)
210 {
211  int index = rb_enc_to_index(enc);
212  if (rb_enc_from_index(index) != enc)
213  return -1;
214  if (rb_enc_autoload_p(enc)) {
215  index = rb_enc_autoload(enc);
216  }
217  return index;
218 }
219 
220 static int
221 enc_check_encoding(VALUE obj)
222 {
223  if (!is_obj_encoding(obj)) {
224  return -1;
225  }
226  return check_encoding(RDATA(obj)->data);
227 }
228 
229 NORETURN(static void not_encoding(VALUE enc));
230 static void
231 not_encoding(VALUE enc)
232 {
233  rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
234  rb_obj_class(enc));
235 }
236 
237 static rb_encoding *
238 must_encoding(VALUE enc)
239 {
240  int index = enc_check_encoding(enc);
241  if (index < 0) {
242  not_encoding(enc);
243  }
244  return DATA_PTR(enc);
245 }
246 
247 static rb_encoding *
248 must_encindex(int index)
249 {
250  rb_encoding *enc = rb_enc_from_index(index);
251  if (!enc) {
252  rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
253  index);
254  }
255  if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
256  rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
257  index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc));
258  }
259  if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) {
260  rb_loaderror("failed to load encoding (%s)",
261  rb_enc_name(enc));
262  }
263  return enc;
264 }
265 
266 int
268 {
269  int idx;
270  const char *name;
271 
272  idx = enc_check_encoding(enc);
273  if (idx >= 0) {
274  return idx;
275  }
276  else if (NIL_P(enc = rb_check_string_type(enc))) {
277  return -1;
278  }
279  if (!rb_enc_asciicompat(rb_enc_get(enc))) {
280  return -1;
281  }
282  if (!(name = rb_str_to_cstr(enc))) {
283  return -1;
284  }
285  return rb_enc_find_index(name);
286 }
287 
288 static const char *
289 name_for_encoding(volatile VALUE *enc)
290 {
291  VALUE name = StringValue(*enc);
292  const char *n;
293 
294  if (!rb_enc_asciicompat(rb_enc_get(name))) {
295  rb_raise(rb_eArgError, "invalid encoding name (non ASCII)");
296  }
297  if (!(n = rb_str_to_cstr(name))) {
298  rb_raise(rb_eArgError, "invalid encoding name (NUL byte)");
299  }
300  return n;
301 }
302 
303 /* Returns encoding index or UNSPECIFIED_ENCODING */
304 static int
305 str_find_encindex(VALUE enc)
306 {
307  int idx = rb_enc_find_index(name_for_encoding(&enc));
308  RB_GC_GUARD(enc);
309  return idx;
310 }
311 
312 static int
313 str_to_encindex(VALUE enc)
314 {
315  int idx = str_find_encindex(enc);
316  if (idx < 0) {
317  rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
318  }
319  return idx;
320 }
321 
322 static rb_encoding *
323 str_to_encoding(VALUE enc)
324 {
325  return rb_enc_from_index(str_to_encindex(enc));
326 }
327 
328 rb_encoding *
330 {
331  if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
332  return str_to_encoding(enc);
333 }
334 
335 rb_encoding *
337 {
338  int idx;
339  if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
340  idx = str_find_encindex(enc);
341  if (idx < 0) return NULL;
342  return rb_enc_from_index(idx);
343 }
344 
345 static int
346 enc_table_expand(struct enc_table *enc_table, int newsize)
347 {
348  struct rb_encoding_entry *ent;
349  int count = newsize;
350 
351  if (enc_table->size >= newsize) return newsize;
352  newsize = (newsize + 7) / 8 * 8;
353  ent = REALLOC_N(enc_table->list, struct rb_encoding_entry, newsize);
354  memset(ent + enc_table->size, 0, sizeof(*ent)*(newsize - enc_table->size));
355  enc_table->list = ent;
356  enc_table->size = newsize;
357  return count;
358 }
359 
360 static int
361 enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding)
362 {
363  struct rb_encoding_entry *ent = &enc_table->list[index];
364  rb_raw_encoding *encoding;
365 
366  if (!valid_encoding_name_p(name)) return -1;
367  if (!ent->name) {
368  ent->name = name = strdup(name);
369  }
370  else if (STRCASECMP(name, ent->name)) {
371  return -1;
372  }
373  encoding = (rb_raw_encoding *)ent->enc;
374  if (!encoding) {
375  encoding = xmalloc(sizeof(rb_encoding));
376  }
377  if (base_encoding) {
378  *encoding = *base_encoding;
379  }
380  else {
381  memset(encoding, 0, sizeof(*ent->enc));
382  }
383  encoding->name = name;
384  encoding->ruby_encoding_index = index;
385  ent->enc = encoding;
386  st_insert(enc_table->names, (st_data_t)name, (st_data_t)index);
387 
388  enc_list_update(index, encoding);
389  return index;
390 }
391 
392 static int
393 enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
394 {
395  int index = enc_table->count;
396 
397  enc_table->count = enc_table_expand(enc_table, index + 1);
398  return enc_register_at(enc_table, index, name, encoding);
399 }
400 
401 static void set_encoding_const(const char *, rb_encoding *);
402 static int enc_registered(struct enc_table *enc_table, const char *name);
403 
404 static rb_encoding *
405 enc_from_index(struct enc_table *enc_table, int index)
406 {
407  if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) {
408  return 0;
409  }
410  return enc_table->list[index].enc;
411 }
412 
413 rb_encoding *
415 {
416  rb_encoding *enc;
417 
418  switch (index) {
419  case ENCINDEX_ASCII: return global_enc_ascii;
420  case ENCINDEX_UTF_8: return global_enc_utf_8;
421  case ENCINDEX_US_ASCII: return global_enc_us_ascii;
422  default:
423  GLOBAL_ENC_TABLE_EVAL(enc_table,
424  enc = enc_from_index(enc_table, index));
425  return enc;
426  }
427 }
428 
429 int
430 rb_enc_register(const char *name, rb_encoding *encoding)
431 {
432  int index;
433 
434  GLOBAL_ENC_TABLE_ENTER(enc_table);
435  {
436  index = enc_registered(enc_table, name);
437 
438  if (index >= 0) {
439  rb_encoding *oldenc = enc_from_index(enc_table, index);
440  if (STRCASECMP(name, rb_enc_name(oldenc))) {
441  index = enc_register(enc_table, name, encoding);
442  }
443  else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
444  enc_register_at(enc_table, index, name, encoding);
445  }
446  else {
447  rb_raise(rb_eArgError, "encoding %s is already registered", name);
448  }
449  }
450  else {
451  index = enc_register(enc_table, name, encoding);
452  set_encoding_const(name, rb_enc_from_index(index));
453  }
454  }
455  GLOBAL_ENC_TABLE_LEAVE();
456  return index;
457 }
458 
459 int
460 enc_registered(struct enc_table *enc_table, const char *name)
461 {
462  st_data_t idx = 0;
463 
464  if (!name) return -1;
465  if (!enc_table->list) return -1;
466  if (st_lookup(enc_table->names, (st_data_t)name, &idx)) {
467  return (int)idx;
468  }
469  return -1;
470 }
471 
472 void
473 rb_encdb_declare(const char *name)
474 {
475  GLOBAL_ENC_TABLE_ENTER(enc_table);
476  {
477  int idx = enc_registered(enc_table, name);
478  if (idx < 0) {
479  idx = enc_register(enc_table, name, 0);
480  }
481  set_encoding_const(name, rb_enc_from_index(idx));
482  }
483  GLOBAL_ENC_TABLE_LEAVE();
484 }
485 
486 static void
487 enc_check_duplication(struct enc_table *enc_table, const char *name)
488 {
489  if (enc_registered(enc_table, name) >= 0) {
490  rb_raise(rb_eArgError, "encoding %s is already registered", name);
491  }
492 }
493 
494 static rb_encoding*
495 set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base)
496 {
497  rb_encoding *enc = enc_table->list[index].enc;
498 
499  ASSUME(enc);
500  enc_table->list[index].base = base;
501  if (ENC_DUMMY_P(base)) ENC_SET_DUMMY((rb_raw_encoding *)enc);
502  return enc;
503 }
504 
505 /* for encdb.h
506  * Set base encoding for encodings which are not replicas
507  * but not in their own files.
508  */
509 void
510 rb_enc_set_base(const char *name, const char *orig)
511 {
512  GLOBAL_ENC_TABLE_ENTER(enc_table);
513  {
514  int idx = enc_registered(enc_table, name);
515  int origidx = enc_registered(enc_table, orig);
516  set_base_encoding(enc_table, idx, rb_enc_from_index(origidx));
517  }
518  GLOBAL_ENC_TABLE_LEAVE();
519 }
520 
521 /* for encdb.h
522  * Set encoding dummy.
523  */
524 int
525 rb_enc_set_dummy(int index)
526 {
527  rb_encoding *enc;
528 
529  GLOBAL_ENC_TABLE_EVAL(enc_table,
530  enc = enc_table->list[index].enc);
531 
532  ENC_SET_DUMMY((rb_raw_encoding *)enc);
533  return index;
534 }
535 
536 static int
537 enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
538 {
539  int idx;
540 
541  enc_check_duplication(enc_table, name);
542  idx = enc_register(enc_table, name, encoding);
543  if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name);
544  set_base_encoding(enc_table, idx, encoding);
545  set_encoding_const(name, rb_enc_from_index(idx));
546  return idx;
547 }
548 
549 int
550 rb_enc_replicate(const char *name, rb_encoding *encoding)
551 {
552  int r;
553 
554  GLOBAL_ENC_TABLE_EVAL(enc_table,
555  r = enc_replicate(enc_table, name, encoding));
556 
557  return r;
558 }
559 
560 /*
561  * call-seq:
562  * enc.replicate(name) -> encoding
563  *
564  * Returns a replicated encoding of _enc_ whose name is _name_.
565  * The new encoding should have the same byte structure of _enc_.
566  * If _name_ is used by another encoding, raise ArgumentError.
567  *
568  */
569 static VALUE
570 enc_replicate_m(VALUE encoding, VALUE name)
571 {
572  int idx = rb_enc_replicate(name_for_encoding(&name), rb_to_encoding(encoding));
573  RB_GC_GUARD(name);
574  return rb_enc_from_encoding_index(idx);
575 }
576 
577 static int
578 enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx)
579 {
580  if (idx < 0) {
581  idx = enc_register(enc_table, name, origenc);
582  }
583  else {
584  idx = enc_register_at(enc_table, idx, name, origenc);
585  }
586  if (idx >= 0) {
587  set_base_encoding(enc_table, idx, origenc);
588  set_encoding_const(name, rb_enc_from_index(idx));
589  }
590  else {
591  rb_raise(rb_eArgError, "failed to replicate encoding");
592  }
593  return idx;
594 }
595 
596 int
597 rb_encdb_replicate(const char *name, const char *orig)
598 {
599  int r;
600 
601  GLOBAL_ENC_TABLE_ENTER(enc_table);
602  {
603  int origidx = enc_registered(enc_table, orig);
604  int idx = enc_registered(enc_table, name);
605 
606  if (origidx < 0) {
607  origidx = enc_register(enc_table, orig, 0);
608  }
609  r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx);
610  }
611  GLOBAL_ENC_TABLE_LEAVE();
612 
613  return r;
614 }
615 
616 int
617 rb_define_dummy_encoding(const char *name)
618 {
619  int index;
620 
621  GLOBAL_ENC_TABLE_ENTER(enc_table);
622  {
623  index = enc_replicate(enc_table, name, rb_ascii8bit_encoding());
624  rb_encoding *enc = enc_table->list[index].enc;
625  ENC_SET_DUMMY((rb_raw_encoding *)enc);
626  }
627  GLOBAL_ENC_TABLE_LEAVE();
628 
629  return index;
630 }
631 
632 int
633 rb_encdb_dummy(const char *name)
634 {
635  int index;
636 
637  GLOBAL_ENC_TABLE_ENTER(enc_table);
638  {
639  index = enc_replicate_with_index(enc_table, name,
641  enc_registered(enc_table, name));
642  rb_encoding *enc = enc_table->list[index].enc;
643  ENC_SET_DUMMY((rb_raw_encoding *)enc);
644  }
645  GLOBAL_ENC_TABLE_LEAVE();
646 
647  return index;
648 }
649 
650 /*
651  * call-seq:
652  * enc.dummy? -> true or false
653  *
654  * Returns true for dummy encodings.
655  * A dummy encoding is an encoding for which character handling is not properly
656  * implemented.
657  * It is used for stateful encodings.
658  *
659  * Encoding::ISO_2022_JP.dummy? #=> true
660  * Encoding::UTF_8.dummy? #=> false
661  *
662  */
663 static VALUE
664 enc_dummy_p(VALUE enc)
665 {
666  return RBOOL(ENC_DUMMY_P(must_encoding(enc)));
667 }
668 
669 /*
670  * call-seq:
671  * enc.ascii_compatible? -> true or false
672  *
673  * Returns whether ASCII-compatible or not.
674  *
675  * Encoding::UTF_8.ascii_compatible? #=> true
676  * Encoding::UTF_16BE.ascii_compatible? #=> false
677  *
678  */
679 static VALUE
680 enc_ascii_compatible_p(VALUE enc)
681 {
682  return RBOOL(rb_enc_asciicompat(must_encoding(enc)));
683 }
684 
685 /*
686  * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0.
687  */
688 int
690 {
691  return ONIGENC_IS_UNICODE(enc);
692 }
693 
694 static st_data_t
695 enc_dup_name(st_data_t name)
696 {
697  return (st_data_t)strdup((const char *)name);
698 }
699 
700 /*
701  * Returns copied alias name when the key is added for st_table,
702  * else returns NULL.
703  */
704 static int
705 enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx)
706 {
707  return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx,
708  enc_dup_name);
709 }
710 
/* Adds +alias+ for the encoding at +idx+.  Returns +idx+, or -1 when the
 * alias is not a valid encoding name.  The Encoding constant is defined
 * only when enc_alias_internal() returns zero. */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }
    if (enc_alias_internal(enc_table, alias, idx) == 0) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
719 
720 int
721 rb_enc_alias(const char *alias, const char *orig)
722 {
723  int idx, r;
724 
725  GLOBAL_ENC_TABLE_ENTER(enc_table);
726  {
727  enc_check_duplication(enc_table, alias);
728  if ((idx = rb_enc_find_index(orig)) < 0) {
729  r = -1;
730  }
731  else {
732  r = enc_alias(enc_table, alias, idx);
733  }
734  }
735  GLOBAL_ENC_TABLE_LEAVE();
736 
737  return r;
738 }
739 
740 int
741 rb_encdb_alias(const char *alias, const char *orig)
742 {
743  int r;
744 
745  GLOBAL_ENC_TABLE_ENTER(enc_table);
746  {
747  int idx = enc_registered(enc_table, orig);
748 
749  if (idx < 0) {
750  idx = enc_register(enc_table, orig, 0);
751  }
752  r = enc_alias(enc_table, alias, idx);
753  }
754  GLOBAL_ENC_TABLE_LEAVE();
755 
756  return r;
757 }
758 
759 void
760 rb_encdb_set_unicode(int index)
761 {
763  ASSUME(enc);
764  enc->flags |= ONIGENC_FLAG_UNICODE;
765 }
766 
767 static void
768 rb_enc_init(struct enc_table *enc_table)
769 {
770  enc_table_expand(enc_table, ENCODING_COUNT + 1);
771  if (!enc_table->names) {
772  enc_table->names = st_init_strcasetable();
773  }
774 #define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
775  ENC_REGISTER(ASCII);
776  ENC_REGISTER(UTF_8);
777  ENC_REGISTER(US_ASCII);
778  global_enc_ascii = enc_table->list[ENCINDEX_ASCII].enc;
779  global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
780  global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
781 #undef ENC_REGISTER
782 #define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
783  ENCDB_REGISTER("UTF-16BE", UTF_16BE);
784  ENCDB_REGISTER("UTF-16LE", UTF_16LE);
785  ENCDB_REGISTER("UTF-32BE", UTF_32BE);
786  ENCDB_REGISTER("UTF-32LE", UTF_32LE);
787  ENCDB_REGISTER("UTF-16", UTF_16);
788  ENCDB_REGISTER("UTF-32", UTF_32);
789  ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
790 
791  ENCDB_REGISTER("EUC-JP", EUC_JP);
792  ENCDB_REGISTER("Windows-31J", Windows_31J);
793 #undef ENCDB_REGISTER
794  enc_table->count = ENCINDEX_BUILTIN_MAX;
795 }
796 
797 rb_encoding *
798 rb_enc_get_from_index(int index)
799 {
800  return must_encindex(index);
801 }
802 
803 int rb_require_internal_silent(VALUE fname);
804 
805 static int
806 load_encoding(const char *name)
807 {
808  VALUE enclib = rb_sprintf("enc/%s.so", name);
809  VALUE debug = ruby_debug;
810  VALUE errinfo;
811  char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
812  int loaded;
813  int idx;
814 
815  while (s < e) {
816  if (!ISALNUM(*s)) *s = '_';
817  else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
818  ++s;
819  }
820  enclib = rb_fstring(enclib);
821  ruby_debug = Qfalse;
822  errinfo = rb_errinfo();
823  loaded = rb_require_internal_silent(enclib);
824  ruby_debug = debug;
825  rb_set_errinfo(errinfo);
826 
827  GLOBAL_ENC_TABLE_ENTER(enc_table);
828  {
829  if (loaded < 0 || 1 < loaded) {
830  idx = -1;
831  }
832  else if ((idx = enc_registered(enc_table, name)) < 0) {
833  idx = -1;
834  }
835  else if (rb_enc_autoload_p(enc_table->list[idx].enc)) {
836  idx = -1;
837  }
838  }
839  GLOBAL_ENC_TABLE_LEAVE();
840 
841  return idx;
842 }
843 
844 static int
845 enc_autoload_body(struct enc_table *enc_table, rb_encoding *enc)
846 {
847  rb_encoding *base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
848 
849  if (base) {
850  int i = 0;
851  do {
852  if (i >= enc_table->count) return -1;
853  } while (enc_table->list[i].enc != base && (++i, 1));
854  if (rb_enc_autoload_p(base)) {
855  if (rb_enc_autoload(base) < 0) return -1;
856  }
857  i = enc->ruby_encoding_index;
858  enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
859  ((rb_raw_encoding *)enc)->ruby_encoding_index = i;
860  i &= ENC_INDEX_MASK;
861  return i;
862  }
863  else {
864  return -2;
865  }
866 }
867 
868 int
869 rb_enc_autoload(rb_encoding *enc)
870 {
871  int i;
872  GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_autoload_body(enc_table, enc));
873  if (i == -2) {
874  i = load_encoding(rb_enc_name(enc));
875  }
876  return i;
877 }
878 
879 /* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
880 int
881 rb_enc_find_index(const char *name)
882 {
883  int i;
884  rb_encoding *enc;
885 
886  GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_registered(enc_table, name));
887 
888  if (i < 0) {
889  i = load_encoding(name);
890  }
891  else if (!(enc = rb_enc_from_index(i))) {
892  if (i != UNSPECIFIED_ENCODING) {
893  rb_raise(rb_eArgError, "encoding %s is not registered", name);
894  }
895  }
896  else if (rb_enc_autoload_p(enc)) {
897  if (rb_enc_autoload(enc) < 0) {
898  rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
899  name);
900  return 0;
901  }
902  }
903  return i;
904 }
905 
906 int
907 rb_enc_find_index2(const char *name, long len)
908 {
909  char buf[ENCODING_NAMELEN_MAX+1];
910 
911  if (len > ENCODING_NAMELEN_MAX) return -1;
912  memcpy(buf, name, len);
913  buf[len] = '\0';
914  return rb_enc_find_index(buf);
915 }
916 
917 rb_encoding *
918 rb_enc_find(const char *name)
919 {
920  int idx = rb_enc_find_index(name);
921  if (idx < 0) idx = 0;
922  return rb_enc_from_index(idx);
923 }
924 
925 static inline int
926 enc_capable(VALUE obj)
927 {
928  if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
929  switch (BUILTIN_TYPE(obj)) {
930  case T_STRING:
931  case T_REGEXP:
932  case T_FILE:
933  case T_SYMBOL:
934  return TRUE;
935  case T_DATA:
936  if (is_data_encoding(obj)) return TRUE;
937  default:
938  return FALSE;
939  }
940 }
941 
942 int
944 {
945  return enc_capable(obj);
946 }
947 
948 ID
949 rb_id_encoding(void)
950 {
951  CONST_ID(id_encoding, "encoding");
952  return id_encoding;
953 }
954 
955 static int
956 enc_get_index_str(VALUE str)
957 {
958  int i = ENCODING_GET_INLINED(str);
959  if (i == ENCODING_INLINE_MAX) {
960  VALUE iv;
961 
962 #if 0
963  iv = rb_ivar_get(str, rb_id_encoding());
964  i = NUM2INT(iv);
965 #else
966  /*
967  * Tentatively, assume ASCII-8BIT, if encoding index instance
968  * variable is not found. This can happen when freeing after
969  * all instance variables are removed in `obj_free`.
970  */
971  iv = rb_attr_get(str, rb_id_encoding());
972  i = NIL_P(iv) ? ENCINDEX_ASCII : NUM2INT(iv);
973 #endif
974  }
975  return i;
976 }
977 
978 int
980 {
981  int i = -1;
982  VALUE tmp;
983 
984  if (SPECIAL_CONST_P(obj)) {
985  if (!SYMBOL_P(obj)) return -1;
986  obj = rb_sym2str(obj);
987  }
988  switch (BUILTIN_TYPE(obj)) {
989  case T_STRING:
990  case T_SYMBOL:
991  case T_REGEXP:
992  i = enc_get_index_str(obj);
993  break;
994  case T_FILE:
995  tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
996  if (NIL_P(tmp)) {
997  tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0);
998  }
999  if (is_obj_encoding(tmp)) {
1000  i = enc_check_encoding(tmp);
1001  }
1002  break;
1003  case T_DATA:
1004  if (is_data_encoding(obj)) {
1005  i = enc_check_encoding(obj);
1006  }
1007  break;
1008  default:
1009  break;
1010  }
1011  return i;
1012 }
1013 
1014 static void
1015 enc_set_index(VALUE obj, int idx)
1016 {
1017  if (!enc_capable(obj)) {
1018  rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object");
1019  }
1020 
1021  if (idx < ENCODING_INLINE_MAX) {
1022  ENCODING_SET_INLINED(obj, idx);
1023  return;
1024  }
1026  rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1027 }
1028 
1029 void
1031 {
1032  rb_check_frozen(obj);
1033  must_encindex(idx);
1034  enc_set_index(obj, idx);
1035 }
1036 
1037 VALUE
1039 {
1040  rb_encoding *enc;
1041  int oldidx, oldtermlen, termlen;
1042 
1043 /* enc_check_capable(obj);*/
1044  rb_check_frozen(obj);
1045  oldidx = rb_enc_get_index(obj);
1046  if (oldidx == idx)
1047  return obj;
1048  if (SPECIAL_CONST_P(obj)) {
1049  rb_raise(rb_eArgError, "cannot set encoding");
1050  }
1051  enc = must_encindex(idx);
1052  if (!ENC_CODERANGE_ASCIIONLY(obj) ||
1053  !rb_enc_asciicompat(enc)) {
1054  ENC_CODERANGE_CLEAR(obj);
1055  }
1056  termlen = rb_enc_mbminlen(enc);
1057  oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
1058  if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) {
1059  rb_str_change_terminator_length(obj, oldtermlen, termlen);
1060  }
1061  enc_set_index(obj, idx);
1062  return obj;
1063 }
1064 
1065 VALUE
1067 {
1068  return rb_enc_associate_index(obj, rb_enc_to_index(enc));
1069 }
1070 
1071 rb_encoding*
1073 {
1074  return rb_enc_from_index(rb_enc_get_index(obj));
1075 }
1076 
1077 static rb_encoding*
1078 rb_encoding_check(rb_encoding* enc, VALUE str1, VALUE str2)
1079 {
1080  if (!enc)
1081  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1082  rb_enc_name(rb_enc_get(str1)),
1083  rb_enc_name(rb_enc_get(str2)));
1084  return enc;
1085 }
1086 
1087 static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
1088 
1089 rb_encoding*
1090 rb_enc_check_str(VALUE str1, VALUE str2)
1091 {
1092  rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2));
1093  return rb_encoding_check(enc, str1, str2);
1094 }
1095 
1096 rb_encoding*
1098 {
1099  rb_encoding *enc = rb_enc_compatible(str1, str2);
1100  return rb_encoding_check(enc, str1, str2);
1101 }
1102 
1103 static rb_encoding*
1104 enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
1105 {
1106  int isstr1, isstr2;
1107  rb_encoding *enc1 = rb_enc_from_index(idx1);
1108  rb_encoding *enc2 = rb_enc_from_index(idx2);
1109 
1110  isstr2 = RB_TYPE_P(str2, T_STRING);
1111  if (isstr2 && RSTRING_LEN(str2) == 0)
1112  return enc1;
1113  isstr1 = RB_TYPE_P(str1, T_STRING);
1114  if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
1115  return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
1116  if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
1117  return 0;
1118  }
1119 
1120  /* objects whose encoding is the same of contents */
1121  if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
1122  return enc1;
1123  if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
1124  return enc2;
1125 
1126  if (!isstr1) {
1127  VALUE tmp = str1;
1128  int idx0 = idx1;
1129  str1 = str2;
1130  str2 = tmp;
1131  idx1 = idx2;
1132  idx2 = idx0;
1133  idx0 = isstr1;
1134  isstr1 = isstr2;
1135  isstr2 = idx0;
1136  }
1137  if (isstr1) {
1138  int cr1, cr2;
1139 
1140  cr1 = rb_enc_str_coderange(str1);
1141  if (isstr2) {
1142  cr2 = rb_enc_str_coderange(str2);
1143  if (cr1 != cr2) {
1144  /* may need to handle ENC_CODERANGE_BROKEN */
1145  if (cr1 == ENC_CODERANGE_7BIT) return enc2;
1146  if (cr2 == ENC_CODERANGE_7BIT) return enc1;
1147  }
1148  if (cr2 == ENC_CODERANGE_7BIT) {
1149  return enc1;
1150  }
1151  }
1152  if (cr1 == ENC_CODERANGE_7BIT)
1153  return enc2;
1154  }
1155  return 0;
1156 }
1157 
1158 static rb_encoding*
1159 enc_compatible_str(VALUE str1, VALUE str2)
1160 {
1161  int idx1 = enc_get_index_str(str1);
1162  int idx2 = enc_get_index_str(str2);
1163 
1164  if (idx1 < 0 || idx2 < 0)
1165  return 0;
1166 
1167  if (idx1 == idx2) {
1168  return rb_enc_from_index(idx1);
1169  }
1170  else {
1171  return enc_compatible_latter(str1, str2, idx1, idx2);
1172  }
1173 }
1174 
1175 rb_encoding*
1177 {
1178  int idx1 = rb_enc_get_index(str1);
1179  int idx2 = rb_enc_get_index(str2);
1180 
1181  if (idx1 < 0 || idx2 < 0)
1182  return 0;
1183 
1184  if (idx1 == idx2) {
1185  return rb_enc_from_index(idx1);
1186  }
1187 
1188  return enc_compatible_latter(str1, str2, idx1, idx2);
1189 }
1190 
1191 void
1193 {
1195 }
1196 
1197 
1198 /*
1199  * call-seq:
1200  * obj.encoding -> encoding
1201  *
1202  * Returns the Encoding object that represents the encoding of obj.
1203  */
1204 
1205 VALUE
1207 {
1208  int idx = rb_enc_get_index(obj);
1209  if (idx < 0) {
1210  rb_raise(rb_eTypeError, "unknown encoding");
1211  }
1212  return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
1213 }
1214 
1215 int
1216 rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
1217 {
1218  return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1219 }
1220 
1221 int
1222 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
1223 {
1224  int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1225  if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
1226  return MBCLEN_CHARFOUND_LEN(n);
1227  else {
1228  int min = rb_enc_mbminlen(enc);
1229  return min <= e-p ? min : (int)(e-p);
1230  }
1231 }
1232 
1233 int
1234 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
1235 {
1236  int n;
1237  if (e <= p)
1238  return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
1239  n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1240  if (e-p < n)
1241  return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
1242  return n;
1243 }
1244 
1245 int
1246 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
1247 {
1248  unsigned int c;
1249  int l;
1250  if (e <= p)
1251  return -1;
1252  if (rb_enc_asciicompat(enc)) {
1253  c = (unsigned char)*p;
1254  if (!ISASCII(c))
1255  return -1;
1256  if (len) *len = 1;
1257  return c;
1258  }
1259  l = rb_enc_precise_mbclen(p, e, enc);
1260  if (!MBCLEN_CHARFOUND_P(l))
1261  return -1;
1262  c = rb_enc_mbc_to_codepoint(p, e, enc);
1263  if (!rb_enc_isascii(c, enc))
1264  return -1;
1265  if (len) *len = l;
1266  return c;
1267 }
1268 
1269 unsigned int
1270 rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
1271 {
1272  int r;
1273  if (e <= p)
1274  rb_raise(rb_eArgError, "empty string");
1275  r = rb_enc_precise_mbclen(p, e, enc);
1276  if (!MBCLEN_CHARFOUND_P(r)) {
1277  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1278  }
1279  if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
1280  return rb_enc_mbc_to_codepoint(p, e, enc);
1281 }
1282 
1283 int
1285 {
1286  int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
1287  if (n == 0) {
1288  rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
1289  }
1290  return n;
1291 }
1292 
1293 int
1295 {
1296  return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
1297 }
1298 
1299 int
1301 {
1302  return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
1303 }
1304 
1305 /*
1306  * call-seq:
1307  * enc.inspect -> string
1308  *
1309  * Returns a string which represents the encoding for programmers.
1310  *
1311  * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
1312  * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1313  */
1314 static VALUE
1315 enc_inspect(VALUE self)
1316 {
1317  rb_encoding *enc;
1318 
1319  if (!is_data_encoding(self)) {
1320  not_encoding(self);
1321  }
1322  if (!(enc = DATA_PTR(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
1323  rb_raise(rb_eTypeError, "broken Encoding");
1324  }
1326  "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
1327  rb_enc_name(enc),
1328  (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
1329  rb_enc_autoload_p(enc) ? " (autoload)" : "");
1330 }
1331 
1332 /*
1333  * call-seq:
1334  * enc.name -> string
1335  * enc.to_s -> string
1336  *
1337  * Returns the name of the encoding.
1338  *
1339  * Encoding::UTF_8.name #=> "UTF-8"
1340  */
1341 static VALUE
1342 enc_name(VALUE self)
1343 {
1344  return rb_fstring_cstr(rb_enc_name((rb_encoding*)DATA_PTR(self)));
1345 }
1346 
1347 static int
1348 enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1349 {
1350  VALUE *arg = (VALUE *)args;
1351 
1352  if ((int)idx == (int)arg[0]) {
1353  VALUE str = rb_fstring_cstr((char *)name);
1354  rb_ary_push(arg[1], str);
1355  }
1356  return ST_CONTINUE;
1357 }
1358 
1359 /*
1360  * call-seq:
1361  * enc.names -> array
1362  *
1363  * Returns the list of name and aliases of the encoding.
1364  *
1365  * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1366  */
1367 static VALUE
1368 enc_names(VALUE self)
1369 {
1370  VALUE args[2];
1371 
1372  args[0] = (VALUE)rb_to_encoding_index(self);
1373  args[1] = rb_ary_new2(0);
1374 
1375  GLOBAL_ENC_TABLE_EVAL(enc_table,
1376  st_foreach(enc_table->names, enc_names_i, (st_data_t)args));
1377 
1378  return args[1];
1379 }
1380 
1381 /*
1382  * call-seq:
1383  * Encoding.list -> [enc1, enc2, ...]
1384  *
1385  * Returns the list of loaded encodings.
1386  *
1387  * Encoding.list
1388  * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1389  * #<Encoding:ISO-2022-JP (dummy)>]
1390  *
1391  * Encoding.find("US-ASCII")
1392  * #=> #<Encoding:US-ASCII>
1393  *
1394  * Encoding.list
1395  * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1396  * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1397  *
1398  */
1399 static VALUE
1400 enc_list(VALUE klass)
1401 {
1402  VALUE ary = rb_ary_new2(0);
1403 
1404  RB_VM_LOCK_ENTER();
1405  {
1406  rb_ary_replace(ary, rb_default_encoding_list);
1407  rb_ary_concat(ary, rb_additional_encoding_list);
1408  }
1409  RB_VM_LOCK_LEAVE();
1410 
1411  return ary;
1412 }
1413 
1414 /*
1415  * call-seq:
1416  * Encoding.find(string) -> enc
1417  *
1418  * Searches for the encoding with the specified <i>name</i>.
1419  * <i>name</i> should be a string.
1420  *
1421  * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1422  *
1423  * Names which this method accepts are encoding names and aliases,
1424  * including the following special aliases:
1425  *
1426  * "external":: default external encoding
1427  * "internal":: default internal encoding
1428  * "locale":: locale encoding
1429  * "filesystem":: filesystem encoding
1430  *
1431  * An ArgumentError is raised when there is no encoding with <i>name</i>.
1432  * Only <code>Encoding.find("internal")</code> however returns nil
1433  * when no encoding named "internal", in other words, when Ruby has no
1434  * default internal encoding.
1435  */
1436 static VALUE
1437 enc_find(VALUE klass, VALUE enc)
1438 {
1439  int idx;
1440  if (is_obj_encoding(enc))
1441  return enc;
1442  idx = str_to_encindex(enc);
1443  if (idx == UNSPECIFIED_ENCODING) return Qnil;
1444  return rb_enc_from_encoding_index(idx);
1445 }
1446 
1447 /*
1448  * call-seq:
1449  * Encoding.compatible?(obj1, obj2) -> enc or nil
1450  *
1451  * Checks the compatibility of two objects.
1452  *
1453  * If the objects are both strings they are compatible when they are
1454  * concatenatable. The encoding of the concatenated string will be returned
1455  * if they are compatible, nil if they are not.
1456  *
1457  * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1458  * #=> #<Encoding:ISO-8859-1>
1459  *
1460  * Encoding.compatible?(
1461  * "\xa1".force_encoding("iso-8859-1"),
1462  * "\xa1\xa1".force_encoding("euc-jp"))
1463  * #=> nil
1464  *
1465  * If the objects are non-strings their encodings are compatible when they
1466  * have an encoding and:
1467  * * Either encoding is US-ASCII compatible
1468  * * One of the encodings is a 7-bit encoding
1469  *
1470  */
1471 static VALUE
1472 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1473 {
1474  rb_encoding *enc;
1475 
1476  if (!enc_capable(str1)) return Qnil;
1477  if (!enc_capable(str2)) return Qnil;
1478  enc = rb_enc_compatible(str1, str2);
1479  if (!enc) return Qnil;
1480  return rb_enc_from_encoding(enc);
1481 }
1482 
1483 NORETURN(static VALUE enc_s_alloc(VALUE klass));
1484 /* :nodoc: */
1485 static VALUE
1486 enc_s_alloc(VALUE klass)
1487 {
1488  rb_undefined_alloc(klass);
1490 }
1491 
1492 /* :nodoc: */
1493 static VALUE
1494 enc_dump(int argc, VALUE *argv, VALUE self)
1495 {
1496  rb_check_arity(argc, 0, 1);
1497  return enc_name(self);
1498 }
1499 
1500 /* :nodoc: */
1501 static VALUE
1502 enc_load(VALUE klass, VALUE str)
1503 {
1504  return str;
1505 }
1506 
1507 /* :nodoc: */
1508 static VALUE
1509 enc_m_loader(VALUE klass, VALUE str)
1510 {
1511  return enc_find(klass, str);
1512 }
1513 
1514 rb_encoding *
1516 {
1517  return global_enc_ascii;
1518 }
1519 
1520 int
1522 {
1523  return ENCINDEX_ASCII;
1524 }
1525 
1526 rb_encoding *
1528 {
1529  return global_enc_utf_8;
1530 }
1531 
1532 int
1534 {
1535  return ENCINDEX_UTF_8;
1536 }
1537 
1538 rb_encoding *
1540 {
1541  return global_enc_us_ascii;
1542 }
1543 
1544 int
1546 {
1547  return ENCINDEX_US_ASCII;
1548 }
1549 
1550 int rb_locale_charmap_index(void);
1551 
1552 int
1554 {
1555  int idx = rb_locale_charmap_index();
1556 
1557  if (idx < 0) idx = ENCINDEX_UTF_8;
1558 
1559  GLOBAL_ENC_TABLE_ENTER(enc_table);
1560  if (enc_registered(enc_table, "locale") < 0) {
1561 # if defined _WIN32
1562  void Init_w32_codepage(void);
1563  Init_w32_codepage();
1564 # endif
1565  enc_alias_internal(enc_table, "locale", idx);
1566  }
1567  GLOBAL_ENC_TABLE_LEAVE();
1568 
1569  return idx;
1570 }
1571 
1572 rb_encoding *
1574 {
1576 }
1577 
1578 int
1580 {
1581  int idx;
1582 
1583  GLOBAL_ENC_TABLE_EVAL(enc_table,
1584  idx = enc_registered(enc_table, "filesystem"));
1585 
1586  if (idx < 0)
1587  idx = ENCINDEX_ASCII;
1588  return idx;
1589 }
1590 
1591 rb_encoding *
1593 {
1595 }
1596 
1598  int index; /* -2 => not yet set, -1 => nil */
1599  rb_encoding *enc;
1600 };
1601 
1602 static struct default_encoding default_external = {0};
1603 
1604 static int
1605 enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1606 {
1607  int overridden = FALSE;
1608 
1609  if (def->index != -2)
1610  /* Already set */
1611  overridden = TRUE;
1612 
1613  GLOBAL_ENC_TABLE_ENTER(enc_table);
1614  {
1615  if (NIL_P(encoding)) {
1616  def->index = -1;
1617  def->enc = 0;
1618  st_insert(enc_table->names, (st_data_t)strdup(name),
1619  (st_data_t)UNSPECIFIED_ENCODING);
1620  }
1621  else {
1622  def->index = rb_enc_to_index(rb_to_encoding(encoding));
1623  def->enc = 0;
1624  enc_alias_internal(enc_table, name, def->index);
1625  }
1626 
1627  if (def == &default_external) {
1628  enc_alias_internal(enc_table, "filesystem", Init_enc_set_filesystem_encoding());
1629  }
1630  }
1631  GLOBAL_ENC_TABLE_LEAVE();
1632 
1633  return overridden;
1634 }
1635 
1636 rb_encoding *
1638 {
1639  if (default_external.enc) return default_external.enc;
1640 
1641  if (default_external.index >= 0) {
1642  default_external.enc = rb_enc_from_index(default_external.index);
1643  return default_external.enc;
1644  }
1645  else {
1646  return rb_locale_encoding();
1647  }
1648 }
1649 
1650 VALUE
1652 {
1654 }
1655 
1656 /*
1657  * call-seq:
1658  * Encoding.default_external -> enc
1659  *
1660  * Returns default external encoding.
1661  *
1662  * The default external encoding is used by default for strings created from
1663  * the following locations:
1664  *
1665  * * CSV
1666  * * File data read from disk
1667  * * SDBM
1668  * * StringIO
1669  * * Zlib::GzipReader
1670  * * Zlib::GzipWriter
1671  * * String#inspect
1672  * * Regexp#inspect
1673  *
1674  * While strings created from these locations will have this encoding, the
1675  * encoding may not be valid. Be sure to check String#valid_encoding?.
1676  *
1677  * File data written to disk will be transcoded to the default external
1678  * encoding when written, if default_internal is not nil.
1679  *
1680  * The default external encoding is initialized by the -E option.
1681  * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1682  * other operating systems.
1683  */
1684 static VALUE
1685 get_default_external(VALUE klass)
1686 {
1687  return rb_enc_default_external();
1688 }
1689 
1690 void
1692 {
1693  if (NIL_P(encoding)) {
1694  rb_raise(rb_eArgError, "default external can not be nil");
1695  }
1696  enc_set_default_encoding(&default_external, encoding,
1697  "external");
1698 }
1699 
1700 /*
1701  * call-seq:
1702  * Encoding.default_external = enc
1703  *
1704  * Sets default external encoding. You should not set
1705  * Encoding::default_external in ruby code as strings created before changing
1706  * the value may have a different encoding from strings created after the value
1707  * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1708  * the correct default_external.
1709  *
1710  * See Encoding::default_external for information on how the default external
1711  * encoding is used.
1712  */
1713 static VALUE
1714 set_default_external(VALUE klass, VALUE encoding)
1715 {
1716  rb_warning("setting Encoding.default_external");
1717  rb_enc_set_default_external(encoding);
1718  return encoding;
1719 }
1720 
1721 static struct default_encoding default_internal = {-2};
1722 
1723 rb_encoding *
1725 {
1726  if (!default_internal.enc && default_internal.index >= 0) {
1727  default_internal.enc = rb_enc_from_index(default_internal.index);
1728  }
1729  return default_internal.enc; /* can be NULL */
1730 }
1731 
1732 VALUE
1734 {
1735  /* Note: These functions cope with default_internal not being set */
1737 }
1738 
1739 /*
1740  * call-seq:
1741  * Encoding.default_internal -> enc
1742  *
1743  * Returns default internal encoding. Strings will be transcoded to the
1744  * default internal encoding in the following places if the default internal
1745  * encoding is not nil:
1746  *
1747  * * CSV
1748  * * Etc.sysconfdir and Etc.systmpdir
1749  * * File data read from disk
1750  * * File names from Dir
1751  * * Integer#chr
1752  * * String#inspect and Regexp#inspect
1753  * * Strings returned from Readline
1754  * * Strings returned from SDBM
1755  * * Time#zone
1756  * * Values from ENV
1757  * * Values in ARGV including $PROGRAM_NAME
1758  *
1759  * Additionally String#encode and String#encode! use the default internal
1760  * encoding if no encoding is given.
1761  *
1762  * The script encoding (__ENCODING__), not default_internal, is used as the
1763  * encoding of created strings.
1764  *
1765  * Encoding::default_internal is initialized with -E option or nil otherwise.
1766  */
1767 static VALUE
1768 get_default_internal(VALUE klass)
1769 {
1770  return rb_enc_default_internal();
1771 }
1772 
1773 void
1775 {
1776  enc_set_default_encoding(&default_internal, encoding,
1777  "internal");
1778 }
1779 
1780 /*
1781  * call-seq:
1782  * Encoding.default_internal = enc or nil
1783  *
1784  * Sets default internal encoding or removes default internal encoding when
1785  * passed nil. You should not set Encoding::default_internal in ruby code as
1786  * strings created before changing the value may have a different encoding
1787  * from strings created after the change. Instead you should use
1788  * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1789  *
1790  * See Encoding::default_internal for information on how the default internal
1791  * encoding is used.
1792  */
1793 static VALUE
1794 set_default_internal(VALUE klass, VALUE encoding)
1795 {
1796  rb_warning("setting Encoding.default_internal");
1797  rb_enc_set_default_internal(encoding);
1798  return encoding;
1799 }
1800 
1801 static void
1802 set_encoding_const(const char *name, rb_encoding *enc)
1803 {
1804  VALUE encoding = rb_enc_from_encoding(enc);
1805  char *s = (char *)name;
1806  int haslower = 0, hasupper = 0, valid = 0;
1807 
1808  if (ISDIGIT(*s)) return;
1809  if (ISUPPER(*s)) {
1810  hasupper = 1;
1811  while (*++s && (ISALNUM(*s) || *s == '_')) {
1812  if (ISLOWER(*s)) haslower = 1;
1813  }
1814  }
1815  if (!*s) {
1816  if (s - name > ENCODING_NAMELEN_MAX) return;
1817  valid = 1;
1818  rb_define_const(rb_cEncoding, name, encoding);
1819  }
1820  if (!valid || haslower) {
1821  size_t len = s - name;
1822  if (len > ENCODING_NAMELEN_MAX) return;
1823  if (!haslower || !hasupper) {
1824  do {
1825  if (ISLOWER(*s)) haslower = 1;
1826  if (ISUPPER(*s)) hasupper = 1;
1827  } while (*++s && (!haslower || !hasupper));
1828  len = s - name;
1829  }
1830  len += strlen(s);
1831  if (len++ > ENCODING_NAMELEN_MAX) return;
1832  MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1833  name = s;
1834  if (!valid) {
1835  if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1836  for (; *s; ++s) {
1837  if (!ISALNUM(*s)) *s = '_';
1838  }
1839  if (hasupper) {
1840  rb_define_const(rb_cEncoding, name, encoding);
1841  }
1842  }
1843  if (haslower) {
1844  for (s = (char *)name; *s; ++s) {
1845  if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1846  }
1847  rb_define_const(rb_cEncoding, name, encoding);
1848  }
1849  }
1850 }
1851 
1852 static int
1853 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1854 {
1855  VALUE ary = (VALUE)arg;
1856  VALUE str = rb_fstring_cstr((char *)name);
1857  rb_ary_push(ary, str);
1858  return ST_CONTINUE;
1859 }
1860 
1861 /*
1862  * call-seq:
1863  * Encoding.name_list -> ["enc1", "enc2", ...]
1864  *
1865  * Returns the list of available encoding names.
1866  *
1867  * Encoding.name_list
1868  * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1869  * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1870  * "Windows-31J",
1871  * "BINARY", "CP932", "eucJP"]
1872  *
1873  */
1874 
1875 static VALUE
1876 rb_enc_name_list(VALUE klass)
1877 {
1878  VALUE ary;
1879 
1880  GLOBAL_ENC_TABLE_ENTER(enc_table);
1881  {
1882  ary = rb_ary_new2(enc_table->names->num_entries);
1883  st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
1884  }
1885  GLOBAL_ENC_TABLE_LEAVE();
1886 
1887  return ary;
1888 }
1889 
1890 static int
1891 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1892 {
1893  VALUE *p = (VALUE *)arg;
1894  VALUE aliases = p[0], ary = p[1];
1895  int idx = (int)orig;
1896  VALUE key, str = rb_ary_entry(ary, idx);
1897 
1898  if (NIL_P(str)) {
1899  rb_encoding *enc = rb_enc_from_index(idx);
1900 
1901  if (!enc) return ST_CONTINUE;
1902  if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1903  return ST_CONTINUE;
1904  }
1905  str = rb_fstring_cstr(rb_enc_name(enc));
1906  rb_ary_store(ary, idx, str);
1907  }
1908  key = rb_fstring_cstr((char *)name);
1909  rb_hash_aset(aliases, key, str);
1910  return ST_CONTINUE;
1911 }
1912 
1913 /*
1914  * call-seq:
1915  * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1916  *
1917  * Returns the hash of available encoding alias and original encoding name.
1918  *
1919  * Encoding.aliases
1920  * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1921  * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1922  *
1923  */
1924 
1925 static VALUE
1926 rb_enc_aliases(VALUE klass)
1927 {
1928  VALUE aliases[2];
1929  aliases[0] = rb_hash_new();
1930  aliases[1] = rb_ary_new();
1931 
1932  GLOBAL_ENC_TABLE_EVAL(enc_table,
1933  st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases));
1934 
1935  return aliases[0];
1936 }
1937 
1938 /*
1939  * An Encoding instance represents a character encoding usable in Ruby. It is
1940  * defined as a constant under the Encoding namespace. It has a name and
1941  * optionally, aliases:
1942  *
1943  * Encoding::ISO_8859_1.name
1944  * #=> "ISO-8859-1"
1945  *
1946  * Encoding::ISO_8859_1.names
1947  * #=> ["ISO-8859-1", "ISO8859-1"]
1948  *
1949  * Ruby methods dealing with encodings return or accept Encoding instances as
1950  * arguments (when a method accepts an Encoding instance as an argument, it
1951  * can be passed an Encoding name or alias instead).
1952  *
1953  * "some string".encoding
1954  * #=> #<Encoding:UTF-8>
1955  *
1956  * string = "some string".encode(Encoding::ISO_8859_1)
1957  * #=> "some string"
1958  * string.encoding
1959  * #=> #<Encoding:ISO-8859-1>
1960  *
1961  * "some string".encode "ISO-8859-1"
1962  * #=> "some string"
1963  *
1964  * Encoding::ASCII_8BIT is a special encoding that is usually used for
1965  * a byte string, not a character string. But as the name suggests, its
1966  * characters in the range of ASCII are considered as ASCII
1967  * characters. This is useful when you use ASCII-8BIT characters with
1968  * other ASCII compatible characters.
1969  *
1970  * == Changing an encoding
1971  *
1972  * The associated Encoding of a String can be changed in two different ways.
1973  *
1974  * First, it is possible to set the Encoding of a string to a new Encoding
1975  * without changing the internal byte representation of the string, with
1976  * String#force_encoding. This is how you can tell Ruby the correct encoding
1977  * of a string.
1978  *
1979  * string
1980  * #=> "R\xC3\xA9sum\xC3\xA9"
1981  * string.encoding
1982  * #=> #<Encoding:ISO-8859-1>
1983  * string.force_encoding(Encoding::UTF_8)
1984  * #=> "R\u00E9sum\u00E9"
1985  *
1986  * Second, it is possible to transcode a string, i.e. translate its internal
1987  * byte representation to another encoding. Its associated encoding is also
1988  * set to the other encoding. See String#encode for the various forms of
1989  * transcoding, and the Encoding::Converter class for additional control over
1990  * the transcoding process.
1991  *
1992  * string
1993  * #=> "R\u00E9sum\u00E9"
1994  * string.encoding
1995  * #=> #<Encoding:UTF-8>
1996  * string = string.encode!(Encoding::ISO_8859_1)
1997  * #=> "R\xE9sum\xE9"
1998  * string.encoding
1999  * #=> #<Encoding:ISO-8859-1>
2000  *
2001  * == Script encoding
2002  *
2003  * All Ruby script code has an associated Encoding which any String literal
2004  * created in the source code will be associated to.
2005  *
2006  * The default script encoding is Encoding::UTF_8 after v2.0, but it
2007  * can be changed by a magic comment on the first line of the source
2008  * code file (or second line, if there is a shebang line on the
2009  * first). The comment must contain the word <code>coding</code> or
2010  * <code>encoding</code>, followed by a colon, space and the Encoding
2011  * name or alias:
2012  *
2013  * # encoding: UTF-8
2014  *
2015  * "some string".encoding
2016  * #=> #<Encoding:UTF-8>
2017  *
2018  * The <code>__ENCODING__</code> keyword returns the script encoding of the file
2019  * which the keyword is written:
2020  *
2021  * # encoding: ISO-8859-1
2022  *
2023  * __ENCODING__
2024  * #=> #<Encoding:ISO-8859-1>
2025  *
2026  * <code>ruby -K</code> will change the default locale encoding, but this is
2027  * not recommended. Ruby source files should declare their script encoding by a
2028  * magic comment even when they only depend on US-ASCII strings or regular
2029  * expressions.
2030  *
2031  * == Locale encoding
2032  *
2033  * The default encoding of the environment. Usually derived from locale.
2034  *
2035  * see Encoding.locale_charmap, Encoding.find('locale')
2036  *
2037  * == Filesystem encoding
2038  *
2039  * The default encoding of strings from the filesystem of the environment.
2040  * This is used for strings of file names or paths.
2041  *
2042  * see Encoding.find('filesystem')
2043  *
2044  * == External encoding
2045  *
2046  * Each IO object has an external encoding which indicates the encoding that
2047  * Ruby will use to read its data. By default Ruby sets the external encoding
2048  * of an IO object to the default external encoding. The default external
2049  * encoding is set by locale encoding or the interpreter <code>-E</code> option.
2050  * Encoding.default_external returns the current value of the external
2051  * encoding.
2052  *
2053  * ENV["LANG"]
2054  * #=> "UTF-8"
2055  * Encoding.default_external
2056  * #=> #<Encoding:UTF-8>
2057  *
2058  * $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
2059  * #<Encoding:ISO-8859-1>
2060  *
2061  * $ LANG=C ruby -e 'p Encoding.default_external'
2062  * #<Encoding:US-ASCII>
2063  *
2064  * The default external encoding may also be set through
2065  * Encoding.default_external=, but you should not do this as strings created
2066  * before and after the change will have inconsistent encodings. Instead use
2067  * <code>ruby -E</code> to invoke ruby with the correct external encoding.
2068  *
2069  * When you know that the actual encoding of the data of an IO object is not
2070  * the default external encoding, you can reset its external encoding with
2071  * IO#set_encoding or set it at IO object creation (see IO.new options).
2072  *
2073  * == Internal encoding
2074  *
2075  * To process the data of an IO object which has an encoding different
2076  * from its external encoding, you can set its internal encoding. Ruby will use
2077  * this internal encoding to transcode the data when it is read from the IO
2078  * object.
2079  *
2080  * Conversely, when data is written to the IO object it is transcoded from the
2081  * internal encoding to the external encoding of the IO object.
2082  *
2083  * The internal encoding of an IO object can be set with
2084  * IO#set_encoding or at IO object creation (see IO.new options).
2085  *
2086  * The internal encoding is optional and when not set, the Ruby default
2087  * internal encoding is used. If not explicitly set this default internal
2088  * encoding is +nil+ meaning that by default, no transcoding occurs.
2089  *
2090  * The default internal encoding can be set with the interpreter option
2091  * <code>-E</code>. Encoding.default_internal returns the current internal
2092  * encoding.
2093  *
2094  * $ ruby -e 'p Encoding.default_internal'
2095  * nil
2096  *
2097  * $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
2098  * Encoding.default_internal]"
2099  * [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
2100  *
2101  * The default internal encoding may also be set through
2102  * Encoding.default_internal=, but you should not do this as strings created
2103  * before and after the change will have inconsistent encodings. Instead use
2104  * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
2105  *
2106  * == IO encoding example
2107  *
2108  * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
2109  * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
2110  *
2111  * string = "R\u00E9sum\u00E9"
2112  *
2113  * open("transcoded.txt", "w:ISO-8859-1") do |io|
2114  * io.write(string)
2115  * end
2116  *
2117  * puts "raw text:"
2118  * p File.binread("transcoded.txt")
2119  * puts
2120  *
2121  * open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
2122  * puts "transcoded text:"
2123  * p io.read
2124  * end
2125  *
2126  * While writing the file, the internal encoding is not specified as it is
2127  * only necessary for reading. While reading the file both the internal and
2128  * external encoding must be specified to obtain the correct result.
2129  *
2130  * $ ruby t.rb
2131  * raw text:
2132  * "R\xE9sum\xE9"
2133  *
2134  * transcoded text:
2135  * "R\u00E9sum\u00E9"
2136  *
2137  */
2138 
2139 void
2140 Init_Encoding(void)
2141 {
2142  VALUE list;
2143  int i;
2144 
2145  rb_cEncoding = rb_define_class("Encoding", rb_cObject);
2146  rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
2148  rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
2149  rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
2150  rb_define_method(rb_cEncoding, "name", enc_name, 0);
2151  rb_define_method(rb_cEncoding, "names", enc_names, 0);
2152  rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
2153  rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
2154  rb_define_method(rb_cEncoding, "replicate", enc_replicate_m, 1);
2155  rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
2156  rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
2157  rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
2158  rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
2159  rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
2160 
2161  rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
2162  rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
2163 
2164  rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
2165  rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
2166  rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
2167  rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
2168  rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */
2169 
2170  struct enc_table *enc_table = &global_enc_table;
2171 
2172  if (DEFAULT_ENCODING_LIST_CAPA < enc_table->count) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small");
2173 
2174  list = rb_additional_encoding_list = rb_ary_new();
2175  RBASIC_CLEAR_CLASS(list);
2177 
2178  list = rb_default_encoding_list = rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA);
2179  RBASIC_CLEAR_CLASS(list);
2181 
2182  for (i = 0; i < enc_table->count; ++i) {
2183  rb_ary_push(list, enc_new(enc_table->list[i].enc));
2184  }
2185 
2186  rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader);
2187 }
2188 
2189 void
2190 Init_encodings(void)
2191 {
2192  rb_enc_init(&global_enc_table);
2193 }
2194 
2195 /* locale insensitive ctype functions */
2196 
2197 void
2198 rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
2199 {
2200  GLOBAL_ENC_TABLE_EVAL(enc_table, st_foreach(enc_table->names, func, arg));
2201 }
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
int rb_enc_tolower(int c, rb_encoding *enc)
Identical to rb_tolower(), except it additionally takes an encoding.
Definition: encoding.c:1300
int rb_enc_toupper(int c, rb_encoding *enc)
Identical to rb_toupper(), except it additionally takes an encoding.
Definition: encoding.c:1294
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1182
@ RUBY_FL_SHAREABLE
This flag has something to do with Ractor.
Definition: fl_type.h:298
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:837
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:1938
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:1914
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:105
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define T_FILE
Old name of RUBY_T_FILE.
Definition: value_type.h:62
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition: memory.h:397
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ISUPPER
Old name of rb_isupper.
Definition: ctype.h:89
#define SPECIAL_CONST_P
Old name of RB_SPECIAL_CONST_P.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:31
#define T_DATA
Old name of RUBY_T_DATA.
Definition: value_type.h:60
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ISLOWER
Old name of rb_islower.
Definition: ctype.h:90
#define ASSUME
Old name of RBIMPL_ASSUME.
Definition: assume.h:29
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:66
#define STRCASECMP
Old name of st_locale_insensitive_strcasecmp.
Definition: ctype.h:102
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define INT2NUM
Old name of RB_INT2NUM.
Definition: int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define T_SYMBOL
Old name of RUBY_T_SYMBOL.
Definition: value_type.h:80
#define ENC_CODERANGE_ASCIIONLY(obj)
Old name of RB_ENC_CODERANGE_ASCIIONLY.
Definition: coderange.h:185
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ISALNUM
Old name of rb_isalnum.
Definition: ctype.h:91
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ruby_debug
This variable controls whether the interpreter is in debug mode.
Definition: error.h:470
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3025
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:802
void rb_set_errinfo(VALUE err)
Sets the current exception ($!) to the given value.
Definition: eval.c:1764
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1099
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1106
void rb_warn(const char *fmt,...)
Identical to rb_warning(), except it reports always regardless of runtime -W flag.
Definition: error.c:418
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1100
void rb_loaderror(const char *fmt,...)
Raises an instance of rb_eLoadError.
Definition: error.c:3044
VALUE rb_errinfo(void)
This is the same as $! in Ruby.
Definition: eval.c:1758
VALUE rb_eEncodingError
EncodingError exception.
Definition: error.c:1105
void rb_warning(const char *fmt,...)
Issues a warning.
Definition: error.c:449
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:188
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1161
Encoding-related APIs.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1573
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1637
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
int rb_filesystem_encindex(void)
Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding...
Definition: encoding.c:1579
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1539
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1284
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1192
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
Definition: encoding.c:1533
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1216
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
Definition: encoding.c:1521
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1176
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1270
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:689
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1724
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:197
rb_encoding * rb_find_encoding(VALUE obj)
Identical to rb_to_encoding_index(), except the return type.
Definition: encoding.c:336
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C string instead of a Ruby object.
Definition: encoding.c:918
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:1030
VALUE rb_locale_charmap(VALUE klass)
Returns a platform-dependent "charmap" of the current locale.
Definition: localeinit.c:91
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. BINARY.
Definition: encoding.c:1515
void rb_enc_set_default_internal(VALUE encoding)
Destructively assigns the passed encoding as the default internal encoding.
Definition: encoding.c:1774
VALUE rb_enc_default_external(void)
Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1651
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1097
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
int rb_locale_encindex(void)
Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding its...
Definition: encoding.c:1553
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1222
int rb_enc_capable(VALUE obj)
Queries if the passed object can have its encoding.
Definition: encoding.c:943
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:1038
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
int rb_enc_replicate(const char *name, rb_encoding *src)
Creates a new encoding, using the passed one as a template.
Definition: encoding.c:550
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
void rb_enc_set_default_external(VALUE encoding)
Destructively assigns the passed encoding as the default external encoding.
Definition: encoding.c:1691
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:448
int rb_enc_alias(const char *alias, const char *orig)
Registers an "alias" name.
Definition: encoding.c:721
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1246
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition: encoding.c:1545
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1592
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:776
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:790
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1206
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1061
void rb_gc_register_mark_object(VALUE object)
Inform the garbage collector that object is a live Ruby object that should not be moved.
Definition: gc.c:8687
VALUE rb_ary_concat(VALUE lhs, VALUE rhs)
Destructively appends the contents of latter into the end of former.
Definition: array.c:4790
VALUE rb_ary_replace(VALUE copy, VALUE orig)
Replaces the contents of the former object with the contents of the latter.
Definition: array.c:4415
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:750
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
Definition: array.c:1308
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
Definition: array.c:1679
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
Definition: array.c:1148
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition: error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:294
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2903
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1529
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2659
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get()
Definition: variable.c:1293
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1575
VALUE rb_ivar_get(VALUE obj, ID name)
Identical to rb_iv_get(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1285
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:782
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:924
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition: variable.c:3253
#define strdup(s)
Just another name of ruby_strdup.
Definition: util.h:176
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition: sprintf.c:1201
void rb_marshal_define_compat(VALUE newclass, VALUE oldclass, VALUE(*dumper)(VALUE), VALUE(*loader)(VALUE, VALUE))
Marshal format compatibility layer.
Definition: marshal.c:148
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
#define RDATA(obj)
Convenient casting macro.
Definition: rdata.h:63
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:527
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:497
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:483
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:190
Definition: encoding.c:63
Definition: st.h:79
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:375