Ruby  3.1.4p223 (2023-03-30 revision HEAD)
encoding.h
Go to the documentation of this file.
1 #ifndef RUBY_INTERNAL_ENCODING_ENCODING_H /*-*-C++-*-vi:se ft=cpp:*/
2 #define RUBY_INTERNAL_ENCODING_ENCODING_H
24 #include "ruby/oniguruma.h"
31 #include "ruby/internal/value.h"
33 #include "ruby/internal/fl_type.h"
34 
36 
37 
43 
/*
 * Constants describing how an encoding index is stored inline in an
 * object's flag word.
 */
50 enum ruby_encoding_consts {
51 
/* Largest value the inline field can hold.  The getter at listing lines
 * 210-221 treats an inline value equal to this as "not stored inline" and
 * falls back to rb_enc_get_index(). */
53  RUBY_ENCODING_INLINE_MAX = 127,
54 
/* Bit offset of the inline field: ten bits above RUBY_FL_USHIFT. */
56  RUBY_ENCODING_SHIFT = (RUBY_FL_USHIFT+10),
57 
/* Mask selecting the inline field (RUBY_ENCODING_INLINE_MAX shifted into
 * position). */
59  RUBY_ENCODING_MASK = (RUBY_ENCODING_INLINE_MAX<<RUBY_ENCODING_SHIFT
60  /* RUBY_FL_USER10..RUBY_FL_USER16 */),
61 
/* NOTE(review): presumably the maximum length of an encoding name; confirm
 * against the upstream header comment. */
63  RUBY_ENCODING_MAXNAMELEN = 42
64 };
65 
66 #define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX
67 #define ENCODING_SHIFT RUBY_ENCODING_SHIFT
68 #define ENCODING_MASK RUBY_ENCODING_MASK
/*
 * Destructively stores `encindex` in the inline encoding bits of `obj`'s
 * flags.
 *
 * NOTE(review): `encindex` is shifted without being masked or range-checked
 * here, so a value outside 0..RUBY_ENCODING_INLINE_MAX would set bits outside
 * RUBY_ENCODING_MASK -- callers are presumably expected to guarantee the
 * range; confirm.
 */
79 static inline void
80 RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
81 {
82  VALUE f = /* upcast */ encindex;
83 
84  f <<= RUBY_ENCODING_SHIFT;
/* Clear the previously stored index before setting the new bits. */
85  RB_FL_UNSET_RAW(obj, RUBY_ENCODING_MASK);
86  RB_FL_SET_RAW(obj, f);
87 }
88 
97 static inline int
99 {
100  VALUE ret = RB_FL_TEST_RAW(obj, RUBY_ENCODING_MASK) >> RUBY_ENCODING_SHIFT;
101 
102  return RBIMPL_CAST((int)ret);
103 }
104 
105 #define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i)
106 #define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i)
107 #define ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj)
108 #define ENCODING_GET(obj) RB_ENCODING_GET(obj)
109 #define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj)
110 #define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN
117 
139 int rb_char_to_option_kcode(int c, int *option, int *kcode);
140 
156 int rb_enc_replicate(const char *name, rb_encoding *src);
157 
169 int rb_define_dummy_encoding(const char *name);
170 
179 int rb_enc_dummy_p(rb_encoding *enc);
180 
191 int rb_enc_to_index(rb_encoding *enc);
192 
200 int rb_enc_get_index(VALUE obj);
201 
210 static inline int
212 {
213  int encindex = RB_ENCODING_GET_INLINED(obj);
214 
215  if (encindex == RUBY_ENCODING_INLINE_MAX) {
216  return rb_enc_get_index(obj);
217  }
218  else {
219  return encindex;
220  }
221 }
222 
233 void rb_enc_set_index(VALUE obj, int encindex);
234 
/*
 * Destructively assigns the encoding identified by `encindex` to `obj`.
 * This is a thin wrapper: all work is delegated to rb_enc_set_index().
 */
236 static inline void
237 RB_ENCODING_SET(VALUE obj, int encindex)
238 {
239  rb_enc_set_index(obj, encindex);
240 }
241 
253 static inline void
255 {
256  RB_ENCODING_SET(obj, encindex);
257  RB_ENC_CODERANGE_SET(obj, cr);
258 }
259 
268 int rb_enc_capable(VALUE obj);
269 
278 int rb_enc_find_index(const char *name);
279 
293 int rb_enc_alias(const char *alias, const char *orig);
294 
303 int rb_to_encoding_index(VALUE obj);
304 
315 
325 
334 
348 
359 rb_encoding *rb_enc_check(VALUE str1,VALUE str2);
360 
375 VALUE rb_enc_associate_index(VALUE obj, int encindex);
376 
389 
403 void rb_enc_copy(VALUE dst, VALUE src);
404 
405 
415 
424 rb_encoding *rb_enc_find(const char *name);
425 
432 static inline const char *
434 {
435  return enc->name;
436 }
437 
447 static inline int
449 {
450  return enc->min_enc_len;
451 }
452 
462 static inline int
464 {
465  return enc->max_enc_len;
466 }
467 
484 int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc);
485 
502 int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc);
503 
530 int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
531 
532 #define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret)
533 #define MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret)
534 #define MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret)
535 #define MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret)
536 #define MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret)
552 int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc);
553 
566 unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc);
567 
/*
 * Queries the code point of the character at `p` (with `e` passed through as
 * the end pointer) in encoding `enc`.
 *
 * Delegates to rb_enc_codepoint_len() with a null `len` argument, i.e. the
 * character's length is not requested.
 */
586 static inline unsigned int
587 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
588 {
589  return rb_enc_codepoint_len(p, e, 0, enc);
590  /* ^^^
591  * This can be `NULL` in C, `nullptr` in C++, and `0` for both.
592  * We choose the most portable one here.
593  */
594 }
595 
596 
/*
 * Converts the multibyte character in [`p`, `e`) to its code point via
 * ONIGENC_MBC_TO_CODE().
 *
 * NOTE(review): the reference brief describes this as rb_enc_codepoint()
 * minus validation ("assumes the passed character is not broken"); nothing
 * in this body validates the input.
 */
606 static inline OnigCodePoint
607 rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
608 {
/* The Onigmo macro takes OnigUChar pointers, hence the re-typing. */
609  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
610  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
611 
612  return ONIGENC_MBC_TO_CODE(enc, up, ue);
613 }
614 
624 int rb_enc_codelen(int code, rb_encoding *enc);
625 
634 static inline int
636 {
637  OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
638 
639  return ONIGENC_CODE_TO_MBCLEN(enc, uc);
640 }
641 
/*
 * Encodes code point `c` in encoding `enc` and writes the result into `buf`
 * via ONIGENC_CODE_TO_MBC().
 *
 * Returns whatever ONIGENC_CODE_TO_MBC() yields.
 * NOTE(review): presumably the number of bytes written; `buf` must be large
 * enough for the encoded character, as no size is passed -- confirm.
 */
656 static inline int
657 rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
658 {
659  OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
660  OnigUChar *ubuf = RBIMPL_CAST((OnigUChar *)buf);
661 
662  return ONIGENC_CODE_TO_MBC(enc, uc, ubuf);
663 }
664 
/*
 * Queries the previous (left) character relative to `p`, by delegating to
 * onigenc_get_prev_char_head() with `s`, `p` and `e` re-typed as OnigUChar
 * pointers.
 *
 * NOTE(review): `s` is presumably the start of the string and `e` its end;
 * confirm.  The result is returned as a non-const `char *` even though all
 * inputs are const-qualified.
 */
675 static inline char *
676 rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
677 {
678  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
679  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
680  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
681  OnigUChar *ur = onigenc_get_prev_char_head(enc, us, up, ue);
682 
683  return RBIMPL_CAST((char *)ur);
684 }
685 
/*
 * Queries the left boundary of the character around `p`, by delegating to
 * onigenc_get_left_adjust_char_head() with `s`, `p` and `e` re-typed as
 * OnigUChar pointers.
 *
 * NOTE(review): `s` is presumably the start of the string and `e` its end;
 * confirm.  The result is returned as a non-const `char *` even though all
 * inputs are const-qualified.
 */
696 static inline char *
697 rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
698 {
699  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
700  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
701  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
702  OnigUChar *ur = onigenc_get_left_adjust_char_head(enc, us, up, ue);
703 
704  return RBIMPL_CAST((char *)ur);
705 }
706 
/*
 * Queries the right boundary of the character around `p`, by delegating to
 * onigenc_get_right_adjust_char_head() with `s`, `p` and `e` re-typed as
 * OnigUChar pointers.
 *
 * NOTE(review): `s` is presumably the start of the string and `e` its end;
 * confirm.  The result is returned as a non-const `char *` even though all
 * inputs are const-qualified.
 */
717 static inline char *
718 rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
719 {
720  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
721  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
722  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
723  OnigUChar *ur = onigenc_get_right_adjust_char_head(enc, us, up, ue);
724 
725  return RBIMPL_CAST((char *)ur);
726 }
727 
/*
 * Scans the string backwards from `p` for `n` characters, by delegating to
 * onigenc_step_back() with `s`, `p` and `e` re-typed as OnigUChar pointers.
 *
 * NOTE(review): `s` is presumably the start of the string and `e` its end;
 * what is returned when fewer than `n` characters precede `p` is decided by
 * onigenc_step_back() and is not visible here -- confirm.
 */
739 static inline char *
740 rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
741 {
742  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
743  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
744  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
745  const OnigUChar *ur = onigenc_step_back(enc, us, up, ue, n);
746 
/* The cast below also drops the const qualifier from the result. */
747  return RBIMPL_CAST((char *)ur);
748 }
749 
/*
 * Returns non-zero when `enc` has a minimum character length of one byte and
 * is not a dummy encoding, zero otherwise.
 *
 * This is the same predicate as the `bool`-returning function at listing
 * lines 781-793, expressed as a single `int` expression.
 */
760 static inline int
761 rb_enc_asciicompat_inline(rb_encoding *enc)
762 {
763  return rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc);
764 }
765 
781 static inline bool
783 {
784  if (rb_enc_mbminlen(enc) != 1) {
785  return false;
786  }
787  else if (rb_enc_dummy_p(enc)) {
788  return false;
789  }
790  else {
791  return true;
792  }
793 }
794 
802 static inline bool
804 {
805  rb_encoding *enc = rb_enc_get(str);
806 
807  return rb_enc_asciicompat(enc);
808 }
809 
819 
835 int rb_enc_unicode_p(rb_encoding *enc);
836 
848 
860 
872 
886 
897 
906 
915 
916 #ifndef rb_ascii8bit_encindex
928 int rb_ascii8bit_encindex(void);
929 #endif
930 
940 static inline bool
942 {
944 }
945 
946 #ifndef rb_utf8_encindex
954 int rb_utf8_encindex(void);
955 #endif
956 
957 #ifndef rb_usascii_encindex
965 int rb_usascii_encindex(void);
966 #endif
967 
974 int rb_locale_encindex(void);
975 
982 int rb_filesystem_encindex(void);
983 
992 
1001 
1011 void rb_enc_set_default_external(VALUE encoding);
1012 
1022 void rb_enc_set_default_internal(VALUE encoding);
1023 
1034 
1036 
1037 
1038 #define RB_ENCODING_GET RB_ENCODING_GET
1039 #define RB_ENCODING_GET_INLINED RB_ENCODING_GET_INLINED
1040 #define RB_ENCODING_IS_ASCII8BIT RB_ENCODING_IS_ASCII8BIT
1041 #define RB_ENCODING_SET RB_ENCODING_SET
1042 #define RB_ENCODING_SET_INLINED RB_ENCODING_SET_INLINED
1043 #define rb_enc_asciicompat rb_enc_asciicompat
1044 #define rb_enc_code_to_mbclen rb_enc_code_to_mbclen
1045 #define rb_enc_codepoint rb_enc_codepoint
1046 #define rb_enc_left_char_head rb_enc_left_char_head
1047 #define rb_enc_mbc_to_codepoint rb_enc_mbc_to_codepoint
1048 #define rb_enc_mbcput rb_enc_mbcput
1049 #define rb_enc_mbmaxlen rb_enc_mbmaxlen
1050 #define rb_enc_mbminlen rb_enc_mbminlen
1051 #define rb_enc_name rb_enc_name
1052 #define rb_enc_prev_char rb_enc_prev_char
1053 #define rb_enc_right_char_head rb_enc_right_char_head
1054 #define rb_enc_step_back rb_enc_step_back
1055 #define rb_enc_str_asciicompat_p rb_enc_str_asciicompat_p
1058 #endif /* RUBY_INTERNAL_ENCODING_ENCODING_H */
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition: coderange.h:33
static void RB_ENC_CODERANGE_SET(VALUE obj, enum ruby_coderange_type cr)
Destructively modifies the passed object so that its (inline) code range is the passed one.
Definition: coderange.h:129
Defines RBIMPL_ATTR_CONST.
Defines RBIMPL_ATTR_DEPRECATED.
Tweaking visibility of C variables/functions.
#define RUBY_EXTERN
Declaration of externally visible global variables.
Definition: dllexport.h:47
#define RBIMPL_SYMBOL_EXPORT_END()
Counterpart of RBIMPL_SYMBOL_EXPORT_BEGIN.
Definition: dllexport.h:106
#define RBIMPL_SYMBOL_EXPORT_BEGIN()
Shortcut macro equivalent to RUBY_SYMBOL_EXPORT_BEGIN extern "C" {.
Definition: dllexport.h:97
Defines enum ruby_fl_type.
@ RUBY_FL_USHIFT
Number of bits in ruby_fl_type that are not open to users.
Definition: fl_type.h:167
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:507
static void RB_FL_SET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_SET().
Definition: fl_type.h:644
static void RB_FL_UNSET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_UNSET().
Definition: fl_type.h:704
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1573
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1637
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
int rb_filesystem_encindex(void)
Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding...
Definition: encoding.c:1579
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1539
static void RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
Destructively assigns the passed encoding to the passed object.
Definition: encoding.h:80
static bool RB_ENCODING_IS_ASCII8BIT(VALUE obj)
Queries if the passed object is in ascii 8bit (== binary) encoding.
Definition: encoding.h:941
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes requested to represent the passed code point using the passed encoding.
Definition: encoding.c:1284
const OnigEncodingType rb_encoding
The type of encoding.
Definition: encoding.h:116
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:697
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1192
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
Definition: encoding.c:1533
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1216
static int RB_ENCODING_GET(VALUE obj)
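Queries the encoding index of the passed object: reads the inline value first and falls back to rb_enc_get_index() when that value equals RUBY_ENCODING_INLINE_MAX.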
Definition: encoding.h:211
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding ...
Definition: encoding.c:1521
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1176
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1270
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:689
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1724
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:197
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:718
rb_encoding * rb_find_encoding(VALUE obj)
Identical to rb_to_encoding_index(), except the return type.
Definition: encoding.c:336
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.
Definition: encoding.c:918
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:1030
VALUE rb_locale_charmap(VALUE klass)
Returns a platform-dependent "charmap" of the current locale.
Definition: localeinit.c:91
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition: encoding.c:1515
void rb_enc_set_default_internal(VALUE encoding)
Destructively assigns the passed encoding as the default internal encoding.
Definition: encoding.c:1774
VALUE rb_enc_default_external(void)
Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1651
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1097
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:676
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:657
int rb_locale_encindex(void)
Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding its...
Definition: encoding.c:1553
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Converts a character option to its encoding.
Definition: re.c:329
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:463
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1222
static void RB_ENCODING_SET(VALUE obj, int encindex)
Just another name of rb_enc_set_index.
Definition: encoding.h:237
int rb_enc_capable(VALUE obj)
Queries if the passed object can have its encoding.
Definition: encoding.c:943
static void RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr)
This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.
Definition: encoding.h:254
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:1038
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
int rb_enc_replicate(const char *name, rb_encoding *src)
Creates a new encoding, using the passed one as a template.
Definition: encoding.c:550
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:98
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
void rb_enc_set_default_external(VALUE encoding)
Destructively assigns the passed encoding as the default external encoding.
Definition: encoding.c:1691
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:740
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:448
int rb_enc_alias(const char *alias, const char *orig)
Registers an "alias" name.
Definition: encoding.c:721
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:635
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1246
static bool rb_enc_str_asciicompat_p(VALUE str)
Queries if the passed string is in an ASCII-compatible encoding.
Definition: encoding.h:803
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding it...
Definition: encoding.c:1545
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1592
RBIMPL_ATTR_CONST() int rb_io_oflags_fmode(int oflags)
Converts an oflags (that rb_io_modestr_oflags() returns) to a fmode (that rb_io_mode_flags() returns)...
RBIMPL_ATTR_PURE() int rb_io_read_pending(rb_io_t *fptr)
Queries if the passed IO has any pending reads.
Defines RBIMPL_ATTR_NOALIAS.
#define RBIMPL_ATTR_NOALIAS()
Wraps (or simulates) __declspec(noalias)
Definition: noalias.h:62
Defines RBIMPL_ATTR_PURE.
Defines struct RBasic.
Defines RBIMPL_ATTR_RETURNS_NONNULL.
#define RBIMPL_ATTR_RETURNS_NONNULL()
Wraps (or simulates) __attribute__((returns_nonnull))
Defines VALUE and ID.
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40