Ruby  3.1.4p223 (2023-03-30 revision HEAD)
transcode.c
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author$
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/internal/config.h"
13 
14 #include <ctype.h>
15 
16 #include "internal.h"
17 #include "internal/array.h"
18 #include "internal/inits.h"
19 #include "internal/object.h"
20 #include "internal/string.h"
21 #include "internal/transcode.h"
22 #include "ruby/encoding.h"
23 
24 #include "transcode_data.h"
25 #include "id.h"
26 
27 #define ENABLE_ECONV_NEWLINE_OPTION 1
28 
29 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
30 static VALUE rb_eUndefinedConversionError;
31 static VALUE rb_eInvalidByteSequenceError;
32 static VALUE rb_eConverterNotFoundError;
33 
34 VALUE rb_cEncodingConverter;
35 
36 static ID id_destination_encoding;
37 static ID id_destination_encoding_name;
38 static ID id_error_bytes;
39 static ID id_error_char;
40 static ID id_incomplete_input;
41 static ID id_readagain_bytes;
42 static ID id_source_encoding;
43 static ID id_source_encoding_name;
44 
45 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
46 static VALUE sym_xml, sym_text, sym_attr;
47 static VALUE sym_universal_newline;
48 static VALUE sym_crlf_newline;
49 static VALUE sym_cr_newline;
50 #ifdef ENABLE_ECONV_NEWLINE_OPTION
51 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
52 #endif
53 static VALUE sym_partial_input;
54 
55 static VALUE sym_invalid_byte_sequence;
56 static VALUE sym_undefined_conversion;
57 static VALUE sym_destination_buffer_full;
58 static VALUE sym_source_buffer_empty;
59 static VALUE sym_finished;
60 static VALUE sym_after_output;
61 static VALUE sym_incomplete_input;
62 
63 static unsigned char *
64 allocate_converted_string(const char *sname, const char *dname,
65  const unsigned char *str, size_t len,
66  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
67  size_t *dst_len_ptr);
68 
69 /* dynamic structure, one per conversion (similar to iconv_t) */
70 /* may carry conversion state (e.g. for iso-2022-jp) */
71 typedef struct rb_transcoding {
72  const rb_transcoder *transcoder;
73 
74  int flags;
75 
76  int resume_position;
77  unsigned int next_table;
78  VALUE next_info;
79  unsigned char next_byte;
80  unsigned int output_index;
81 
82  ssize_t recognized_len; /* already interpreted */
83  ssize_t readagain_len; /* not yet interpreted */
84  union {
85  unsigned char ary[8]; /* max_input <= sizeof(ary) */
86  unsigned char *ptr; /* length: max_input */
87  } readbuf; /* recognized_len + readagain_len used */
88 
89  ssize_t writebuf_off;
90  ssize_t writebuf_len;
91  union {
92  unsigned char ary[8]; /* max_output <= sizeof(ary) */
93  unsigned char *ptr; /* length: max_output */
94  } writebuf;
95 
96  union rb_transcoding_state_t { /* opaque data for stateful encoding */
97  void *ptr;
98  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
99  double dummy_for_alignment;
100  } state;
102 #define TRANSCODING_READBUF(tc) \
103  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
104  (tc)->readbuf.ary : \
105  (tc)->readbuf.ptr)
106 #define TRANSCODING_WRITEBUF(tc) \
107  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
108  (tc)->writebuf.ary : \
109  (tc)->writebuf.ptr)
110 #define TRANSCODING_WRITEBUF_SIZE(tc) \
111  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
112  sizeof((tc)->writebuf.ary) : \
113  (size_t)(tc)->transcoder->max_output)
114 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
115 #define TRANSCODING_STATE(tc) \
116  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
117  (tc)->state.ary : \
118  (tc)->state.ptr)
119 
120 typedef struct {
121  struct rb_transcoding *tc;
122  unsigned char *out_buf_start;
123  unsigned char *out_data_start;
124  unsigned char *out_data_end;
125  unsigned char *out_buf_end;
126  rb_econv_result_t last_result;
128 
129 struct rb_econv_t {
130  int flags;
131  int started; /* bool */
132 
133  const char *source_encoding_name;
134  const char *destination_encoding_name;
135 
136  const unsigned char *replacement_str;
137  size_t replacement_len;
138  const char *replacement_enc;
139 
140  unsigned char *in_buf_start;
141  unsigned char *in_data_start;
142  unsigned char *in_data_end;
143  unsigned char *in_buf_end;
144  rb_econv_elem_t *elems;
145  int replacement_allocated; /* bool */
146  int num_allocated;
147  int num_trans;
148  int num_finished;
149  struct rb_transcoding *last_tc;
150 
151  /* last error */
152  struct {
153  rb_econv_result_t result;
154  struct rb_transcoding *error_tc;
155  const char *source_encoding;
156  const char *destination_encoding;
157  const unsigned char *error_bytes_start;
158  size_t error_bytes_len;
159  size_t readagain_len;
160  } last_error;
161 
162  /* The following fields are only for Encoding::Converter.
163  * rb_econv_open set them NULL. */
164  rb_encoding *source_encoding;
165  rb_encoding *destination_encoding;
166 };
167 
168 /*
169  * Dispatch data and logic
170  */
171 
172 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
173 
174 typedef struct {
175  const char *sname;
176  const char *dname;
177  const char *lib; /* null means no need to load a library */
178  const rb_transcoder *transcoder;
180 
181 static st_table *transcoder_table;
182 
183 static transcoder_entry_t *
184 make_transcoder_entry(const char *sname, const char *dname)
185 {
186  st_data_t val;
187  st_table *table2;
188 
189  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
190  val = (st_data_t)st_init_strcasetable();
191  st_add_direct(transcoder_table, (st_data_t)sname, val);
192  }
193  table2 = (st_table *)val;
194  if (!st_lookup(table2, (st_data_t)dname, &val)) {
196  entry->sname = sname;
197  entry->dname = dname;
198  entry->lib = NULL;
199  entry->transcoder = NULL;
200  val = (st_data_t)entry;
201  st_add_direct(table2, (st_data_t)dname, val);
202  }
203  return (transcoder_entry_t *)val;
204 }
205 
206 static transcoder_entry_t *
207 get_transcoder_entry(const char *sname, const char *dname)
208 {
209  st_data_t val;
210  st_table *table2;
211 
212  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
213  return NULL;
214  }
215  table2 = (st_table *)val;
216  if (!st_lookup(table2, (st_data_t)dname, &val)) {
217  return NULL;
218  }
219  return (transcoder_entry_t *)val;
220 }
221 
222 void
223 rb_register_transcoder(const rb_transcoder *tr)
224 {
225  const char *const sname = tr->src_encoding;
226  const char *const dname = tr->dst_encoding;
227 
228  transcoder_entry_t *entry;
229 
230  entry = make_transcoder_entry(sname, dname);
231  if (entry->transcoder) {
232  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
233  sname, dname);
234  }
235 
236  entry->transcoder = tr;
237 }
238 
239 static void
240 declare_transcoder(const char *sname, const char *dname, const char *lib)
241 {
242  transcoder_entry_t *entry;
243 
244  entry = make_transcoder_entry(sname, dname);
245  entry->lib = lib;
246 }
247 
248 static const char transcoder_lib_prefix[] = "enc/trans/";
249 
250 void
251 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
252 {
253  if (!lib) {
254  rb_raise(rb_eArgError, "invalid library name - (null)");
255  }
256  declare_transcoder(enc1, enc2, lib);
257 }
258 
259 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
260 
261 typedef struct search_path_queue_tag {
262  struct search_path_queue_tag *next;
263  const char *enc;
265 
266 typedef struct {
267  st_table *visited;
268  search_path_queue_t *queue;
269  search_path_queue_t **queue_last_ptr;
270  const char *base_enc;
272 
273 static int
274 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
275 {
276  const char *dname = (const char *)key;
277  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
279 
280  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
281  return ST_CONTINUE;
282  }
283 
285  q->enc = dname;
286  q->next = NULL;
287  *bfs->queue_last_ptr = q;
288  bfs->queue_last_ptr = &q->next;
289 
290  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
291  return ST_CONTINUE;
292 }
293 
294 static int
295 transcode_search_path(const char *sname, const char *dname,
296  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
297  void *arg)
298 {
299  search_path_bfs_t bfs;
301  st_data_t val;
302  st_table *table2;
303  int found;
304  int pathlen = -1;
305 
306  if (encoding_equal(sname, dname))
307  return -1;
308 
310  q->enc = sname;
311  q->next = NULL;
312  bfs.queue_last_ptr = &q->next;
313  bfs.queue = q;
314 
315  bfs.visited = st_init_strcasetable();
316  st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
317 
318  while (bfs.queue) {
319  q = bfs.queue;
320  bfs.queue = q->next;
321  if (!bfs.queue)
322  bfs.queue_last_ptr = &bfs.queue;
323 
324  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
325  xfree(q);
326  continue;
327  }
328  table2 = (st_table *)val;
329 
330  if (st_lookup(table2, (st_data_t)dname, &val)) {
331  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
332  xfree(q);
333  found = 1;
334  goto cleanup;
335  }
336 
337  bfs.base_enc = q->enc;
338  st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
339  bfs.base_enc = NULL;
340 
341  xfree(q);
342  }
343  found = 0;
344 
345  cleanup:
346  while (bfs.queue) {
347  q = bfs.queue;
348  bfs.queue = q->next;
349  xfree(q);
350  }
351 
352  if (found) {
353  const char *enc = dname;
354  int depth;
355  pathlen = 0;
356  while (1) {
357  st_lookup(bfs.visited, (st_data_t)enc, &val);
358  if (!val)
359  break;
360  pathlen++;
361  enc = (const char *)val;
362  }
363  depth = pathlen;
364  enc = dname;
365  while (1) {
366  st_lookup(bfs.visited, (st_data_t)enc, &val);
367  if (!val)
368  break;
369  callback((const char *)val, enc, --depth, arg);
370  enc = (const char *)val;
371  }
372  }
373 
374  st_free_table(bfs.visited);
375 
376  return pathlen; /* is -1 if not found */
377 }
378 
379 int rb_require_internal_silent(VALUE fname);
380 
381 static const rb_transcoder *
382 load_transcoder_entry(transcoder_entry_t *entry)
383 {
384  if (entry->transcoder)
385  return entry->transcoder;
386 
387  if (entry->lib) {
388  const char *const lib = entry->lib;
389  const size_t len = strlen(lib);
390  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
391  const VALUE fn = rb_str_new(0, total_len);
392  char *const path = RSTRING_PTR(fn);
393 
394  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
395  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
396  rb_str_set_len(fn, total_len);
397  OBJ_FREEZE(fn);
398  rb_require_internal_silent(fn);
399  }
400 
401  if (entry->transcoder)
402  return entry->transcoder;
403 
404  return NULL;
405 }
406 
/*
 * Choose the default replacement text for encoding `encname`.
 * For "UTF-8" (compared case-insensitively) this is the 3-byte sequence
 * EF BF BD, tagged "UTF-8"; for anything else it is "?", tagged "US-ASCII".
 * The byte length goes to *len_ret and the tag to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
421 
422 /*
423  * Transcoding engine logic
424  */
425 
426 static const unsigned char *
427 transcode_char_start(rb_transcoding *tc,
428  const unsigned char *in_start,
429  const unsigned char *inchar_start,
430  const unsigned char *in_p,
431  size_t *char_len_ptr)
432 {
433  const unsigned char *ptr;
434  if (inchar_start - in_start < tc->recognized_len) {
435  MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
436  inchar_start, unsigned char, in_p - inchar_start);
437  ptr = TRANSCODING_READBUF(tc);
438  }
439  else {
440  ptr = inchar_start - tc->recognized_len;
441  }
442  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
443  return ptr;
444 }
445 
446 static rb_econv_result_t
447 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
448  const unsigned char *in_stop, unsigned char *out_stop,
449  rb_transcoding *tc,
450  const int opt)
451 {
452  const rb_transcoder *tr = tc->transcoder;
453  int unitlen = tr->input_unit_length;
454  ssize_t readagain_len = 0;
455 
456  const unsigned char *inchar_start;
457  const unsigned char *in_p;
458 
459  unsigned char *out_p;
460 
461  in_p = inchar_start = *in_pos;
462 
463  out_p = *out_pos;
464 
465 #define SUSPEND(ret, num) \
466  do { \
467  tc->resume_position = (num); \
468  if (0 < in_p - inchar_start) \
469  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
470  inchar_start, unsigned char, in_p - inchar_start); \
471  *in_pos = in_p; \
472  *out_pos = out_p; \
473  tc->recognized_len += in_p - inchar_start; \
474  if (readagain_len) { \
475  tc->recognized_len -= readagain_len; \
476  tc->readagain_len = readagain_len; \
477  } \
478  return (ret); \
479  resume_label ## num:; \
480  } while (0)
481 #define SUSPEND_OBUF(num) \
482  do { \
483  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
484  } while (0)
485 
486 #define SUSPEND_AFTER_OUTPUT(num) \
487  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
488  SUSPEND(econv_after_output, num); \
489  }
490 
491 #define next_table (tc->next_table)
492 #define next_info (tc->next_info)
493 #define next_byte (tc->next_byte)
494 #define writebuf_len (tc->writebuf_len)
495 #define writebuf_off (tc->writebuf_off)
496 
497  switch (tc->resume_position) {
498  case 0: break;
499  case 1: goto resume_label1;
500  case 2: goto resume_label2;
501  case 3: goto resume_label3;
502  case 4: goto resume_label4;
503  case 5: goto resume_label5;
504  case 6: goto resume_label6;
505  case 7: goto resume_label7;
506  case 8: goto resume_label8;
507  case 9: goto resume_label9;
508  case 10: goto resume_label10;
509  case 11: goto resume_label11;
510  case 12: goto resume_label12;
511  case 13: goto resume_label13;
512  case 14: goto resume_label14;
513  case 15: goto resume_label15;
514  case 16: goto resume_label16;
515  case 17: goto resume_label17;
516  case 18: goto resume_label18;
517  case 19: goto resume_label19;
518  case 20: goto resume_label20;
519  case 21: goto resume_label21;
520  case 22: goto resume_label22;
521  case 23: goto resume_label23;
522  case 24: goto resume_label24;
523  case 25: goto resume_label25;
524  case 26: goto resume_label26;
525  case 27: goto resume_label27;
526  case 28: goto resume_label28;
527  case 29: goto resume_label29;
528  case 30: goto resume_label30;
529  case 31: goto resume_label31;
530  case 32: goto resume_label32;
531  case 33: goto resume_label33;
532  case 34: goto resume_label34;
533  }
534 
535  while (1) {
536  inchar_start = in_p;
537  tc->recognized_len = 0;
538  next_table = tr->conv_tree_start;
539 
540  SUSPEND_AFTER_OUTPUT(24);
541 
542  if (in_stop <= in_p) {
543  if (!(opt & ECONV_PARTIAL_INPUT))
544  break;
545  SUSPEND(econv_source_buffer_empty, 7);
546  continue;
547  }
548 
549 #define BYTE_ADDR(index) (tr->byte_array + (index))
550 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
551 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
552 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
553 #define BL_MIN_BYTE (BL_BASE[0])
554 #define BL_MAX_BYTE (BL_BASE[1])
555 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
556 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
557 
558  next_byte = (unsigned char)*in_p++;
559  follow_byte:
560  if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
561  next_info = INVALID;
562  else {
563  next_info = (VALUE)BL_ACTION(next_byte);
564  }
565  follow_info:
566  switch (next_info & 0x1F) {
567  case NOMAP:
568  {
569  const unsigned char *p = inchar_start;
570  writebuf_off = 0;
571  while (p < in_p) {
572  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
573  }
574  writebuf_len = writebuf_off;
575  writebuf_off = 0;
576  while (writebuf_off < writebuf_len) {
577  SUSPEND_OBUF(3);
578  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
579  }
580  }
581  continue;
582  case 0x00: case 0x04: case 0x08: case 0x0C:
583  case 0x10: case 0x14: case 0x18: case 0x1C:
584  SUSPEND_AFTER_OUTPUT(25);
585  while (in_p >= in_stop) {
586  if (!(opt & ECONV_PARTIAL_INPUT))
587  goto incomplete;
588  SUSPEND(econv_source_buffer_empty, 5);
589  }
590  next_byte = (unsigned char)*in_p++;
591  next_table = (unsigned int)next_info;
592  goto follow_byte;
593  case ZERObt: /* drop input */
594  continue;
595  case ONEbt:
596  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
597  continue;
598  case TWObt:
599  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
600  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
601  continue;
602  case THREEbt:
603  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
604  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
605  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
606  continue;
607  case FOURbt:
608  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
609  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
610  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
611  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
612  continue;
613  case GB4bt:
614  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
615  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
616  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
617  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
618  continue;
619  case STR1:
620  tc->output_index = 0;
621  while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
622  SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
623  tc->output_index++;
624  }
625  continue;
626  case FUNii:
627  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
628  goto follow_info;
629  case FUNsi:
630  {
631  const unsigned char *char_start;
632  size_t char_len;
633  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
634  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
635  goto follow_info;
636  }
637  case FUNio:
638  SUSPEND_OBUF(13);
639  if (tr->max_output <= out_stop - out_p)
640  out_p += tr->func_io(TRANSCODING_STATE(tc),
641  next_info, out_p, out_stop - out_p);
642  else {
643  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
644  next_info,
645  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
646  writebuf_off = 0;
647  while (writebuf_off < writebuf_len) {
648  SUSPEND_OBUF(20);
649  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
650  }
651  }
652  break;
653  case FUNso:
654  {
655  const unsigned char *char_start;
656  size_t char_len;
657  SUSPEND_OBUF(14);
658  if (tr->max_output <= out_stop - out_p) {
659  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
660  out_p += tr->func_so(TRANSCODING_STATE(tc),
661  char_start, (size_t)char_len,
662  out_p, out_stop - out_p);
663  }
664  else {
665  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
667  char_start, (size_t)char_len,
668  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
669  writebuf_off = 0;
670  while (writebuf_off < writebuf_len) {
671  SUSPEND_OBUF(22);
672  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
673  }
674  }
675  break;
676  }
677  case FUNsio:
678  {
679  const unsigned char *char_start;
680  size_t char_len;
681  SUSPEND_OBUF(33);
682  if (tr->max_output <= out_stop - out_p) {
683  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
684  out_p += tr->func_sio(TRANSCODING_STATE(tc),
685  char_start, (size_t)char_len, next_info,
686  out_p, out_stop - out_p);
687  }
688  else {
689  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
690  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
691  char_start, (size_t)char_len, next_info,
692  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
693  writebuf_off = 0;
694  while (writebuf_off < writebuf_len) {
695  SUSPEND_OBUF(34);
696  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
697  }
698  }
699  break;
700  }
701  case INVALID:
702  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
703  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
704  SUSPEND_AFTER_OUTPUT(26);
705  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
706  in_p = in_stop;
707  SUSPEND(econv_source_buffer_empty, 8);
708  }
709  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
710  in_p = in_stop;
711  }
712  else {
713  in_p = inchar_start + (unitlen - tc->recognized_len);
714  }
715  }
716  else {
717  ssize_t invalid_len; /* including the last byte which causes invalid */
718  ssize_t discard_len;
719  invalid_len = tc->recognized_len + (in_p - inchar_start);
720  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
721  readagain_len = invalid_len - discard_len;
722  }
723  goto invalid;
724  case UNDEF:
725  goto undef;
726  default:
727  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
728  }
729  continue;
730 
731  invalid:
732  SUSPEND(econv_invalid_byte_sequence, 1);
733  continue;
734 
735  incomplete:
736  SUSPEND(econv_incomplete_input, 27);
737  continue;
738 
739  undef:
740  SUSPEND(econv_undefined_conversion, 2);
741  continue;
742  }
743 
744  /* cleanup */
745  if (tr->finish_func) {
746  SUSPEND_OBUF(4);
747  if (tr->max_output <= out_stop - out_p) {
748  out_p += tr->finish_func(TRANSCODING_STATE(tc),
749  out_p, out_stop - out_p);
750  }
751  else {
752  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
753  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
754  writebuf_off = 0;
755  while (writebuf_off < writebuf_len) {
756  SUSPEND_OBUF(23);
757  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
758  }
759  }
760  }
761  while (1)
762  SUSPEND(econv_finished, 6);
763 #undef SUSPEND
764 #undef next_table
765 #undef next_info
766 #undef next_byte
767 #undef writebuf_len
768 #undef writebuf_off
769 }
770 
771 static rb_econv_result_t
772 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
773  const unsigned char *in_stop, unsigned char *out_stop,
774  rb_transcoding *tc,
775  const int opt)
776 {
777  if (tc->readagain_len) {
778  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
779  const unsigned char *readagain_pos = readagain_buf;
780  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
781  rb_econv_result_t res;
782 
783  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
784  unsigned char, tc->readagain_len);
785  tc->readagain_len = 0;
786  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
787  if (res != econv_source_buffer_empty) {
788  MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
789  readagain_pos, unsigned char, readagain_stop - readagain_pos);
790  tc->readagain_len += readagain_stop - readagain_pos;
791  return res;
792  }
793  }
794  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
795 }
796 
797 static rb_transcoding *
798 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
799 {
800  rb_transcoding *tc;
801 
802  tc = ALLOC(rb_transcoding);
803  tc->transcoder = tr;
804  tc->flags = flags;
805  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
806  tc->state.ptr = xmalloc(tr->state_size);
807  if (tr->state_init_func) {
808  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
809  }
810  tc->resume_position = 0;
811  tc->recognized_len = 0;
812  tc->readagain_len = 0;
813  tc->writebuf_len = 0;
814  tc->writebuf_off = 0;
815  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
816  tc->readbuf.ptr = xmalloc(tr->max_input);
817  }
818  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
819  tc->writebuf.ptr = xmalloc(tr->max_output);
820  }
821  return tc;
822 }
823 
824 static rb_econv_result_t
825 rb_transcoding_convert(rb_transcoding *tc,
826  const unsigned char **input_ptr, const unsigned char *input_stop,
827  unsigned char **output_ptr, unsigned char *output_stop,
828  int flags)
829 {
830  return transcode_restartable(
831  input_ptr, output_ptr,
832  input_stop, output_stop,
833  tc, flags);
834 }
835 
836 static void
837 rb_transcoding_close(rb_transcoding *tc)
838 {
839  const rb_transcoder *tr = tc->transcoder;
840  if (tr->state_fini_func) {
841  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
842  }
843  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
844  xfree(tc->state.ptr);
845  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
846  xfree(tc->readbuf.ptr);
847  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
848  xfree(tc->writebuf.ptr);
849  xfree(tc);
850 }
851 
852 static size_t
853 rb_transcoding_memsize(rb_transcoding *tc)
854 {
855  size_t size = sizeof(rb_transcoding);
856  const rb_transcoder *tr = tc->transcoder;
857 
858  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
859  size += tr->state_size;
860  }
861  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
862  size += tr->max_input;
863  }
864  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
865  size += tr->max_output;
866  }
867  return size;
868 }
869 
870 static rb_econv_t *
871 rb_econv_alloc(int n_hint)
872 {
873  rb_econv_t *ec;
874 
875  if (n_hint <= 0)
876  n_hint = 1;
877 
878  ec = ALLOC(rb_econv_t);
879  ec->flags = 0;
880  ec->source_encoding_name = NULL;
881  ec->destination_encoding_name = NULL;
882  ec->started = 0;
883  ec->replacement_str = NULL;
884  ec->replacement_len = 0;
885  ec->replacement_enc = NULL;
886  ec->replacement_allocated = 0;
887  ec->in_buf_start = NULL;
888  ec->in_data_start = NULL;
889  ec->in_data_end = NULL;
890  ec->in_buf_end = NULL;
891  ec->num_allocated = n_hint;
892  ec->num_trans = 0;
893  ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
894  ec->num_finished = 0;
895  ec->last_tc = NULL;
896  ec->last_error.result = econv_source_buffer_empty;
897  ec->last_error.error_tc = NULL;
898  ec->last_error.source_encoding = NULL;
899  ec->last_error.destination_encoding = NULL;
900  ec->last_error.error_bytes_start = NULL;
901  ec->last_error.error_bytes_len = 0;
902  ec->last_error.readagain_len = 0;
903  ec->source_encoding = NULL;
904  ec->destination_encoding = NULL;
905  return ec;
906 }
907 
908 static int
909 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
910 {
911  int n, j;
912  int bufsize = 4096;
913  unsigned char *p;
914 
915  if (ec->num_trans == ec->num_allocated) {
916  n = ec->num_allocated * 2;
917  REALLOC_N(ec->elems, rb_econv_elem_t, n);
918  ec->num_allocated = n;
919  }
920 
921  p = xmalloc(bufsize);
922 
923  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
924 
925  ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
926  ec->elems[i].out_buf_start = p;
927  ec->elems[i].out_buf_end = p + bufsize;
928  ec->elems[i].out_data_start = p;
929  ec->elems[i].out_data_end = p;
930  ec->elems[i].last_result = econv_source_buffer_empty;
931 
932  ec->num_trans++;
933 
934  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
935  for (j = ec->num_trans-1; i <= j; j--) {
936  rb_transcoding *tc = ec->elems[j].tc;
937  const rb_transcoder *tr2 = tc->transcoder;
938  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
939  ec->last_tc = tc;
940  break;
941  }
942  }
943 
944  return 0;
945 }
946 
947 static rb_econv_t *
948 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
949 {
950  rb_econv_t *ec;
951  int i, ret;
952 
953  for (i = 0; i < n; i++) {
954  const rb_transcoder *tr;
955  tr = load_transcoder_entry(entries[i]);
956  if (!tr)
957  return NULL;
958  }
959 
960  ec = rb_econv_alloc(n);
961 
962  for (i = 0; i < n; i++) {
963  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
964  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
965  if (ret == -1) {
966  rb_econv_close(ec);
967  return NULL;
968  }
969  }
970 
971  return ec;
972 }
973 
974 struct trans_open_t {
975  transcoder_entry_t **entries;
976  int num_additional;
977 };
978 
979 static void
980 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
981 {
982  struct trans_open_t *toarg = arg;
983 
984  if (!toarg->entries) {
985  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
986  }
987  toarg->entries[depth] = get_transcoder_entry(sname, dname);
988 }
989 
990 static rb_econv_t *
991 rb_econv_open0(const char *sname, const char *dname, int ecflags)
992 {
993  transcoder_entry_t **entries = NULL;
994  int num_trans;
995  rb_econv_t *ec;
996 
997  /* Just check if sname and dname are defined */
998  /* (This check is needed?) */
999  if (*sname) rb_enc_find_index(sname);
1000  if (*dname) rb_enc_find_index(dname);
1001 
1002  if (*sname == '\0' && *dname == '\0') {
1003  num_trans = 0;
1004  entries = NULL;
1005  sname = dname = "";
1006  }
1007  else {
1008  struct trans_open_t toarg;
1009  toarg.entries = NULL;
1010  toarg.num_additional = 0;
1011  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1012  entries = toarg.entries;
1013  if (num_trans < 0) {
1014  xfree(entries);
1015  return NULL;
1016  }
1017  }
1018 
1019  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1020  xfree(entries);
1021  if (!ec)
1022  return NULL;
1023 
1024  ec->flags = ecflags;
1025  ec->source_encoding_name = sname;
1026  ec->destination_encoding_name = dname;
1027 
1028  return ec;
1029 }
1030 
1031 #define MAX_ECFLAGS_DECORATORS 32
1032 
1033 static int
1034 decorator_names(int ecflags, const char **decorators_ret)
1035 {
1036  int num_decorators;
1037 
1038  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1042  case 0:
1043  break;
1044  default:
1045  return -1;
1046  }
1047 
1048  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1050  return -1;
1051 
1052  num_decorators = 0;
1053 
1054  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1055  decorators_ret[num_decorators++] = "xml_text_escape";
1056  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1057  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1058  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1059  decorators_ret[num_decorators++] = "xml_attr_quote";
1060 
1061  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1062  decorators_ret[num_decorators++] = "crlf_newline";
1063  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1064  decorators_ret[num_decorators++] = "cr_newline";
1065  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1066  decorators_ret[num_decorators++] = "universal_newline";
1067 
1068  return num_decorators;
1069 }
1070 
1071 rb_econv_t *
1072 rb_econv_open(const char *sname, const char *dname, int ecflags)
1073 {
1074  rb_econv_t *ec;
1075  int num_decorators;
1076  const char *decorators[MAX_ECFLAGS_DECORATORS];
1077  int i;
1078 
1079  num_decorators = decorator_names(ecflags, decorators);
1080  if (num_decorators == -1)
1081  return NULL;
1082 
1083  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1084  if (!ec)
1085  return NULL;
1086 
1087  for (i = 0; i < num_decorators; i++)
1088  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1089  rb_econv_close(ec);
1090  return NULL;
1091  }
1092 
1093  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1094 
1095  return ec;
1096 }
1097 
1098 static int
1099 trans_sweep(rb_econv_t *ec,
1100  const unsigned char **input_ptr, const unsigned char *input_stop,
1101  unsigned char **output_ptr, unsigned char *output_stop,
1102  int flags,
1103  int start)
1104 {
1105  int try;
1106  int i, f;
1107 
1108  const unsigned char **ipp, *is, *iold;
1109  unsigned char **opp, *os, *oold;
1110  rb_econv_result_t res;
1111 
1112  try = 1;
1113  while (try) {
1114  try = 0;
1115  for (i = start; i < ec->num_trans; i++) {
1116  rb_econv_elem_t *te = &ec->elems[i];
1117 
1118  if (i == 0) {
1119  ipp = input_ptr;
1120  is = input_stop;
1121  }
1122  else {
1123  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1124  ipp = (const unsigned char **)&prev_te->out_data_start;
1125  is = prev_te->out_data_end;
1126  }
1127 
1128  if (i == ec->num_trans-1) {
1129  opp = output_ptr;
1130  os = output_stop;
1131  }
1132  else {
1133  if (te->out_buf_start != te->out_data_start) {
1134  ssize_t len = te->out_data_end - te->out_data_start;
1135  ssize_t off = te->out_data_start - te->out_buf_start;
1136  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1137  te->out_data_start = te->out_buf_start;
1138  te->out_data_end -= off;
1139  }
1140  opp = &te->out_data_end;
1141  os = te->out_buf_end;
1142  }
1143 
1144  f = flags;
1145  if (ec->num_finished != i)
1146  f |= ECONV_PARTIAL_INPUT;
1147  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1148  start = 1;
1149  flags &= ~ECONV_AFTER_OUTPUT;
1150  }
1151  if (i != 0)
1152  f &= ~ECONV_AFTER_OUTPUT;
1153  iold = *ipp;
1154  oold = *opp;
1155  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1156  if (iold != *ipp || oold != *opp)
1157  try = 1;
1158 
1159  switch (res) {
1163  case econv_after_output:
1164  return i;
1165 
1168  break;
1169 
1170  case econv_finished:
1171  ec->num_finished = i+1;
1172  break;
1173  }
1174  }
1175  }
1176  return -1;
1177 }
1178 
1179 static rb_econv_result_t
1180 rb_trans_conv(rb_econv_t *ec,
1181  const unsigned char **input_ptr, const unsigned char *input_stop,
1182  unsigned char **output_ptr, unsigned char *output_stop,
1183  int flags,
1184  int *result_position_ptr)
1185 {
1186  int i;
1187  int needreport_index;
1188  int sweep_start;
1189 
1190  unsigned char empty_buf;
1191  unsigned char *empty_ptr = &empty_buf;
1192 
1193  if (!input_ptr) {
1194  input_ptr = (const unsigned char **)&empty_ptr;
1195  input_stop = empty_ptr;
1196  }
1197 
1198  if (!output_ptr) {
1199  output_ptr = &empty_ptr;
1200  output_stop = empty_ptr;
1201  }
1202 
1203  if (ec->elems[0].last_result == econv_after_output)
1204  ec->elems[0].last_result = econv_source_buffer_empty;
1205 
1206  for (i = ec->num_trans-1; 0 <= i; i--) {
1207  switch (ec->elems[i].last_result) {
1211  case econv_after_output:
1212  case econv_finished:
1213  sweep_start = i+1;
1214  goto found_needreport;
1215 
1218  break;
1219 
1220  default:
1221  rb_bug("unexpected transcode last result");
1222  }
1223  }
1224 
1225  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1226 
1227  if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1228  (flags & ECONV_AFTER_OUTPUT)) {
1229  rb_econv_result_t res;
1230 
1231  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1233  result_position_ptr);
1234 
1235  if (res == econv_source_buffer_empty)
1236  return econv_after_output;
1237  return res;
1238  }
1239 
1240  sweep_start = 0;
1241 
1242  found_needreport:
1243 
1244  do {
1245  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1246  sweep_start = needreport_index + 1;
1247  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1248 
1249  for (i = ec->num_trans-1; 0 <= i; i--) {
1250  if (ec->elems[i].last_result != econv_source_buffer_empty) {
1251  rb_econv_result_t res = ec->elems[i].last_result;
1252  if (res == econv_invalid_byte_sequence ||
1253  res == econv_incomplete_input ||
1254  res == econv_undefined_conversion ||
1255  res == econv_after_output) {
1256  ec->elems[i].last_result = econv_source_buffer_empty;
1257  }
1258  if (result_position_ptr)
1259  *result_position_ptr = i;
1260  return res;
1261  }
1262  }
1263  if (result_position_ptr)
1264  *result_position_ptr = -1;
1266 }
1267 
1268 static rb_econv_result_t
1269 rb_econv_convert0(rb_econv_t *ec,
1270  const unsigned char **input_ptr, const unsigned char *input_stop,
1271  unsigned char **output_ptr, unsigned char *output_stop,
1272  int flags)
1273 {
1274  rb_econv_result_t res;
1275  int result_position;
1276  int has_output = 0;
1277 
1278  memset(&ec->last_error, 0, sizeof(ec->last_error));
1279 
1280  if (ec->num_trans == 0) {
1281  size_t len;
1282  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1283  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1284  len = output_stop - *output_ptr;
1285  memcpy(*output_ptr, ec->in_data_start, len);
1286  *output_ptr = output_stop;
1287  ec->in_data_start += len;
1289  goto gotresult;
1290  }
1291  len = ec->in_data_end - ec->in_data_start;
1292  memcpy(*output_ptr, ec->in_data_start, len);
1293  *output_ptr += len;
1294  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1295  if (flags & ECONV_AFTER_OUTPUT) {
1296  res = econv_after_output;
1297  goto gotresult;
1298  }
1299  }
1300  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1301  len = output_stop - *output_ptr;
1302  }
1303  else {
1304  len = input_stop - *input_ptr;
1305  }
1306  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1307  *(*output_ptr)++ = *(*input_ptr)++;
1308  res = econv_after_output;
1309  goto gotresult;
1310  }
1311  memcpy(*output_ptr, *input_ptr, len);
1312  *output_ptr += len;
1313  *input_ptr += len;
1314  if (*input_ptr != input_stop)
1316  else if (flags & ECONV_PARTIAL_INPUT)
1318  else
1319  res = econv_finished;
1320  goto gotresult;
1321  }
1322 
1323  if (ec->elems[ec->num_trans-1].out_data_start) {
1324  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1325  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1326  if (data_start != data_end) {
1327  size_t len;
1328  if (output_stop - *output_ptr < data_end - data_start) {
1329  len = output_stop - *output_ptr;
1330  memcpy(*output_ptr, data_start, len);
1331  *output_ptr = output_stop;
1332  ec->elems[ec->num_trans-1].out_data_start += len;
1334  goto gotresult;
1335  }
1336  len = data_end - data_start;
1337  memcpy(*output_ptr, data_start, len);
1338  *output_ptr += len;
1339  ec->elems[ec->num_trans-1].out_data_start =
1340  ec->elems[ec->num_trans-1].out_data_end =
1341  ec->elems[ec->num_trans-1].out_buf_start;
1342  has_output = 1;
1343  }
1344  }
1345 
1346  if (ec->in_buf_start &&
1347  ec->in_data_start != ec->in_data_end) {
1348  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1349  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1350  if (res != econv_source_buffer_empty)
1351  goto gotresult;
1352  }
1353 
1354  if (has_output &&
1355  (flags & ECONV_AFTER_OUTPUT) &&
1356  *input_ptr != input_stop) {
1357  input_stop = *input_ptr;
1358  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1359  if (res == econv_source_buffer_empty)
1360  res = econv_after_output;
1361  }
1362  else if ((flags & ECONV_AFTER_OUTPUT) ||
1363  ec->num_trans == 1) {
1364  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1365  }
1366  else {
1367  flags |= ECONV_AFTER_OUTPUT;
1368  do {
1369  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1370  } while (res == econv_after_output);
1371  }
1372 
1373  gotresult:
1374  ec->last_error.result = res;
1375  if (res == econv_invalid_byte_sequence ||
1376  res == econv_incomplete_input ||
1377  res == econv_undefined_conversion) {
1378  rb_transcoding *error_tc = ec->elems[result_position].tc;
1379  ec->last_error.error_tc = error_tc;
1380  ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1381  ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1382  ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1383  ec->last_error.error_bytes_len = error_tc->recognized_len;
1384  ec->last_error.readagain_len = error_tc->readagain_len;
1385  }
1386 
1387  return res;
1388 }
1389 
1390 static int output_replacement_character(rb_econv_t *ec);
1391 
1392 static int
1393 output_hex_charref(rb_econv_t *ec)
1394 {
1395  int ret;
1396  unsigned char utfbuf[1024];
1397  const unsigned char *utf;
1398  size_t utf_len;
1399  int utf_allocated = 0;
1400  char charef_buf[16];
1401  const unsigned char *p;
1402 
1403  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1404  utf = ec->last_error.error_bytes_start;
1405  utf_len = ec->last_error.error_bytes_len;
1406  }
1407  else {
1408  utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1409  ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1410  utfbuf, sizeof(utfbuf),
1411  &utf_len);
1412  if (!utf)
1413  return -1;
1414  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1415  utf_allocated = 1;
1416  }
1417 
1418  if (utf_len % 4 != 0)
1419  goto fail;
1420 
1421  p = utf;
1422  while (4 <= utf_len) {
1423  unsigned int u = 0;
1424  u += p[0] << 24;
1425  u += p[1] << 16;
1426  u += p[2] << 8;
1427  u += p[3];
1428  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1429 
1430  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1431  if (ret == -1)
1432  goto fail;
1433 
1434  p += 4;
1435  utf_len -= 4;
1436  }
1437 
1438  if (utf_allocated)
1439  xfree((void *)utf);
1440  return 0;
1441 
1442  fail:
1443  if (utf_allocated)
1444  xfree((void *)utf);
1445  return -1;
1446 }
1447 
1450  const unsigned char **input_ptr, const unsigned char *input_stop,
1451  unsigned char **output_ptr, unsigned char *output_stop,
1452  int flags)
1453 {
1454  rb_econv_result_t ret;
1455 
1456  unsigned char empty_buf;
1457  unsigned char *empty_ptr = &empty_buf;
1458 
1459  ec->started = 1;
1460 
1461  if (!input_ptr) {
1462  input_ptr = (const unsigned char **)&empty_ptr;
1463  input_stop = empty_ptr;
1464  }
1465 
1466  if (!output_ptr) {
1467  output_ptr = &empty_ptr;
1468  output_stop = empty_ptr;
1469  }
1470 
1471  resume:
1472  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1473 
1474  if (ret == econv_invalid_byte_sequence ||
1475  ret == econv_incomplete_input) {
1476  /* deal with invalid byte sequence */
1477  /* todo: add more alternative behaviors */
1478  switch (ec->flags & ECONV_INVALID_MASK) {
1479  case ECONV_INVALID_REPLACE:
1480  if (output_replacement_character(ec) == 0)
1481  goto resume;
1482  }
1483  }
1484 
1485  if (ret == econv_undefined_conversion) {
1486  /* valid character in source encoding
1487  * but no related character(s) in destination encoding */
1488  /* todo: add more alternative behaviors */
1489  switch (ec->flags & ECONV_UNDEF_MASK) {
1490  case ECONV_UNDEF_REPLACE:
1491  if (output_replacement_character(ec) == 0)
1492  goto resume;
1493  break;
1494 
1496  if (output_hex_charref(ec) == 0)
1497  goto resume;
1498  break;
1499  }
1500  }
1501 
1502  return ret;
1503 }
1504 
1505 const char *
1507 {
1508  rb_transcoding *tc = ec->last_tc;
1509  const rb_transcoder *tr;
1510 
1511  if (tc == NULL)
1512  return "";
1513 
1514  tr = tc->transcoder;
1515 
1516  if (tr->asciicompat_type == asciicompat_encoder)
1517  return tr->src_encoding;
1518  return tr->dst_encoding;
1519 }
1520 
1521 static unsigned char *
1522 allocate_converted_string(const char *sname, const char *dname,
1523  const unsigned char *str, size_t len,
1524  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1525  size_t *dst_len_ptr)
1526 {
1527  unsigned char *dst_str;
1528  size_t dst_len;
1529  size_t dst_bufsize;
1530 
1531  rb_econv_t *ec;
1532  rb_econv_result_t res;
1533 
1534  const unsigned char *sp;
1535  unsigned char *dp;
1536 
1537  if (caller_dst_buf)
1538  dst_bufsize = caller_dst_bufsize;
1539  else if (len == 0)
1540  dst_bufsize = 1;
1541  else
1542  dst_bufsize = len;
1543 
1544  ec = rb_econv_open(sname, dname, 0);
1545  if (ec == NULL)
1546  return NULL;
1547  if (caller_dst_buf)
1548  dst_str = caller_dst_buf;
1549  else
1550  dst_str = xmalloc(dst_bufsize);
1551  dst_len = 0;
1552  sp = str;
1553  dp = dst_str+dst_len;
1554  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1555  dst_len = dp - dst_str;
1556  while (res == econv_destination_buffer_full) {
1557  if (SIZE_MAX/2 < dst_bufsize) {
1558  goto fail;
1559  }
1560  dst_bufsize *= 2;
1561  if (dst_str == caller_dst_buf) {
1562  unsigned char *tmp;
1563  tmp = xmalloc(dst_bufsize);
1564  memcpy(tmp, dst_str, dst_bufsize/2);
1565  dst_str = tmp;
1566  }
1567  else {
1568  dst_str = xrealloc(dst_str, dst_bufsize);
1569  }
1570  dp = dst_str+dst_len;
1571  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1572  dst_len = dp - dst_str;
1573  }
1574  if (res != econv_finished) {
1575  goto fail;
1576  }
1577  rb_econv_close(ec);
1578  *dst_len_ptr = dst_len;
1579  return dst_str;
1580 
1581  fail:
1582  if (dst_str != caller_dst_buf)
1583  xfree(dst_str);
1584  rb_econv_close(ec);
1585  return NULL;
1586 }
1587 
1588 /* result: 0:success -1:failure */
1589 int
1591  const unsigned char *str, size_t len, const char *str_encoding)
1592 {
1593  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1594  unsigned char insert_buf[4096];
1595  const unsigned char *insert_str = NULL;
1596  size_t insert_len;
1597 
1598  int last_trans_index;
1599  rb_transcoding *tc;
1600 
1601  unsigned char **buf_start_p;
1602  unsigned char **data_start_p;
1603  unsigned char **data_end_p;
1604  unsigned char **buf_end_p;
1605 
1606  size_t need;
1607 
1608  ec->started = 1;
1609 
1610  if (len == 0)
1611  return 0;
1612 
1613  if (encoding_equal(insert_encoding, str_encoding)) {
1614  insert_str = str;
1615  insert_len = len;
1616  }
1617  else {
1618  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1619  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1620  if (insert_str == NULL)
1621  return -1;
1622  }
1623 
1624  need = insert_len;
1625 
1626  last_trans_index = ec->num_trans-1;
1627  if (ec->num_trans == 0) {
1628  tc = NULL;
1629  buf_start_p = &ec->in_buf_start;
1630  data_start_p = &ec->in_data_start;
1631  data_end_p = &ec->in_data_end;
1632  buf_end_p = &ec->in_buf_end;
1633  }
1634  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1635  tc = ec->elems[last_trans_index].tc;
1636  need += tc->readagain_len;
1637  if (need < insert_len)
1638  goto fail;
1639  if (last_trans_index == 0) {
1640  buf_start_p = &ec->in_buf_start;
1641  data_start_p = &ec->in_data_start;
1642  data_end_p = &ec->in_data_end;
1643  buf_end_p = &ec->in_buf_end;
1644  }
1645  else {
1646  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1647  buf_start_p = &ee->out_buf_start;
1648  data_start_p = &ee->out_data_start;
1649  data_end_p = &ee->out_data_end;
1650  buf_end_p = &ee->out_buf_end;
1651  }
1652  }
1653  else {
1654  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1655  buf_start_p = &ee->out_buf_start;
1656  data_start_p = &ee->out_data_start;
1657  data_end_p = &ee->out_data_end;
1658  buf_end_p = &ee->out_buf_end;
1659  tc = ec->elems[last_trans_index].tc;
1660  }
1661 
1662  if (*buf_start_p == NULL) {
1663  unsigned char *buf = xmalloc(need);
1664  *buf_start_p = buf;
1665  *data_start_p = buf;
1666  *data_end_p = buf;
1667  *buf_end_p = buf+need;
1668  }
1669  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1670  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1671  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1672  *data_start_p = *buf_start_p;
1673  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1674  unsigned char *buf;
1675  size_t s = (*data_end_p - *buf_start_p) + need;
1676  if (s < need)
1677  goto fail;
1678  buf = xrealloc(*buf_start_p, s);
1679  *data_start_p = buf;
1680  *data_end_p = buf + (*data_end_p - *buf_start_p);
1681  *buf_start_p = buf;
1682  *buf_end_p = buf + s;
1683  }
1684  }
1685 
1686  memcpy(*data_end_p, insert_str, insert_len);
1687  *data_end_p += insert_len;
1688  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1689  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1690  *data_end_p += tc->readagain_len;
1691  tc->readagain_len = 0;
1692  }
1693 
1694  if (insert_str != str && insert_str != insert_buf)
1695  xfree((void*)insert_str);
1696  return 0;
1697 
1698  fail:
1699  if (insert_str != str && insert_str != insert_buf)
1700  xfree((void*)insert_str);
1701  return -1;
1702 }
1703 
1704 void
1706 {
1707  int i;
1708 
1709  if (ec->replacement_allocated) {
1710  xfree((void *)ec->replacement_str);
1711  }
1712  for (i = 0; i < ec->num_trans; i++) {
1713  rb_transcoding_close(ec->elems[i].tc);
1714  if (ec->elems[i].out_buf_start)
1715  xfree(ec->elems[i].out_buf_start);
1716  }
1717  xfree(ec->in_buf_start);
1718  xfree(ec->elems);
1719  xfree(ec);
1720 }
1721 
1722 size_t
1723 rb_econv_memsize(rb_econv_t *ec)
1724 {
1725  size_t size = sizeof(rb_econv_t);
1726  int i;
1727 
1728  if (ec->replacement_allocated) {
1729  size += ec->replacement_len;
1730  }
1731  for (i = 0; i < ec->num_trans; i++) {
1732  size += rb_transcoding_memsize(ec->elems[i].tc);
1733 
1734  if (ec->elems[i].out_buf_start) {
1735  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1736  }
1737  }
1738  size += ec->in_buf_end - ec->in_buf_start;
1739  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1740 
1741  return size;
1742 }
1743 
1744 int
1746 {
1747  if (ec->num_trans == 0)
1748  return 0;
1749 #if SIZEOF_SIZE_T > SIZEOF_INT
1750  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1751 #endif
1752  return (int)ec->elems[0].tc->readagain_len;
1753 }
1754 
1755 void
1756 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1757 {
1758  rb_transcoding *tc;
1759  if (ec->num_trans == 0 || n == 0)
1760  return;
1761  tc = ec->elems[0].tc;
1762  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1763  tc->readagain_len -= n;
1764 }
1765 
/*
 * Search state for rb_econv_asciicompat_encoding(): the ASCII-incompatible
 * encoding being asked about and the ASCII-compatible counterpart found
 * for it (NULL until one is found).
 */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;
    const char *ascii_incompat_name;
};
1770 
1771 static int
1772 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1773 {
1774  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1775  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1776  const rb_transcoder *tr;
1777 
1778  if (DECORATOR_P(entry->sname, entry->dname))
1779  return ST_CONTINUE;
1780  tr = load_transcoder_entry(entry);
1781  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1782  data->ascii_compat_name = tr->dst_encoding;
1783  return ST_STOP;
1784  }
1785  return ST_CONTINUE;
1786 }
1787 
1788 const char *
1789 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1790 {
1791  st_data_t v;
1792  st_table *table2;
1793  struct asciicompat_encoding_t data;
1794 
1795  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1796  return NULL;
1797  table2 = (st_table *)v;
1798 
1799  /*
1800  * Assumption:
1801  * There is at most one transcoder for
1802  * converting from ASCII incompatible encoding.
1803  *
1804  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1805  */
1806  if (table2->num_entries != 1)
1807  return NULL;
1808 
1809  data.ascii_incompat_name = ascii_incompat_name;
1810  data.ascii_compat_name = NULL;
1811  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1812  return data.ascii_compat_name;
1813 }
1814 
1815 /*
1816  * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1817  *
1818  * If the result of the conversion is not compatible with the encoding of
1819  * `dst`, `dst` may not be valid encoding.
1820  */
1821 VALUE
1822 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1823 {
1824  unsigned const char *sp, *se;
1825  unsigned char *ds, *dp, *de;
1826  rb_econv_result_t res;
1827  int max_output;
1828  enum ruby_coderange_type coderange;
1829  rb_encoding *dst_enc = ec->destination_encoding;
1830 
1831  if (NIL_P(dst)) {
1832  dst = rb_str_buf_new(len);
1833  if (dst_enc) {
1834  rb_enc_associate(dst, dst_enc);
1835  }
1836  coderange = ENC_CODERANGE_7BIT; // scan from the start
1837  }
1838  else {
1839  dst_enc = rb_enc_get(dst);
1840  coderange = rb_enc_str_coderange(dst);
1841  }
1842 
1843  if (ec->last_tc)
1844  max_output = ec->last_tc->transcoder->max_output;
1845  else
1846  max_output = 1;
1847 
1848  do {
1849  int cr;
1850  long dlen = RSTRING_LEN(dst);
1851  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1852  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1853  if (LONG_MAX < new_capa)
1854  rb_raise(rb_eArgError, "too long string");
1855  rb_str_modify_expand(dst, new_capa - dlen);
1856  }
1857  sp = (const unsigned char *)ss;
1858  se = sp + len;
1859  ds = (unsigned char *)RSTRING_PTR(dst);
1860  de = ds + rb_str_capacity(dst);
1861  dp = ds += dlen;
1862  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1863  switch (coderange) {
1864  case ENC_CODERANGE_7BIT:
1865  case ENC_CODERANGE_VALID:
1866  cr = (int)coderange;
1867  rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1868  coderange = cr;
1869  ENC_CODERANGE_SET(dst, coderange);
1870  break;
1871  case ENC_CODERANGE_UNKNOWN:
1872  case ENC_CODERANGE_BROKEN:
1873  break;
1874  }
1875  len -= (const char *)sp - ss;
1876  ss = (const char *)sp;
1877  rb_str_set_len(dst, dlen + (dp - ds));
1879  } while (res == econv_destination_buffer_full);
1880 
1881  return dst;
1882 }
1883 
1884 VALUE
1885 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1886 {
1887  src = rb_str_new_frozen(src);
1888  dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1889  RB_GC_GUARD(src);
1890  return dst;
1891 }
1892 
1893 VALUE
1894 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1895 {
1896  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1897 }
1898 
1899 VALUE
1900 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1901 {
1902  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1903 }
1904 
1905 VALUE
1907 {
1908  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1909 }
1910 
1911 static int
1912 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1913 {
1914  transcoder_entry_t *entry;
1915  const rb_transcoder *tr;
1916 
1917  if (ec->started != 0)
1918  return -1;
1919 
1920  entry = get_transcoder_entry(sname, dname);
1921  if (!entry)
1922  return -1;
1923 
1924  tr = load_transcoder_entry(entry);
1925  if (!tr) return -1;
1926 
1927  return rb_econv_add_transcoder_at(ec, tr, n);
1928 }
1929 
1930 static int
1931 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1932 {
1933  return rb_econv_add_converter(ec, "", decorator_name, n);
1934 }
1935 
1936 int
1937 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1938 {
1939  const rb_transcoder *tr;
1940 
1941  if (ec->num_trans == 0)
1942  return rb_econv_decorate_at(ec, decorator_name, 0);
1943 
1944  tr = ec->elems[0].tc->transcoder;
1945 
1946  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1947  tr->asciicompat_type == asciicompat_decoder)
1948  return rb_econv_decorate_at(ec, decorator_name, 1);
1949 
1950  return rb_econv_decorate_at(ec, decorator_name, 0);
1951 }
1952 
1953 int
1954 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1955 {
1956  const rb_transcoder *tr;
1957 
1958  if (ec->num_trans == 0)
1959  return rb_econv_decorate_at(ec, decorator_name, 0);
1960 
1961  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1962 
1963  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1964  tr->asciicompat_type == asciicompat_encoder)
1965  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1966 
1967  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1968 }
1969 
1970 void
1972 {
1973  const char *dname = 0;
1974 
1975  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1977  dname = "universal_newline";
1978  break;
1980  dname = "crlf_newline";
1981  break;
1983  dname = "cr_newline";
1984  break;
1985  }
1986 
1987  if (dname) {
1988  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1989  int num_trans = ec->num_trans;
1990  int i, j = 0;
1991 
1992  for (i=0; i < num_trans; i++) {
1993  if (transcoder == ec->elems[i].tc->transcoder) {
1994  rb_transcoding_close(ec->elems[i].tc);
1995  xfree(ec->elems[i].out_buf_start);
1996  ec->num_trans--;
1997  }
1998  else
1999  ec->elems[j++] = ec->elems[i];
2000  }
2001  }
2002 
2003  ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2004 }
2005 
2006 static VALUE
2007 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2008 {
2009  int has_description = 0;
2010 
2011  if (NIL_P(mesg))
2012  mesg = rb_str_new(NULL, 0);
2013 
2014  if (*sname != '\0' || *dname != '\0') {
2015  if (*sname == '\0')
2016  rb_str_cat2(mesg, dname);
2017  else if (*dname == '\0')
2018  rb_str_cat2(mesg, sname);
2019  else
2020  rb_str_catf(mesg, "%s to %s", sname, dname);
2021  has_description = 1;
2022  }
2023 
2024  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2028  const char *pre = "";
2029  if (has_description)
2030  rb_str_cat2(mesg, " with ");
2031  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2032  rb_str_cat2(mesg, pre); pre = ",";
2033  rb_str_cat2(mesg, "universal_newline");
2034  }
2035  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2036  rb_str_cat2(mesg, pre); pre = ",";
2037  rb_str_cat2(mesg, "crlf_newline");
2038  }
2039  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2040  rb_str_cat2(mesg, pre); pre = ",";
2041  rb_str_cat2(mesg, "cr_newline");
2042  }
2043  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2044  rb_str_cat2(mesg, pre); pre = ",";
2045  rb_str_cat2(mesg, "xml_text");
2046  }
2047  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2048  rb_str_cat2(mesg, pre); pre = ",";
2049  rb_str_cat2(mesg, "xml_attr_content");
2050  }
2051  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2052  rb_str_cat2(mesg, pre); pre = ",";
2053  rb_str_cat2(mesg, "xml_attr_quote");
2054  }
2055  has_description = 1;
2056  }
2057  if (!has_description) {
2058  rb_str_cat2(mesg, "no-conversion");
2059  }
2060 
2061  return mesg;
2062 }
2063 
2064 VALUE
2065 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2066 {
2067  VALUE mesg, exc;
2068  mesg = rb_str_new_cstr("code converter not found (");
2069  econv_description(sname, dname, ecflags, mesg);
2070  rb_str_cat2(mesg, ")");
2071  exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2072  return exc;
2073 }
2074 
2075 static VALUE
2076 make_econv_exception(rb_econv_t *ec)
2077 {
2078  VALUE mesg, exc;
2079  if (ec->last_error.result == econv_invalid_byte_sequence ||
2080  ec->last_error.result == econv_incomplete_input) {
2081  const char *err = (const char *)ec->last_error.error_bytes_start;
2082  size_t error_len = ec->last_error.error_bytes_len;
2083  VALUE bytes = rb_str_new(err, error_len);
2084  VALUE dumped = rb_str_dump(bytes);
2085  size_t readagain_len = ec->last_error.readagain_len;
2086  VALUE bytes2 = Qnil;
2087  VALUE dumped2;
2088  if (ec->last_error.result == econv_incomplete_input) {
2089  mesg = rb_sprintf("incomplete %s on %s",
2090  StringValueCStr(dumped),
2091  ec->last_error.source_encoding);
2092  }
2093  else if (readagain_len) {
2094  bytes2 = rb_str_new(err+error_len, readagain_len);
2095  dumped2 = rb_str_dump(bytes2);
2096  mesg = rb_sprintf("%s followed by %s on %s",
2097  StringValueCStr(dumped),
2098  StringValueCStr(dumped2),
2099  ec->last_error.source_encoding);
2100  }
2101  else {
2102  mesg = rb_sprintf("%s on %s",
2103  StringValueCStr(dumped),
2104  ec->last_error.source_encoding);
2105  }
2106 
2107  exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2108  rb_ivar_set(exc, id_error_bytes, bytes);
2109  rb_ivar_set(exc, id_readagain_bytes, bytes2);
2110  rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2111  goto set_encs;
2112  }
2113  if (ec->last_error.result == econv_undefined_conversion) {
2114  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2115  ec->last_error.error_bytes_len);
2116  VALUE dumped = Qnil;
2117  int idx;
2118  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2119  rb_encoding *utf8 = rb_utf8_encoding();
2120  const char *start, *end;
2121  int n;
2122  start = (const char *)ec->last_error.error_bytes_start;
2123  end = start + ec->last_error.error_bytes_len;
2124  n = rb_enc_precise_mbclen(start, end, utf8);
2125  if (MBCLEN_CHARFOUND_P(n) &&
2126  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2127  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2128  dumped = rb_sprintf("U+%04X", cc);
2129  }
2130  }
2131  if (NIL_P(dumped))
2132  dumped = rb_str_dump(bytes);
2133  if (strcmp(ec->last_error.source_encoding,
2134  ec->source_encoding_name) == 0 &&
2135  strcmp(ec->last_error.destination_encoding,
2136  ec->destination_encoding_name) == 0) {
2137  mesg = rb_sprintf("%s from %s to %s",
2138  StringValueCStr(dumped),
2139  ec->last_error.source_encoding,
2140  ec->last_error.destination_encoding);
2141  }
2142  else {
2143  int i;
2144  mesg = rb_sprintf("%s to %s in conversion from %s",
2145  StringValueCStr(dumped),
2146  ec->last_error.destination_encoding,
2147  ec->source_encoding_name);
2148  for (i = 0; i < ec->num_trans; i++) {
2149  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2150  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2151  rb_str_catf(mesg, " to %s",
2152  ec->elems[i].tc->transcoder->dst_encoding);
2153  }
2154  }
2155  exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2156  idx = rb_enc_find_index(ec->last_error.source_encoding);
2157  if (0 <= idx)
2158  rb_enc_associate_index(bytes, idx);
2159  rb_ivar_set(exc, id_error_char, bytes);
2160  goto set_encs;
2161  }
2162  return Qnil;
2163 
2164  set_encs:
2165  rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2166  rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2167  int idx = rb_enc_find_index(ec->last_error.source_encoding);
2168  if (0 <= idx)
2169  rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2170  idx = rb_enc_find_index(ec->last_error.destination_encoding);
2171  if (0 <= idx)
2172  rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2173  return exc;
2174 }
2175 
2176 static void
2177 more_output_buffer(
2178  VALUE destination,
2179  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2180  int max_output,
2181  unsigned char **out_start_ptr,
2182  unsigned char **out_pos,
2183  unsigned char **out_stop_ptr)
2184 {
2185  size_t len = (*out_pos - *out_start_ptr);
2186  size_t new_len = (len + max_output) * 2;
2187  *out_start_ptr = resize_destination(destination, len, new_len);
2188  *out_pos = *out_start_ptr + len;
2189  *out_stop_ptr = *out_start_ptr + new_len;
2190 }
2191 
2192 static int
2193 make_replacement(rb_econv_t *ec)
2194 {
2195  rb_transcoding *tc;
2196  const rb_transcoder *tr;
2197  const unsigned char *replacement;
2198  const char *repl_enc;
2199  const char *ins_enc;
2200  size_t len;
2201 
2202  if (ec->replacement_str)
2203  return 0;
2204 
2205  ins_enc = rb_econv_encoding_to_insert_output(ec);
2206 
2207  tc = ec->last_tc;
2208  if (*ins_enc) {
2209  tr = tc->transcoder;
2210  rb_enc_find(tr->dst_encoding);
2211  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2212  }
2213  else {
2214  replacement = (unsigned char *)"?";
2215  len = 1;
2216  repl_enc = "";
2217  }
2218 
2219  ec->replacement_str = replacement;
2220  ec->replacement_len = len;
2221  ec->replacement_enc = repl_enc;
2222  ec->replacement_allocated = 0;
2223  return 0;
2224 }
2225 
2226 int
2228  const unsigned char *str, size_t len, const char *encname)
2229 {
2230  unsigned char *str2;
2231  size_t len2;
2232  const char *encname2;
2233 
2234  encname2 = rb_econv_encoding_to_insert_output(ec);
2235 
2236  if (!*encname2 || encoding_equal(encname, encname2)) {
2237  str2 = xmalloc(len);
2238  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2239  len2 = len;
2240  encname2 = encname;
2241  }
2242  else {
2243  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2244  if (!str2)
2245  return -1;
2246  }
2247 
2248  if (ec->replacement_allocated) {
2249  xfree((void *)ec->replacement_str);
2250  }
2251  ec->replacement_allocated = 1;
2252  ec->replacement_str = str2;
2253  ec->replacement_len = len2;
2254  ec->replacement_enc = encname2;
2255  return 0;
2256 }
2257 
2258 static int
2259 output_replacement_character(rb_econv_t *ec)
2260 {
2261  int ret;
2262 
2263  if (make_replacement(ec) == -1)
2264  return -1;
2265 
2266  ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2267  if (ret == -1)
2268  return -1;
2269 
2270  return 0;
2271 }
2272 
2273 #if 1
2274 #define hash_fallback rb_hash_aref
2275 
2276 static VALUE
2277 proc_fallback(VALUE fallback, VALUE c)
2278 {
2279  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2280 }
2281 
2282 static VALUE
2283 method_fallback(VALUE fallback, VALUE c)
2284 {
2285  return rb_method_call(1, &c, fallback);
2286 }
2287 
2288 static VALUE
2289 aref_fallback(VALUE fallback, VALUE c)
2290 {
2291  return rb_funcallv_public(fallback, idAREF, 1, &c);
2292 }
2293 
2294 static void
2295 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2296  const unsigned char *in_stop, unsigned char *out_stop,
2297  VALUE destination,
2298  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2299  const char *src_encoding,
2300  const char *dst_encoding,
2301  int ecflags,
2302  VALUE ecopts)
2303 {
2304  rb_econv_t *ec;
2305  rb_transcoding *last_tc;
2306  rb_econv_result_t ret;
2307  unsigned char *out_start = *out_pos;
2308  int max_output;
2309  VALUE exc;
2310  VALUE fallback = Qnil;
2311  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2312 
2313  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2314  if (!ec)
2315  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2316 
2317  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2318  fallback = rb_hash_aref(ecopts, sym_fallback);
2319  if (RB_TYPE_P(fallback, T_HASH)) {
2320  fallback_func = hash_fallback;
2321  }
2322  else if (rb_obj_is_proc(fallback)) {
2323  fallback_func = proc_fallback;
2324  }
2325  else if (rb_obj_is_method(fallback)) {
2326  fallback_func = method_fallback;
2327  }
2328  else {
2329  fallback_func = aref_fallback;
2330  }
2331  }
2332  last_tc = ec->last_tc;
2333  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2334 
2335  resume:
2336  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2337 
2338  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2339  VALUE rep = rb_enc_str_new(
2340  (const char *)ec->last_error.error_bytes_start,
2341  ec->last_error.error_bytes_len,
2342  rb_enc_find(ec->last_error.source_encoding));
2343  rep = (*fallback_func)(fallback, rep);
2344  if (rep != Qundef && !NIL_P(rep)) {
2345  StringValue(rep);
2346  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2347  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2348  if ((int)ret == -1) {
2349  rb_raise(rb_eArgError, "too big fallback string");
2350  }
2351  goto resume;
2352  }
2353  }
2354 
2355  if (ret == econv_invalid_byte_sequence ||
2356  ret == econv_incomplete_input ||
2357  ret == econv_undefined_conversion) {
2358  exc = make_econv_exception(ec);
2359  rb_econv_close(ec);
2360  rb_exc_raise(exc);
2361  }
2362 
2363  if (ret == econv_destination_buffer_full) {
2364  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2365  goto resume;
2366  }
2367 
2368  rb_econv_close(ec);
2369  return;
2370 }
2371 #else
2372 /* sample transcode_loop implementation in byte-by-byte stream style */
2373 static void
2374 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2375  const unsigned char *in_stop, unsigned char *out_stop,
2376  VALUE destination,
2377  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2378  const char *src_encoding,
2379  const char *dst_encoding,
2380  int ecflags,
2381  VALUE ecopts)
2382 {
2383  rb_econv_t *ec;
2384  rb_transcoding *last_tc;
2385  rb_econv_result_t ret;
2386  unsigned char *out_start = *out_pos;
2387  const unsigned char *ptr;
2388  int max_output;
2389  VALUE exc;
2390 
2391  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2392  if (!ec)
2393  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2394 
2395  last_tc = ec->last_tc;
2396  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2397 
2399  ptr = *in_pos;
2400  while (ret != econv_finished) {
2401  unsigned char input_byte;
2402  const unsigned char *p = &input_byte;
2403 
2404  if (ret == econv_source_buffer_empty) {
2405  if (ptr < in_stop) {
2406  input_byte = *ptr;
2407  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2408  }
2409  else {
2410  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2411  }
2412  }
2413  else {
2414  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2415  }
2416  if (&input_byte != p)
2417  ptr += p - &input_byte;
2418  switch (ret) {
2422  exc = make_econv_exception(ec);
2423  rb_econv_close(ec);
2424  rb_exc_raise(exc);
2425  break;
2426 
2428  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2429  break;
2430 
2432  break;
2433 
2434  case econv_finished:
2435  break;
2436  }
2437  }
2438  rb_econv_close(ec);
2439  *in_pos = in_stop;
2440  return;
2441 }
2442 #endif
2443 
2444 
2445 /*
2446  * String-specific code
2447  */
2448 
2449 static unsigned char *
2450 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2451 {
2452  rb_str_resize(destination, new_len);
2453  return (unsigned char *)RSTRING_PTR(destination);
2454 }
2455 
2456 static int
2457 econv_opts(VALUE opt, int ecflags)
2458 {
2459  VALUE v;
2460  int newlineflag = 0;
2461 
2462  v = rb_hash_aref(opt, sym_invalid);
2463  if (NIL_P(v)) {
2464  }
2465  else if (v==sym_replace) {
2466  ecflags |= ECONV_INVALID_REPLACE;
2467  }
2468  else {
2469  rb_raise(rb_eArgError, "unknown value for invalid character option");
2470  }
2471 
2472  v = rb_hash_aref(opt, sym_undef);
2473  if (NIL_P(v)) {
2474  }
2475  else if (v==sym_replace) {
2476  ecflags |= ECONV_UNDEF_REPLACE;
2477  }
2478  else {
2479  rb_raise(rb_eArgError, "unknown value for undefined character option");
2480  }
2481 
2482  v = rb_hash_aref(opt, sym_replace);
2483  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2484  ecflags |= ECONV_UNDEF_REPLACE;
2485  }
2486 
2487  v = rb_hash_aref(opt, sym_xml);
2488  if (!NIL_P(v)) {
2489  if (v==sym_text) {
2491  }
2492  else if (v==sym_attr) {
2494  }
2495  else if (SYMBOL_P(v)) {
2496  rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2497  }
2498  else {
2499  rb_raise(rb_eArgError, "unexpected value for xml option");
2500  }
2501  }
2502 
2503 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2504  v = rb_hash_aref(opt, sym_newline);
2505  if (!NIL_P(v)) {
2506  newlineflag = 2;
2507  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2508  if (v == sym_universal) {
2510  }
2511  else if (v == sym_crlf) {
2512  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2513  }
2514  else if (v == sym_cr) {
2515  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2516  }
2517  else if (v == sym_lf) {
2518  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2519  }
2520  else if (SYMBOL_P(v)) {
2521  rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2522  rb_sym2str(v));
2523  }
2524  else {
2525  rb_raise(rb_eArgError, "unexpected value for newline option");
2526  }
2527  }
2528 #endif
2529  {
2530  int setflags = 0;
2531 
2532  v = rb_hash_aref(opt, sym_universal_newline);
2533  if (RTEST(v))
2535  newlineflag |= !NIL_P(v);
2536 
2537  v = rb_hash_aref(opt, sym_crlf_newline);
2538  if (RTEST(v))
2539  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2540  newlineflag |= !NIL_P(v);
2541 
2542  v = rb_hash_aref(opt, sym_cr_newline);
2543  if (RTEST(v))
2544  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2545  newlineflag |= !NIL_P(v);
2546 
2547  switch (newlineflag) {
2548  case 1:
2549  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2550  ecflags |= setflags;
2551  break;
2552 
2553  case 3:
2554  rb_warning(":newline option precedes other newline options");
2555  break;
2556  }
2557  }
2558 
2559  return ecflags;
2560 }
2561 
2562 int
2563 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2564 {
2565  VALUE newhash = Qnil;
2566  VALUE v;
2567 
2568  if (NIL_P(opthash)) {
2569  *opts = Qnil;
2570  return ecflags;
2571  }
2572  ecflags = econv_opts(opthash, ecflags);
2573 
2574  v = rb_hash_aref(opthash, sym_replace);
2575  if (!NIL_P(v)) {
2576  StringValue(v);
2578  VALUE dumped = rb_str_dump(v);
2579  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2580  StringValueCStr(dumped),
2581  rb_enc_name(rb_enc_get(v)));
2582  }
2583  v = rb_str_new_frozen(v);
2584  newhash = rb_hash_new();
2585  rb_hash_aset(newhash, sym_replace, v);
2586  }
2587 
2588  v = rb_hash_aref(opthash, sym_fallback);
2589  if (!NIL_P(v)) {
2590  VALUE h = rb_check_hash_type(v);
2591  if (NIL_P(h)
2592  ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2593  : (v = h, 1)) {
2594  if (NIL_P(newhash))
2595  newhash = rb_hash_new();
2596  rb_hash_aset(newhash, sym_fallback, v);
2597  }
2598  }
2599 
2600  if (!NIL_P(newhash))
2601  rb_hash_freeze(newhash);
2602  *opts = newhash;
2603 
2604  return ecflags;
2605 }
2606 
2607 int
2609 {
2610  return rb_econv_prepare_options(opthash, opts, 0);
2611 }
2612 
2613 rb_econv_t *
2614 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2615 {
2616  rb_econv_t *ec;
2617  VALUE replacement;
2618 
2619  if (NIL_P(opthash)) {
2620  replacement = Qnil;
2621  }
2622  else {
2623  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2624  rb_bug("rb_econv_open_opts called with invalid opthash");
2625  replacement = rb_hash_aref(opthash, sym_replace);
2626  }
2627 
2628  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2629  if (!ec)
2630  return ec;
2631 
2632  if (!NIL_P(replacement)) {
2633  int ret;
2634  rb_encoding *enc = rb_enc_get(replacement);
2635 
2636  ret = rb_econv_set_replacement(ec,
2637  (const unsigned char *)RSTRING_PTR(replacement),
2638  RSTRING_LEN(replacement),
2639  rb_enc_name(enc));
2640  if (ret == -1) {
2641  rb_econv_close(ec);
2642  return NULL;
2643  }
2644  }
2645  return ec;
2646 }
2647 
2648 static int
2649 enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2650 {
2651  rb_encoding *enc;
2652  const char *n;
2653  int encidx;
2654  VALUE encval;
2655 
2656  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2657  !(enc = rb_enc_from_index(encidx))) {
2658  enc = NULL;
2659  encidx = 0;
2660  n = StringValueCStr(*arg);
2661  }
2662  else {
2663  n = rb_enc_name(enc);
2664  }
2665 
2666  *name_p = n;
2667  *enc_p = enc;
2668 
2669  return encidx;
2670 }
2671 
2672 static int
2673 str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2674  const char **sname_p, rb_encoding **senc_p,
2675  const char **dname_p, rb_encoding **denc_p)
2676 {
2677  rb_encoding *senc, *denc;
2678  const char *sname, *dname;
2679  int sencidx, dencidx;
2680 
2681  dencidx = enc_arg(arg1, &dname, &denc);
2682 
2683  if (NIL_P(*arg2)) {
2684  sencidx = rb_enc_get_index(str);
2685  senc = rb_enc_from_index(sencidx);
2686  sname = rb_enc_name(senc);
2687  }
2688  else {
2689  sencidx = enc_arg(arg2, &sname, &senc);
2690  }
2691 
2692  *sname_p = sname;
2693  *senc_p = senc;
2694  *dname_p = dname;
2695  *denc_p = denc;
2696  return dencidx;
2697 }
2698 
2699 static int
2700 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2701 {
2702  VALUE dest;
2703  VALUE str = *self;
2704  VALUE arg1, arg2;
2705  long blen, slen;
2706  unsigned char *buf, *bp, *sp;
2707  const unsigned char *fromp;
2708  rb_encoding *senc, *denc;
2709  const char *sname, *dname;
2710  int dencidx;
2711  int explicitly_invalid_replace = TRUE;
2712 
2713  rb_check_arity(argc, 0, 2);
2714 
2715  if (argc == 0) {
2716  arg1 = rb_enc_default_internal();
2717  if (NIL_P(arg1)) {
2718  if (!ecflags) return -1;
2719  arg1 = rb_obj_encoding(str);
2720  }
2721  if (!(ecflags & ECONV_INVALID_MASK)) {
2722  explicitly_invalid_replace = FALSE;
2723  }
2725  }
2726  else {
2727  arg1 = argv[0];
2728  }
2729  arg2 = argc<=1 ? Qnil : argv[1];
2730  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2731 
2732  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2736  if (senc && senc == denc) {
2737  if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2738  VALUE rep = Qnil;
2739  if (!NIL_P(ecopts)) {
2740  rep = rb_hash_aref(ecopts, sym_replace);
2741  }
2742  dest = rb_enc_str_scrub(senc, str, rep);
2743  if (NIL_P(dest)) dest = str;
2744  *self = dest;
2745  return dencidx;
2746  }
2747  return NIL_P(arg2) ? -1 : dencidx;
2748  }
2749  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2751  return dencidx;
2752  }
2753  }
2754  if (encoding_equal(sname, dname)) {
2755  return NIL_P(arg2) ? -1 : dencidx;
2756  }
2757  }
2758  else {
2759  if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2760  rb_encoding *utf8 = rb_utf8_encoding();
2761  str = rb_str_conv_enc(str, senc, utf8);
2762  senc = utf8;
2763  sname = "UTF-8";
2764  }
2765  if (encoding_equal(sname, dname)) {
2766  sname = "";
2767  dname = "";
2768  }
2769  }
2770 
2771  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2772  slen = RSTRING_LEN(str);
2773  blen = slen + 30; /* len + margin */
2774  dest = rb_str_tmp_new(blen);
2775  bp = (unsigned char *)RSTRING_PTR(dest);
2776 
2777  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2778  if (fromp != sp+slen) {
2779  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2780  }
2781  buf = (unsigned char *)RSTRING_PTR(dest);
2782  *bp = '\0';
2783  rb_str_set_len(dest, bp - buf);
2784 
2785  /* set encoding */
2786  if (!denc) {
2787  dencidx = rb_define_dummy_encoding(dname);
2788  RB_GC_GUARD(arg1);
2789  RB_GC_GUARD(arg2);
2790  }
2791  *self = dest;
2792 
2793  return dencidx;
2794 }
2795 
2796 static int
2797 str_transcode(int argc, VALUE *argv, VALUE *self)
2798 {
2799  VALUE opt;
2800  int ecflags = 0;
2801  VALUE ecopts = Qnil;
2802 
2803  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2804  if (!NIL_P(opt)) {
2805  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2806  }
2807  return str_transcode0(argc, argv, self, ecflags, ecopts);
2808 }
2809 
2810 static inline VALUE
2811 str_encode_associate(VALUE str, int encidx)
2812 {
2813  int cr = 0;
2814 
2815  rb_enc_associate_index(str, encidx);
2816 
2817  /* transcoded string never be broken. */
2818  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2820  }
2821  else {
2822  cr = ENC_CODERANGE_VALID;
2823  }
2824  ENC_CODERANGE_SET(str, cr);
2825  return str;
2826 }
2827 
2828 /*
2829  * call-seq:
2830  * str.encode!(encoding, **options) -> str
2831  * str.encode!(dst_encoding, src_encoding, **options) -> str
2832  *
2833  * The first form transcodes the contents of <i>str</i> from
2834  * str.encoding to +encoding+.
2835  * The second form transcodes the contents of <i>str</i> from
2836  * src_encoding to dst_encoding.
2837  * The +options+ keyword arguments give details for conversion. See String#encode
2838  * for details.
2839  * Returns the string even if no changes were made.
2840  */
2841 
2842 static VALUE
2843 str_encode_bang(int argc, VALUE *argv, VALUE str)
2844 {
2845  VALUE newstr;
2846  int encidx;
2847 
2848  rb_check_frozen(str);
2849 
2850  newstr = str;
2851  encidx = str_transcode(argc, argv, &newstr);
2852 
2853  if (encidx < 0) return str;
2854  if (newstr == str) {
2855  rb_enc_associate_index(str, encidx);
2856  return str;
2857  }
2858  rb_str_shared_replace(str, newstr);
2859  return str_encode_associate(str, encidx);
2860 }
2861 
2862 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2863 
2864 /*
2865  * call-seq:
2866  * str.encode(encoding, **options) -> str
2867  * str.encode(dst_encoding, src_encoding, **options) -> str
2868  * str.encode(**options) -> str
2869  *
2870  * The first form returns a copy of +str+ transcoded
2871  * to encoding +encoding+.
2872  * The second form returns a copy of +str+ transcoded
2873  * from src_encoding to dst_encoding.
2874  * The last form returns a copy of +str+ transcoded to
2875  * <tt>Encoding.default_internal</tt>.
2876  *
2877  * By default, the first and second form raise
2878  * Encoding::UndefinedConversionError for characters that are
2879  * undefined in the destination encoding, and
2880  * Encoding::InvalidByteSequenceError for invalid byte sequences
2881  * in the source encoding. The last form by default does not raise
2882  * exceptions but uses replacement strings.
2883  *
2884  * The +options+ keyword arguments give details for conversion.
2885  * The arguments are:
2886  *
2887  * :invalid ::
2888  * If the value is +:replace+, #encode replaces invalid byte sequences in
2889  * +str+ with the replacement character. The default is to raise the
2890  * Encoding::InvalidByteSequenceError exception
2891  * :undef ::
2892  * If the value is +:replace+, #encode replaces characters which are
2893  * undefined in the destination encoding with the replacement character.
2894  * The default is to raise the Encoding::UndefinedConversionError.
2895  * :replace ::
2896  * Sets the replacement string to the given value. The default replacement
2897  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2898  * :fallback ::
2899  * Sets the replacement string by the given object for undefined
2900  * character. The object should be a Hash, a Proc, a Method, or an
2901  * object which has [] method.
2902  * Its key is an undefined character encoded in the source encoding
2903  * of current transcoder. Its value can be any encoding until it
2904  * can be converted into the destination encoding of the transcoder.
2905  * :xml ::
2906  * The value must be +:text+ or +:attr+.
2907  * If the value is +:text+ #encode replaces undefined characters with their
2908  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2909  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2910  * If the value is +:attr+, #encode also quotes the replacement result
2911  * (using '"'), and replaces '"' with "&quot;".
2912  * :cr_newline ::
2913  * Replaces LF ("\n") with CR ("\r") if value is true.
2914  * :crlf_newline ::
2915  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2916  * :universal_newline ::
2917  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2918  */
2919 
2920 static VALUE
2921 str_encode(int argc, VALUE *argv, VALUE str)
2922 {
2923  VALUE newstr = str;
2924  int encidx = str_transcode(argc, argv, &newstr);
2925  return encoded_dup(newstr, str, encidx);
2926 }
2927 
2928 VALUE
2929 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2930 {
2931  int argc = 1;
2932  VALUE *argv = &to;
2933  VALUE newstr = str;
2934  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2935  return encoded_dup(newstr, str, encidx);
2936 }
2937 
2938 static VALUE
2939 encoded_dup(VALUE newstr, VALUE str, int encidx)
2940 {
2941  if (encidx < 0) return rb_str_dup(str);
2942  if (newstr == str) {
2943  newstr = rb_str_dup(str);
2944  rb_enc_associate_index(newstr, encidx);
2945  return newstr;
2946  }
2947  else {
2948  RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2949  }
2950  return str_encode_associate(newstr, encidx);
2951 }
2952 
2953 /*
2954  * Document-class: Encoding::Converter
2955  *
2956  * Encoding conversion class.
2957  */
2958 static void
2959 econv_free(void *ptr)
2960 {
2961  rb_econv_t *ec = ptr;
2962  rb_econv_close(ec);
2963 }
2964 
2965 static size_t
2966 econv_memsize(const void *ptr)
2967 {
2968  return sizeof(rb_econv_t);
2969 }
2970 
2971 static const rb_data_type_t econv_data_type = {
2972  "econv",
2973  {0, econv_free, econv_memsize,},
2974  0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2975 };
2976 
2977 static VALUE
2978 econv_s_allocate(VALUE klass)
2979 {
2980  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2981 }
2982 
2983 static rb_encoding *
2984 make_dummy_encoding(const char *name)
2985 {
2986  rb_encoding *enc;
2987  int idx;
2988  idx = rb_define_dummy_encoding(name);
2989  enc = rb_enc_from_index(idx);
2990  return enc;
2991 }
2992 
2993 static rb_encoding *
2994 make_encoding(const char *name)
2995 {
2996  rb_encoding *enc;
2997  enc = rb_enc_find(name);
2998  if (!enc)
2999  enc = make_dummy_encoding(name);
3000  return enc;
3001 }
3002 
3003 static VALUE
3004 make_encobj(const char *name)
3005 {
3006  return rb_enc_from_encoding(make_encoding(name));
3007 }
3008 
3009 /*
3010  * call-seq:
3011  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3012  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3013  *
3014  * Returns the corresponding ASCII compatible encoding.
3015  *
3016  * Returns nil if the argument is an ASCII compatible encoding.
3017  *
3018  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3019  * can represent exactly the same characters as the given ASCII incompatible encoding.
3020  * So, no conversion undefined error occurs when converting between the two encodings.
3021  *
3022  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3023  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3024  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3025  *
3026  */
3027 static VALUE
3028 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3029 {
3030  const char *arg_name, *result_name;
3031  rb_encoding *arg_enc, *result_enc;
3032 
3033  enc_arg(&arg, &arg_name, &arg_enc);
3034 
3035  result_name = rb_econv_asciicompat_encoding(arg_name);
3036 
3037  if (result_name == NULL)
3038  return Qnil;
3039 
3040  result_enc = make_encoding(result_name);
3041 
3042  return rb_enc_from_encoding(result_enc);
3043 }
3044 
3045 static void
3046 econv_args(int argc, VALUE *argv,
3047  VALUE *snamev_p, VALUE *dnamev_p,
3048  const char **sname_p, const char **dname_p,
3049  rb_encoding **senc_p, rb_encoding **denc_p,
3050  int *ecflags_p,
3051  VALUE *ecopts_p)
3052 {
3053  VALUE opt, flags_v, ecopts;
3054  int sidx, didx;
3055  const char *sname, *dname;
3056  rb_encoding *senc, *denc;
3057  int ecflags;
3058 
3059  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3060 
3061  if (!NIL_P(flags_v)) {
3062  if (!NIL_P(opt)) {
3063  rb_error_arity(argc + 1, 2, 3);
3064  }
3065  ecflags = NUM2INT(rb_to_int(flags_v));
3066  ecopts = Qnil;
3067  }
3068  else if (!NIL_P(opt)) {
3069  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3070  }
3071  else {
3072  ecflags = 0;
3073  ecopts = Qnil;
3074  }
3075 
3076  senc = NULL;
3077  sidx = rb_to_encoding_index(*snamev_p);
3078  if (0 <= sidx) {
3079  senc = rb_enc_from_index(sidx);
3080  }
3081  else {
3082  StringValue(*snamev_p);
3083  }
3084 
3085  denc = NULL;
3086  didx = rb_to_encoding_index(*dnamev_p);
3087  if (0 <= didx) {
3088  denc = rb_enc_from_index(didx);
3089  }
3090  else {
3091  StringValue(*dnamev_p);
3092  }
3093 
3094  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3095  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3096 
3097  *sname_p = sname;
3098  *dname_p = dname;
3099  *senc_p = senc;
3100  *denc_p = denc;
3101  *ecflags_p = ecflags;
3102  *ecopts_p = ecopts;
3103 }
3104 
3105 static int
3106 decorate_convpath(VALUE convpath, int ecflags)
3107 {
3108  int num_decorators;
3109  const char *decorators[MAX_ECFLAGS_DECORATORS];
3110  int i;
3111  int n, len;
3112 
3113  num_decorators = decorator_names(ecflags, decorators);
3114  if (num_decorators == -1)
3115  return -1;
3116 
3117  len = n = RARRAY_LENINT(convpath);
3118  if (n != 0) {
3119  VALUE pair = RARRAY_AREF(convpath, n-1);
3120  if (RB_TYPE_P(pair, T_ARRAY)) {
3121  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3122  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3123  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3124  const rb_transcoder *tr = load_transcoder_entry(entry);
3125  if (!tr)
3126  return -1;
3127  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3128  tr->asciicompat_type == asciicompat_encoder) {
3129  n--;
3130  rb_ary_store(convpath, len + num_decorators - 1, pair);
3131  }
3132  }
3133  else {
3134  rb_ary_store(convpath, len + num_decorators - 1, pair);
3135  }
3136  }
3137 
3138  for (i = 0; i < num_decorators; i++)
3139  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3140 
3141  return 0;
3142 }
3143 
3144 static void
3145 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3146 {
3147  VALUE *ary_p = arg;
3148  VALUE v;
3149 
3150  if (NIL_P(*ary_p)) {
3151  *ary_p = rb_ary_new();
3152  }
3153 
3154  if (DECORATOR_P(sname, dname)) {
3155  v = rb_str_new_cstr(dname);
3156  }
3157  else {
3158  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3159  }
3160  rb_ary_store(*ary_p, depth, v);
3161 }
3162 
3163 /*
3164  * call-seq:
3165  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3166  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3167  *
3168  * Returns a conversion path.
3169  *
3170  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3171  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3172  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3173  *
3174  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3175  * or
3176  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3177  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3178  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3179  * # "universal_newline"]
3180  *
3181  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3182  * or
3183  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3184  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3185  * # "universal_newline",
3186  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3187  */
3188 static VALUE
3189 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3190 {
3191  VALUE snamev, dnamev;
3192  const char *sname, *dname;
3193  rb_encoding *senc, *denc;
3194  int ecflags;
3195  VALUE ecopts;
3196  VALUE convpath;
3197 
3198  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3199 
3200  convpath = Qnil;
3201  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3202 
3203  if (NIL_P(convpath)) {
3204  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3205  RB_GC_GUARD(snamev);
3206  RB_GC_GUARD(dnamev);
3207  rb_exc_raise(exc);
3208  }
3209 
3210  if (decorate_convpath(convpath, ecflags) == -1) {
3211  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3212  RB_GC_GUARD(snamev);
3213  RB_GC_GUARD(dnamev);
3214  rb_exc_raise(exc);
3215  }
3216 
3217  return convpath;
3218 }
3219 
3220 /*
3221  * Check the existence of a conversion path.
3222  * Returns the number of converters in the conversion path.
3223  * result: >=0:success -1:failure
3224  */
3225 int
3226 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3227 {
3228  VALUE convpath = Qnil;
3229  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3230  &convpath);
3231  return RTEST(convpath);
3232 }
3233 
3235  rb_econv_t *ec;
3236  int index;
3237  int ret;
3238 };
3239 
3240 static void
3241 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3242 {
3243  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3244  int ret;
3245 
3246  if (a->ret == -1)
3247  return;
3248 
3249  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3250 
3251  a->ret = ret;
3252  return;
3253 }
3254 
3255 static rb_econv_t *
3256 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3257  const char **sname_p, const char **dname_p,
3258  rb_encoding **senc_p, rb_encoding**denc_p)
3259 {
3260  rb_econv_t *ec;
3261  long i;
3262  int ret, first=1;
3263  VALUE elt;
3264  rb_encoding *senc = 0, *denc = 0;
3265  const char *sname, *dname;
3266 
3267  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3268  DATA_PTR(self) = ec;
3269 
3270  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3271  VALUE snamev, dnamev;
3272  VALUE pair;
3273  elt = rb_ary_entry(convpath, i);
3274  if (!NIL_P(pair = rb_check_array_type(elt))) {
3275  if (RARRAY_LEN(pair) != 2)
3276  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3277  snamev = rb_ary_entry(pair, 0);
3278  enc_arg(&snamev, &sname, &senc);
3279  dnamev = rb_ary_entry(pair, 1);
3280  enc_arg(&dnamev, &dname, &denc);
3281  }
3282  else {
3283  sname = "";
3284  dname = StringValueCStr(elt);
3285  }
3286  if (DECORATOR_P(sname, dname)) {
3287  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3288  if (ret == -1) {
3289  VALUE msg = rb_sprintf("decoration failed: %s", dname);
3290  RB_GC_GUARD(snamev);
3291  RB_GC_GUARD(dnamev);
3293  }
3294  }
3295  else {
3296  int j = ec->num_trans;
3297  struct rb_econv_init_by_convpath_t arg;
3298  arg.ec = ec;
3299  arg.index = ec->num_trans;
3300  arg.ret = 0;
3301  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3302  if (ret == -1 || arg.ret == -1) {
3303  VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3304  RB_GC_GUARD(snamev);
3305  RB_GC_GUARD(dnamev);
3307  }
3308  if (first) {
3309  first = 0;
3310  *senc_p = senc;
3311  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3312  }
3313  *denc_p = denc;
3314  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3315  }
3316  }
3317 
3318  if (first) {
3319  *senc_p = NULL;
3320  *denc_p = NULL;
3321  *sname_p = "";
3322  *dname_p = "";
3323  }
3324 
3325  ec->source_encoding_name = *sname_p;
3326  ec->destination_encoding_name = *dname_p;
3327 
3328  return ec;
3329 }
3330 
3331 /*
3332  * call-seq:
3333  * Encoding::Converter.new(source_encoding, destination_encoding)
3334  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3335  * Encoding::Converter.new(convpath)
3336  *
3337  * possible options elements:
3338  * hash form:
3339  * :invalid => nil # raise error on invalid byte sequence (default)
3340  * :invalid => :replace # replace invalid byte sequence
3341  * :undef => nil # raise error on undefined conversion (default)
3342  * :undef => :replace # replace undefined conversion
3343  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3344  * :newline => :universal # decorator for converting CRLF and CR to LF
3345  * :newline => :crlf # decorator for converting LF to CRLF
3346  * :newline => :cr # decorator for converting LF to CR
3347  * :universal_newline => true # decorator for converting CRLF and CR to LF
3348  * :crlf_newline => true # decorator for converting LF to CRLF
3349  * :cr_newline => true # decorator for converting LF to CR
3350  * :xml => :text # escape as XML CharData.
3351  * :xml => :attr # escape as XML AttValue
3352  * integer form:
3353  * Encoding::Converter::INVALID_REPLACE
3354  * Encoding::Converter::UNDEF_REPLACE
3355  * Encoding::Converter::UNDEF_HEX_CHARREF
3356  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3357  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3358  * Encoding::Converter::CR_NEWLINE_DECORATOR
3359  * Encoding::Converter::XML_TEXT_DECORATOR
3360  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3361  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3362  *
3363  * Encoding::Converter.new creates an instance of Encoding::Converter.
3364  *
3365  * Source_encoding and destination_encoding should be a string or
3366  * Encoding object.
3367  *
3368  * opt should be nil, a hash or an integer.
3369  *
3370  * convpath should be an array.
3371  * convpath may contain
3372  * - two-element arrays which contain encodings or encoding names, or
3373  * - strings representing decorator names.
3374  *
3375  * Encoding::Converter.new optionally takes an option.
3376  * The option should be a hash or an integer.
3377  * The option hash can contain :invalid => nil, etc.
3378  * The option integer should be logical-or of constants such as
3379  * Encoding::Converter::INVALID_REPLACE, etc.
3380  *
3381  * [:invalid => nil]
3382  * Raise error on invalid byte sequence. This is a default behavior.
3383  * [:invalid => :replace]
3384  * Replace invalid byte sequence by replacement string.
3385  * [:undef => nil]
3386  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3387  * This is a default behavior.
3388  * [:undef => :replace]
3389  * Replace undefined character in destination_encoding with replacement string.
3390  * [:replace => string]
3391  * Specify the replacement string.
3392  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3393  * [:universal_newline => true]
3394  * Convert CRLF and CR to LF.
3395  * [:crlf_newline => true]
3396  * Convert LF to CRLF.
3397  * [:cr_newline => true]
3398  * Convert LF to CR.
3399  * [:xml => :text]
3400  * Escape as XML CharData.
3401  * This form can be used as an HTML 4.0 #PCDATA.
3402  * - '&' -> '&amp;'
3403  * - '<' -> '&lt;'
3404  * - '>' -> '&gt;'
3405  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3406  * [:xml => :attr]
3407  * Escape as XML AttValue.
3408  * The converted result is quoted as "...".
3409  * This form can be used as an HTML 4.0 attribute value.
3410  * - '&' -> '&amp;'
3411  * - '<' -> '&lt;'
3412  * - '>' -> '&gt;'
3413  * - '"' -> '&quot;'
3414  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3415  *
3416  * Examples:
3417  * # UTF-16BE to UTF-8
3418  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3419  *
3420  * # Usually, decorators such as newline conversion are inserted last.
3421  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3422  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3423  * # "universal_newline"]
3424  *
3425  * # But, if the last encoding is ASCII incompatible,
3426  * # decorators are inserted before the last conversion.
3427  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3428  * p ec.convpath #=> ["crlf_newline",
3429  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3430  *
3431  * # Conversion path can be specified directly.
3432  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3433  * p ec.convpath #=> ["universal_newline",
3434  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3435  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3436  */
3437 static VALUE
3438 econv_init(int argc, VALUE *argv, VALUE self)
3439 {
3440  VALUE ecopts;
3441  VALUE snamev, dnamev;
3442  const char *sname, *dname;
3443  rb_encoding *senc, *denc;
3444  rb_econv_t *ec;
3445  int ecflags;
3446  VALUE convpath;
3447 
3448  if (rb_check_typeddata(self, &econv_data_type)) {
3449  rb_raise(rb_eTypeError, "already initialized");
3450  }
3451 
3452  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3453  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3454  ecflags = 0;
3455  ecopts = Qnil;
3456  }
3457  else {
3458  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3459  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3460  }
3461 
3462  if (!ec) {
3463  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3464  RB_GC_GUARD(snamev);
3465  RB_GC_GUARD(dnamev);
3466  rb_exc_raise(exc);
3467  }
3468 
3469  if (!DECORATOR_P(sname, dname)) {
3470  if (!senc)
3471  senc = make_dummy_encoding(sname);
3472  if (!denc)
3473  denc = make_dummy_encoding(dname);
3474  RB_GC_GUARD(snamev);
3475  RB_GC_GUARD(dnamev);
3476  }
3477 
3478  ec->source_encoding = senc;
3479  ec->destination_encoding = denc;
3480 
3481  DATA_PTR(self) = ec;
3482 
3483  return self;
3484 }
3485 
3486 /*
3487  * call-seq:
3488  * ec.inspect -> string
3489  *
3490  * Returns a printable version of <i>ec</i>
3491  *
3492  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3493  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3494  *
3495  */
3496 static VALUE
3497 econv_inspect(VALUE self)
3498 {
3499  const char *cname = rb_obj_classname(self);
3500  rb_econv_t *ec;
3501 
3502  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3503  if (!ec)
3504  return rb_sprintf("#<%s: uninitialized>", cname);
3505  else {
3506  const char *sname = ec->source_encoding_name;
3507  const char *dname = ec->destination_encoding_name;
3508  VALUE str;
3509  str = rb_sprintf("#<%s: ", cname);
3510  econv_description(sname, dname, ec->flags, str);
3511  rb_str_cat2(str, ">");
3512  return str;
3513  }
3514 }
3515 
3516 static rb_econv_t *
3517 check_econv(VALUE self)
3518 {
3519  rb_econv_t *ec;
3520 
3521  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3522  if (!ec) {
3523  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3524  }
3525  return ec;
3526 }
3527 
3528 /*
3529  * call-seq:
3530  * ec.source_encoding -> encoding
3531  *
3532  * Returns the source encoding as an Encoding object.
3533  */
3534 static VALUE
3535 econv_source_encoding(VALUE self)
3536 {
3537  rb_econv_t *ec = check_econv(self);
3538  if (!ec->source_encoding)
3539  return Qnil;
3540  return rb_enc_from_encoding(ec->source_encoding);
3541 }
3542 
3543 /*
3544  * call-seq:
3545  * ec.destination_encoding -> encoding
3546  *
3547  * Returns the destination encoding as an Encoding object.
3548  */
3549 static VALUE
3550 econv_destination_encoding(VALUE self)
3551 {
3552  rb_econv_t *ec = check_econv(self);
3553  if (!ec->destination_encoding)
3554  return Qnil;
3555  return rb_enc_from_encoding(ec->destination_encoding);
3556 }
3557 
3558 /*
3559  * call-seq:
3560  * ec.convpath -> ary
3561  *
3562  * Returns the conversion path of ec.
3563  *
3564  * The result is an array of conversions.
3565  *
3566  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3567  * p ec.convpath
3568  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3569  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3570  * # "crlf_newline"]
3571  *
3572  * Each element of the array is a pair of encodings or a string.
3573  * A pair means an encoding conversion.
3574  * A string means a decorator.
3575  *
3576  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3577  * a converter from ISO-8859-1 to UTF-8.
3578  * "crlf_newline" means newline converter from LF to CRLF.
3579  */
3580 static VALUE
3581 econv_convpath(VALUE self)
3582 {
3583  rb_econv_t *ec = check_econv(self);
3584  VALUE result;
3585  int i;
3586 
3587  result = rb_ary_new();
3588  for (i = 0; i < ec->num_trans; i++) {
3589  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3590  VALUE v;
3591  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3592  v = rb_str_new_cstr(tr->dst_encoding);
3593  else
3594  v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3595  rb_ary_push(result, v);
3596  }
3597  return result;
3598 }
3599 
3600 /*
3601  * call-seq:
3602  * ec == other -> true or false
3603  */
3604 static VALUE
3605 econv_equal(VALUE self, VALUE other)
3606 {
3607  rb_econv_t *ec1 = check_econv(self);
3608  rb_econv_t *ec2;
3609  int i;
3610 
3611  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3612  return Qnil;
3613  }
3614  ec2 = DATA_PTR(other);
3615  if (!ec2) return Qfalse;
3616  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3617  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3618  return Qfalse;
3619  if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3620  strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3621  return Qfalse;
3622  if (ec1->flags != ec2->flags) return Qfalse;
3623  if (ec1->replacement_enc != ec2->replacement_enc &&
3624  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3625  return Qfalse;
3626  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3627  if (ec1->replacement_str != ec2->replacement_str &&
3628  memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3629  return Qfalse;
3630 
3631  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3632  for (i = 0; i < ec1->num_trans; i++) {
3633  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3634  return Qfalse;
3635  }
3636  return Qtrue;
3637 }
3638 
3639 static VALUE
3640 econv_result_to_symbol(rb_econv_result_t res)
3641 {
3642  switch (res) {
3643  case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3644  case econv_incomplete_input: return sym_incomplete_input;
3645  case econv_undefined_conversion: return sym_undefined_conversion;
3646  case econv_destination_buffer_full: return sym_destination_buffer_full;
3647  case econv_source_buffer_empty: return sym_source_buffer_empty;
3648  case econv_finished: return sym_finished;
3649  case econv_after_output: return sym_after_output;
3650  default: return INT2NUM(res); /* should not be reached */
3651  }
3652 }
3653 
3654 /*
3655  * call-seq:
3656  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3657  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3658  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3659  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3660  *
3661  * possible opt elements:
3662  * hash form:
3663  * :partial_input => true # source buffer may be part of larger source
3664  * :after_output => true # stop conversion after output before input
3665  * integer form:
3666  * Encoding::Converter::PARTIAL_INPUT
3667  * Encoding::Converter::AFTER_OUTPUT
3668  *
3669  * possible results:
3670  * :invalid_byte_sequence
3671  * :incomplete_input
3672  * :undefined_conversion
3673  * :after_output
3674  * :destination_buffer_full
3675  * :source_buffer_empty
3676  * :finished
3677  *
3678  * primitive_convert converts source_buffer into destination_buffer.
3679  *
3680  * source_buffer should be a string or nil.
3681  * nil means an empty string.
3682  *
3683  * destination_buffer should be a string.
3684  *
3685  * destination_byteoffset should be an integer or nil.
3686  * nil means the end of destination_buffer.
3687  * If it is omitted, nil is assumed.
3688  *
3689  * destination_bytesize should be an integer or nil.
3690  * nil means unlimited.
3691  * If it is omitted, nil is assumed.
3692  *
3693  * opt should be nil, a hash or an integer.
3694  * nil means no flags.
3695  * If it is omitted, nil is assumed.
3696  *
3697  * primitive_convert converts the content of source_buffer from the beginning
3698  * and stores the result into destination_buffer.
3699  *
3700  * destination_byteoffset and destination_bytesize specify the region in which
3701  * the converted result is stored.
3702  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3703  * If destination_byteoffset is nil,
3704  * destination_buffer.bytesize is used for appending the result.
3705  * destination_bytesize specifies maximum number of bytes.
3706  * If destination_bytesize is nil,
3707  * destination size is unlimited.
3708  * After conversion, destination_buffer is resized to
3709  * destination_byteoffset + actually produced number of bytes.
3710  * Also destination_buffer's encoding is set to destination_encoding.
3711  *
3712  * primitive_convert drops the converted part of source_buffer.
3713  * the dropped part is converted in destination_buffer or
3714  * buffered in Encoding::Converter object.
3715  *
3716  * primitive_convert stops conversion when one of the following conditions is met.
3717  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3718  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3719  * - unexpected end of source buffer (:incomplete_input)
3720  * this occurs only when :partial_input is not specified.
3721  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3722  * - character not representable in output encoding (:undefined_conversion)
3723  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3724  * - after some output is generated, before input is done (:after_output)
3725  * this occurs only when :after_output is specified.
3726  * - destination buffer is full (:destination_buffer_full)
3727  * this occurs only when destination_bytesize is non-nil.
3728  * - source buffer is empty (:source_buffer_empty)
3729  * this occurs only when :partial_input is specified.
3730  * - conversion is finished (:finished)
3731  *
3732  * example:
3733  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3734  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3735  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3736  *
3737  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3738  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3739  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3740  * ret = ec.primitive_convert(src, dst="", nil, 1)
3741  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3742  * ret = ec.primitive_convert(src, dst="", nil, 1)
3743  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3744  * ret = ec.primitive_convert(src, dst="", nil, 1)
3745  * p [ret, src, dst] #=> [:finished, "", "i"]
3746  *
3747  */
3748 static VALUE
3749 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3750 {
3751  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3752  rb_econv_t *ec = check_econv(self);
3753  rb_econv_result_t res;
3754  const unsigned char *ip, *is;
3755  unsigned char *op, *os;
3756  long output_byteoffset, output_bytesize;
3757  unsigned long output_byteend;
3758  int flags;
3759 
3760  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3761 
3762  if (NIL_P(output_byteoffset_v))
3763  output_byteoffset = 0; /* dummy */
3764  else
3765  output_byteoffset = NUM2LONG(output_byteoffset_v);
3766 
3767  if (NIL_P(output_bytesize_v))
3768  output_bytesize = 0; /* dummy */
3769  else
3770  output_bytesize = NUM2LONG(output_bytesize_v);
3771 
3772  if (!NIL_P(flags_v)) {
3773  if (!NIL_P(opt)) {
3774  rb_error_arity(argc + 1, 2, 5);
3775  }
3776  flags = NUM2INT(rb_to_int(flags_v));
3777  }
3778  else if (!NIL_P(opt)) {
3779  VALUE v;
3780  flags = 0;
3781  v = rb_hash_aref(opt, sym_partial_input);
3782  if (RTEST(v))
3783  flags |= ECONV_PARTIAL_INPUT;
3784  v = rb_hash_aref(opt, sym_after_output);
3785  if (RTEST(v))
3786  flags |= ECONV_AFTER_OUTPUT;
3787  }
3788  else {
3789  flags = 0;
3790  }
3791 
3792  StringValue(output);
3793  if (!NIL_P(input))
3794  StringValue(input);
3795  rb_str_modify(output);
3796 
3797  if (NIL_P(output_bytesize_v)) {
3798 #if USE_RVARGC
3799  output_bytesize = rb_str_capacity(output);
3800 #else
3801  output_bytesize = RSTRING_EMBED_LEN_MAX;
3802 #endif
3803  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3804  output_bytesize = RSTRING_LEN(input);
3805  }
3806 
3807  retry:
3808 
3809  if (NIL_P(output_byteoffset_v))
3810  output_byteoffset = RSTRING_LEN(output);
3811 
3812  if (output_byteoffset < 0)
3813  rb_raise(rb_eArgError, "negative output_byteoffset");
3814 
3815  if (RSTRING_LEN(output) < output_byteoffset)
3816  rb_raise(rb_eArgError, "output_byteoffset too big");
3817 
3818  if (output_bytesize < 0)
3819  rb_raise(rb_eArgError, "negative output_bytesize");
3820 
3821  output_byteend = (unsigned long)output_byteoffset +
3822  (unsigned long)output_bytesize;
3823 
3824  if (output_byteend < (unsigned long)output_byteoffset ||
3825  LONG_MAX < output_byteend)
3826  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3827 
3828  if (rb_str_capacity(output) < output_byteend)
3829  rb_str_resize(output, output_byteend);
3830 
3831  if (NIL_P(input)) {
3832  ip = is = NULL;
3833  }
3834  else {
3835  ip = (const unsigned char *)RSTRING_PTR(input);
3836  is = ip + RSTRING_LEN(input);
3837  }
3838 
3839  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3840  os = op + output_bytesize;
3841 
3842  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3843  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3844  if (!NIL_P(input)) {
3845  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3846  }
3847 
3848  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3849  if (LONG_MAX / 2 < output_bytesize)
3850  rb_raise(rb_eArgError, "too long conversion result");
3851  output_bytesize *= 2;
3852  output_byteoffset_v = Qnil;
3853  goto retry;
3854  }
3855 
3856  if (ec->destination_encoding) {
3857  rb_enc_associate(output, ec->destination_encoding);
3858  }
3859 
3860  return econv_result_to_symbol(res);
3861 }
3862 
3863 /*
3864  * call-seq:
3865  * ec.convert(source_string) -> destination_string
3866  *
3867  * Convert source_string and return destination_string.
3868  *
3869  * source_string is assumed as a part of source.
3870  * i.e. :partial_input=>true is specified internally.
3871  * finish method should be used last.
3872  *
3873  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3874  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3875  * puts ec.finish.dump #=> ""
3876  *
3877  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3878  * puts ec.convert("\xA4").dump #=> ""
3879  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3880  * puts ec.finish.dump #=> ""
3881  *
3882  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3883  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3884  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3885  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3886  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3887  *
3888  * If a conversion error occurs,
3889  * Encoding::UndefinedConversionError or
3890  * Encoding::InvalidByteSequenceError is raised.
3891  * Encoding::Converter#convert doesn't supply methods to recover or restart
3892  * from these exceptions.
3893  * When you want to handle these conversion errors,
3894  * use Encoding::Converter#primitive_convert.
3895  *
3896  */
3897 static VALUE
3898 econv_convert(VALUE self, VALUE source_string)
3899 {
3900  VALUE ret, dst;
3901  VALUE av[5];
3902  int ac;
3903  rb_econv_t *ec = check_econv(self);
3904 
3905  StringValue(source_string);
3906 
3907  dst = rb_str_new(NULL, 0);
3908 
3909  av[0] = rb_str_dup(source_string);
3910  av[1] = dst;
3911  av[2] = Qnil;
3912  av[3] = Qnil;
3913  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3914  ac = 5;
3915 
3916  ret = econv_primitive_convert(ac, av, self);
3917 
3918  if (ret == sym_invalid_byte_sequence ||
3919  ret == sym_undefined_conversion ||
3920  ret == sym_incomplete_input) {
3921  VALUE exc = make_econv_exception(ec);
3922  rb_exc_raise(exc);
3923  }
3924 
3925  if (ret == sym_finished) {
3926  rb_raise(rb_eArgError, "converter already finished");
3927  }
3928 
3929  if (ret != sym_source_buffer_empty) {
3930  rb_bug("unexpected result of econv_primitive_convert");
3931  }
3932 
3933  return dst;
3934 }
3935 
3936 /*
3937  * call-seq:
3938  * ec.finish -> string
3939  *
3940  * Finishes the converter.
3941  * It returns the last part of the converted string.
3942  *
3943  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3944  * p ec.convert("\u3042") #=> "\e$B$\""
3945  * p ec.finish #=> "\e(B"
3946  */
3947 static VALUE
3948 econv_finish(VALUE self)
3949 {
3950  VALUE ret, dst;
3951  VALUE av[5];
3952  int ac;
3953  rb_econv_t *ec = check_econv(self);
3954 
3955  dst = rb_str_new(NULL, 0);
3956 
3957  av[0] = Qnil;
3958  av[1] = dst;
3959  av[2] = Qnil;
3960  av[3] = Qnil;
3961  av[4] = INT2FIX(0);
3962  ac = 5;
3963 
3964  ret = econv_primitive_convert(ac, av, self);
3965 
3966  if (ret == sym_invalid_byte_sequence ||
3967  ret == sym_undefined_conversion ||
3968  ret == sym_incomplete_input) {
3969  VALUE exc = make_econv_exception(ec);
3970  rb_exc_raise(exc);
3971  }
3972 
3973  if (ret != sym_finished) {
3974  rb_bug("unexpected result of econv_primitive_convert");
3975  }
3976 
3977  return dst;
3978 }
3979 
3980 /*
3981  * call-seq:
3982  * ec.primitive_errinfo -> array
3983  *
3984  * primitive_errinfo returns important information regarding the last error
3985  * as a 5-element array:
3986  *
3987  * [result, enc1, enc2, error_bytes, readagain_bytes]
3988  *
3989  * result is the last result of primitive_convert.
3990  *
3991  * Other elements are only meaningful when result is
3992  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3993  *
3994  * enc1 and enc2 indicate a conversion step as a pair of strings.
3995  * For example, a converter from EUC-JP to ISO-8859-1 converts
3996  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3997  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3998  *
3999  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4000  * error_bytes is discarded portion.
4001  * readagain_bytes is buffered portion which is read again on next conversion.
4002  *
4003  * Example:
4004  *
4005  * # \xff is invalid as EUC-JP.
4006  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4007  * ec.primitive_convert(src="\xff", dst="", nil, 10)
4008  * p ec.primitive_errinfo
4009  * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4010  *
4011  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
4012  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4013  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4014  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4015  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4016  * p ec.primitive_errinfo
4017  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4018  *
4019  * # partial character is invalid
4020  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4021  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4022  * p ec.primitive_errinfo
4023  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4024  *
4025  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4026  * # partial characters.
4027  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4028  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4029  * p ec.primitive_errinfo
4030  * #=> [:source_buffer_empty, nil, nil, nil, nil]
4031  *
4032  * # \xd8\x00\x00@ is invalid as UTF-16BE because
4033  * # no low surrogate after high surrogate (\xd8\x00).
4034  * # It is detected by the 3rd byte (\x00) which is part of the next character.
4035  * # So the high surrogate (\xd8\x00) is discarded and
4036  * # the 3rd byte is read again later.
4037  * # Since the byte is buffered in ec, it is dropped from src.
4038  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4039  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4040  * p ec.primitive_errinfo
4041  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4042  * p src
4043  * #=> "@"
4044  *
4045  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4046  * # The problem is detected by 4th byte.
4047  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4048  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4049  * p ec.primitive_errinfo
4050  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4051  * p src
4052  * #=> ""
4053  *
4054  */
4055 static VALUE
4056 econv_primitive_errinfo(VALUE self)
4057 {
4058  rb_econv_t *ec = check_econv(self);
4059 
4060  VALUE ary;
4061 
4062  ary = rb_ary_new2(5);
4063 
4064  rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4065  rb_ary_store(ary, 4, Qnil);
4066 
4067  if (ec->last_error.source_encoding)
4068  rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4069 
4070  if (ec->last_error.destination_encoding)
4071  rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4072 
4073  if (ec->last_error.error_bytes_start) {
4074  rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4075  rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4076  }
4077 
4078  return ary;
4079 }
4080 
4081 /*
4082  * call-seq:
4083  * ec.insert_output(string) -> nil
4084  *
4085  * Inserts string into the encoding converter.
4086  * The string will be converted to the destination encoding and
4087  * output on later conversions.
4088  *
4089  * If the destination encoding is stateful,
4090  * string is converted according to the state and the state is updated.
4091  *
4092  * This method should be used only when a conversion error occurs.
4093  *
4094  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4095  * src = "HIRAGANA LETTER A is \u{3042}."
4096  * dst = ""
4097  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4098  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4099  * ec.insert_output("<err>")
4100  * p ec.primitive_convert(src, dst) #=> :finished
4101  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4102  *
4103  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4104  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4105  * dst = ""
4106  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4107  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4108  * ec.insert_output "?" # state change required to output "?".
4109  * p ec.primitive_convert(src, dst) #=> :finished
4110  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4111  *
4112  */
4113 static VALUE
4114 econv_insert_output(VALUE self, VALUE string)
4115 {
4116  const char *insert_enc;
4117 
4118  int ret;
4119 
4120  rb_econv_t *ec = check_econv(self);
4121 
4122  StringValue(string);
4123  insert_enc = rb_econv_encoding_to_insert_output(ec);
4124  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4125 
4126  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4127  if (ret == -1) {
4128  rb_raise(rb_eArgError, "too big string");
4129  }
4130 
4131  return Qnil;
4132 }
4133 
4134 /*
4135  * call-seq:
4136  * ec.putback -> string
4137  * ec.putback(max_numbytes) -> string
4138  *
4139  * Put back the bytes which will be converted.
4140  *
4141  * Such bytes are left over by an invalid_byte_sequence error.
4142  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4143  * some bytes are buffered to be converted later.
4144  * The latter bytes can be put back.
4145  * It can be observed by
4146  * Encoding::InvalidByteSequenceError#readagain_bytes and
4147  * Encoding::Converter#primitive_errinfo.
4148  *
4149  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4150  * src = "\x00\xd8\x61\x00"
4151  * dst = ""
4152  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4153  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4154  * p ec.putback #=> "a\x00"
4155  * p ec.putback #=> "" # no more bytes to put back
4156  *
4157  */
4158 static VALUE
4159 econv_putback(int argc, VALUE *argv, VALUE self)
4160 {
4161  rb_econv_t *ec = check_econv(self);
4162  int n;
4163  int putbackable;
4164  VALUE str, max;
4165 
4166  if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4167  n = rb_econv_putbackable(ec);
4168  }
4169  else {
4170  n = NUM2INT(max);
4171  putbackable = rb_econv_putbackable(ec);
4172  if (putbackable < n)
4173  n = putbackable;
4174  }
4175 
4176  str = rb_str_new(NULL, n);
4177  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4178 
4179  if (ec->source_encoding) {
4180  rb_enc_associate(str, ec->source_encoding);
4181  }
4182 
4183  return str;
4184 }
4185 
4186 /*
4187  * call-seq:
4188  * ec.last_error -> exception or nil
4189  *
4190  * Returns an exception object for the last conversion.
4191  * Returns nil if the last conversion did not produce an error.
4192  *
4193  * Here "error" means
4194  * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
4195  * Encoding::Converter#convert, and
4196  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
4197  * Encoding::Converter#primitive_convert.
4198  *
4199  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4200  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4201  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4202  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4203  * p ec.last_error #=> nil
4204  *
4205  */
4206 static VALUE
4207 econv_last_error(VALUE self)
4208 {
4209  rb_econv_t *ec = check_econv(self);
4210  VALUE exc;
4211 
4212  exc = make_econv_exception(ec);
4213  if (NIL_P(exc))
4214  return Qnil;
4215  return exc;
4216 }
4217 
4218 /*
4219  * call-seq:
4220  * ec.replacement -> string
4221  *
4222  * Returns the replacement string.
4223  *
4224  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4225  * p ec.replacement #=> "?"
4226  *
4227  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4228  * p ec.replacement #=> "\uFFFD"
4229  */
4230 static VALUE
4231 econv_get_replacement(VALUE self)
4232 {
4233  rb_econv_t *ec = check_econv(self);
4234  int ret;
4235  rb_encoding *enc;
4236 
4237  ret = make_replacement(ec);
4238  if (ret == -1) {
4239  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4240  }
4241 
4242  enc = rb_enc_find(ec->replacement_enc);
4243  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4244 }
4245 
4246 /*
4247  * call-seq:
4248  * ec.replacement = string
4249  *
4250  * Sets the replacement string.
4251  *
4252  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4253  * ec.replacement = "<undef>"
4254  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4255  */
4256 static VALUE
4257 econv_set_replacement(VALUE self, VALUE arg)
4258 {
4259  rb_econv_t *ec = check_econv(self);
4260  VALUE string = arg;
4261  int ret;
4262  rb_encoding *enc;
4263 
4264  StringValue(string);
4265  enc = rb_enc_get(string);
4266 
4267  ret = rb_econv_set_replacement(ec,
4268  (const unsigned char *)RSTRING_PTR(string),
4269  RSTRING_LEN(string),
4270  rb_enc_name(enc));
4271 
4272  if (ret == -1) {
4273  /* xxx: rb_eInvalidByteSequenceError? */
4274  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4275  }
4276 
4277  return arg;
4278 }
4279 
4280 VALUE
4282 {
4283  return make_econv_exception(ec);
4284 }
4285 
4286 void
4288 {
4289  VALUE exc;
4290 
4291  exc = make_econv_exception(ec);
4292  if (NIL_P(exc))
4293  return;
4294  rb_exc_raise(exc);
4295 }
4296 
4297 /*
4298  * call-seq:
4299  * ecerr.source_encoding_name -> string
4300  *
4301  * Returns the source encoding name as a string.
4302  */
4303 static VALUE
4304 ecerr_source_encoding_name(VALUE self)
4305 {
4306  return rb_attr_get(self, id_source_encoding_name);
4307 }
4308 
4309 /*
4310  * call-seq:
4311  * ecerr.source_encoding -> encoding
4312  *
4313  * Returns the source encoding as an encoding object.
4314  *
4315  * Note that the result may not be equal to the source encoding of
4316  * the encoding converter if the conversion has multiple steps.
4317  *
4318  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4319  * begin
4320  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4321  * rescue Encoding::UndefinedConversionError
4322  * p $!.source_encoding #=> #<Encoding:UTF-8>
4323  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4324  * p $!.source_encoding_name #=> "UTF-8"
4325  * p $!.destination_encoding_name #=> "EUC-JP"
4326  * end
4327  *
4328  */
4329 static VALUE
4330 ecerr_source_encoding(VALUE self)
4331 {
4332  return rb_attr_get(self, id_source_encoding);
4333 }
4334 
4335 /*
4336  * call-seq:
4337  * ecerr.destination_encoding_name -> string
4338  *
4339  * Returns the destination encoding name as a string.
4340  */
4341 static VALUE
4342 ecerr_destination_encoding_name(VALUE self)
4343 {
4344  return rb_attr_get(self, id_destination_encoding_name);
4345 }
4346 
4347 /*
4348  * call-seq:
4348  * ecerr.destination_encoding -> encoding
4350  *
4351  * Returns the destination encoding as an encoding object.
4352  */
4353 static VALUE
4354 ecerr_destination_encoding(VALUE self)
4355 {
4356  return rb_attr_get(self, id_destination_encoding);
4357 }
4358 
4359 /*
4360  * call-seq:
4361  * ecerr.error_char -> string
4362  *
4363  * Returns the one-character string which caused Encoding::UndefinedConversionError.
4364  *
4365  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4366  * begin
4367  * ec.convert("\xa0")
4368  * rescue Encoding::UndefinedConversionError
4369  * puts $!.error_char.dump #=> "\xC2\xA0"
4370  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4371  * end
4372  *
4373  */
4374 static VALUE
4375 ecerr_error_char(VALUE self)
4376 {
4377  return rb_attr_get(self, id_error_char);
4378 }
4379 
4380 /*
4381  * call-seq:
4382  * ecerr.error_bytes -> string
4383  *
4384  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4385  *
4386  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4387  * begin
4388  * ec.convert("abc\xA1\xFFdef")
4389  * rescue Encoding::InvalidByteSequenceError
4390  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4391  * puts $!.error_bytes.dump #=> "\xA1"
4392  * puts $!.readagain_bytes.dump #=> "\xFF"
4393  * end
4394  */
4395 static VALUE
4396 ecerr_error_bytes(VALUE self)
4397 {
4398  return rb_attr_get(self, id_error_bytes);
4399 }
4400 
4401 /*
4402  * call-seq:
4403  * ecerr.readagain_bytes -> string
4404  *
4405  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4406  */
4407 static VALUE
4408 ecerr_readagain_bytes(VALUE self)
4409 {
4410  return rb_attr_get(self, id_readagain_bytes);
4411 }
4412 
4413 /*
4414  * call-seq:
4415  * ecerr.incomplete_input? -> true or false
4416  *
4417  * Returns true if the invalid byte sequence error is caused by
4418  * premature end of string.
4419  *
4420  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4421  *
4422  * begin
4423  * ec.convert("abc\xA1z")
4424  * rescue Encoding::InvalidByteSequenceError
4425  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4426  * p $!.incomplete_input? #=> false
4427  * end
4428  *
4429  * begin
4430  * ec.convert("abc\xA1")
4431  * ec.finish
4432  * rescue Encoding::InvalidByteSequenceError
4433  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4434  * p $!.incomplete_input? #=> true
4435  * end
4436  */
4437 static VALUE
4438 ecerr_incomplete_input(VALUE self)
4439 {
4440  return rb_attr_get(self, id_incomplete_input);
4441 }
4442 
4443 /*
4444  * Document-class: Encoding::UndefinedConversionError
4445  *
4446  * Raised by Encoding and String methods when a transcoding operation
4447  * fails.
4448  */
4449 
4450 /*
4451  * Document-class: Encoding::InvalidByteSequenceError
4452  *
4453  * Raised by Encoding and String methods when the string being
4454  * transcoded contains a byte invalid for either the source or
4455  * the target encoding.
4456  */
4457 
4458 /*
4459  * Document-class: Encoding::ConverterNotFoundError
4460  *
4461  * Raised by transcoding methods when a named encoding does not
4462  * correspond with a known converter.
4463  */
4464 
4465 void
4466 Init_transcode(void)
4467 {
4468  transcoder_table = st_init_strcasetable();
4469 
4470  id_destination_encoding = rb_intern_const("destination_encoding");
4471  id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4472  id_error_bytes = rb_intern_const("error_bytes");
4473  id_error_char = rb_intern_const("error_char");
4474  id_incomplete_input = rb_intern_const("incomplete_input");
4475  id_readagain_bytes = rb_intern_const("readagain_bytes");
4476  id_source_encoding = rb_intern_const("source_encoding");
4477  id_source_encoding_name = rb_intern_const("source_encoding_name");
4478 
4479  sym_invalid = ID2SYM(rb_intern_const("invalid"));
4480  sym_undef = ID2SYM(rb_intern_const("undef"));
4481  sym_replace = ID2SYM(rb_intern_const("replace"));
4482  sym_fallback = ID2SYM(rb_intern_const("fallback"));
4483  sym_xml = ID2SYM(rb_intern_const("xml"));
4484  sym_text = ID2SYM(rb_intern_const("text"));
4485  sym_attr = ID2SYM(rb_intern_const("attr"));
4486 
4487  sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4488  sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4489  sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4490  sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4491  sym_finished = ID2SYM(rb_intern_const("finished"));
4492  sym_after_output = ID2SYM(rb_intern_const("after_output"));
4493  sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4494  sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4495  sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4496  sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4497  sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4498 
4499 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4500  sym_newline = ID2SYM(rb_intern_const("newline"));
4501  sym_universal = ID2SYM(rb_intern_const("universal"));
4502  sym_crlf = ID2SYM(rb_intern_const("crlf"));
4503  sym_cr = ID2SYM(rb_intern_const("cr"));
4504  sym_lf = ID2SYM(rb_intern_const("lf"));
4505 #endif
4506 
4507  InitVM(transcode);
4508 }
4509 
4510 void
4511 InitVM_transcode(void)
4512 {
4513  rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4514  rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4515  rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4516 
4517  rb_define_method(rb_cString, "encode", str_encode, -1);
4518  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4519 
4520  rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4521  rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4522  rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4523  rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4524  rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4525  rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4526  rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4527  rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4528  rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4529  rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4530  rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4531  rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4532  rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4533  rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4534  rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4535  rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4536  rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4537  rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4538  rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4539 
4540  /* Document-const: INVALID_MASK
4541  *
4542  * Mask for invalid byte sequences
4543  */
4544  rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4545 
4546  /* Document-const: INVALID_REPLACE
4547  *
4548  * Replace invalid byte sequences
4549  */
4550  rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4551 
4552  /* Document-const: UNDEF_MASK
4553  *
4554  * Mask for a valid character in the source encoding but no related
4555  * character(s) in destination encoding.
4556  */
4557  rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4558 
4559  /* Document-const: UNDEF_REPLACE
4560  *
4561  * Replace byte sequences that are undefined in the destination encoding.
4562  */
4563  rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4564 
4565  /* Document-const: UNDEF_HEX_CHARREF
4566  *
4567  * Replace byte sequences that are undefined in the destination encoding
4568  * with an XML hexadecimal character reference. This is valid for XML
4569  * conversion.
4570  */
4571  rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4572 
4573  /* Document-const: PARTIAL_INPUT
4574  *
4575  * Indicates the source may be part of a larger string. See
4576  * primitive_convert for an example.
4577  */
4578  rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4579 
4580  /* Document-const: AFTER_OUTPUT
4581  *
4582  * Stop converting after some output is complete but before all of the
4583  * input was consumed. See primitive_convert for an example.
4584  */
4585  rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4586 
4587  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4588  *
4589  * Decorator for converting CRLF and CR to LF
4590  */
4591  rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4592 
4593  /* Document-const: CRLF_NEWLINE_DECORATOR
4594  *
4595  * Decorator for converting LF to CRLF
4596  */
4597  rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4598 
4599  /* Document-const: CR_NEWLINE_DECORATOR
4600  *
4601  * Decorator for converting LF to CR
4602  */
4603  rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4604 
4605  /* Document-const: XML_TEXT_DECORATOR
4606  *
4607  * Escape as XML CharData
4608  */
4609  rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4610 
4611  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4612  *
4613  * Escape as XML AttValue
4614  */
4615  rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4616 
4617  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4618  *
4619  * Escape as XML AttValue
4620  */
4621  rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4622 
4623  rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4624  rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4625  rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4626  rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4627  rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4628 
4629  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4630  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4631  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4632  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4633  rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4634  rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4635  rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4636 
4637  Init_newline();
4638 }
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition: coderange.h:33
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:869
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition: class.c:2406
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:1914
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition: transcode.h:535
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition: transcode.h:551
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition: string.h:1738
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition: transcode.h:529
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition: memory.h:397
#define ALLOC
Old name of RB_ALLOC.
Definition: memory.h:394
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:145
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition: transcode.h:533
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition: transcode.h:520
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition: transcode.h:530
#define xrealloc
Old name of ruby_xrealloc.
Definition: xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:143
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition: transcode.h:523
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition: transcode.h:532
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition: array.h:653
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition: transcode.h:531
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition: transcode.h:521
#define T_HASH
Old name of RUBY_T_HASH.
Definition: value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition: error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition: transcode.h:522
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition: transcode.h:550
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition: transcode.h:519
#define INT2NUM
Old name of RB_INT2NUM.
Definition: int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition: value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition: transcode.h:524
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition: transcode.h:526
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition: error.c:1066
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3025
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:675
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition: error.c:1049
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:802
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1099
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1097
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby's string instead of C's.
Definition: error.c:1150
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1100
VALUE rb_eEncodingError
EncodingError exception.
Definition: error.c:1105
void rb_warning(const char *fmt,...)
Issues a warning.
Definition: error.c:449
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:188
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
VALUE rb_cString
String class.
Definition: string.c:80
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:2998
Encoding related APIs.
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.
Definition: encoding.c:918
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:1038
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1182
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:776
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:940
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1206
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:668
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition: transcode.c:2563
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition: transcode.c:2065
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition: transcode.c:2608
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1449
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition: transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition: transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition: transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition: transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition: transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition: transcode.c:1745
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition: transcode.c:3226
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition: transcode.c:1894
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition: transcode.c:1885
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition: transcode.c:1590
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition: transcode.c:1906
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition: transcode.c:1954
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition: transcode.c:1971
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition: transcode.c:1937
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2929
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition: transcode.c:4281
struct rb_econv_t rb_econv_t
An opaque struct that represents a lowest level of encoding conversion.
Definition: transcode.h:73
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition: transcode.c:4287
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition: transcode.c:1789
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition: transcode.c:1900
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2614
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1705
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition: transcode.c:1822
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition: transcode.c:1756
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition: transcode.c:2227
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition: transcode.c:1072
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition: transcode.c:1506
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition: vm_eval.c:1153
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:989
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:750
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1308
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
Definition: array.c:1679
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
Definition: array.c:976
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
Definition: array.c:1148
#define rb_check_frozen
Just another name of rb_check_frozen.
Definition: error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:294
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1896
VALUE rb_hash_freeze(VALUE obj)
Just another name of rb_obj_freeze.
Definition: hash.c:87
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2082
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2903
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1529
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition: proc.c:1003
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition: proc.c:1600
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition: proc.c:2423
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition: proc.c:175
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1540
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1593
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:828
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1356
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1808
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2459
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3039
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:918
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:952
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3056
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2467
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:6567
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1506
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition: string.c:5071
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get().
Definition: variable.c:1293
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1575
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2765
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:924
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition: variable.c:3253
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition: sprintf.c:1201
VALUE rb_str_catf(VALUE dst, const char *fmt,...)
Identical to rb_sprintf(), except it renders the output to the specified object rather than creating a new one.
Definition: sprintf.c:1241
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition: memory.h:378
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
#define RARRAY_LEN
Just another name of rb_array_len.
Definition: rarray.h:68
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition: rarray.h:324
#define RARRAY_AREF(a, i)
Definition: rarray.h:588
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition: rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:527
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:497
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:483
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition: rstring.h:95
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition: rtypeddata.h:507
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition: variable.c:309
#define InitVM(ext)
This macro is for internal use.
Definition: ruby.h:229
#define RTEST
This is an old name of RB_TEST.
rb_data_type_struct
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:190
Definition: st.h:79
Definition: string.c:7522
Definition: transcode.c:174
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:375