Ruby 3.1.3p185 (2022-11-24 revision 1a6b16756e0ba6b95ab71a441357ed5484e33498)
encoding.c
1/**********************************************************************
2
3 encoding.c -
4
5 $Author$
6 created at: Thu May 24 17:23:27 JST 2007
7
8 Copyright (C) 2007 Yukihiro Matsumoto
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "encindex.h"
17#include "internal.h"
18#include "internal/enc.h"
19#include "internal/encoding.h"
20#include "internal/inits.h"
21#include "internal/load.h"
22#include "internal/object.h"
23#include "internal/string.h"
24#include "internal/vm.h"
25#include "regenc.h"
26#include "ruby/encoding.h"
27#include "ruby/util.h"
28#include "ruby_assert.h"
29#include "vm_sync.h"
30
/* ENC_DEBUG defaults to 0 unless the build defines it. */
#if !defined(ENC_DEBUG)
# define ENC_DEBUG 0
#endif

/* Assertion gated on ENC_DEBUG (delegates to RUBY_ASSERT_WHEN). */
#define ENC_ASSERT(expr) RUBY_ASSERT_WHEN(ENC_DEBUG, expr)

/* Evaluates to `str`, after asserting via ENC_ASSERT that it is a T_STRING. */
#define MUST_STRING(str) (ENC_ASSERT(RB_TYPE_P(str, T_STRING)), str)
36
37#undef rb_ascii8bit_encindex
38#undef rb_utf8_encindex
39#undef rb_usascii_encindex
40
42
43#if defined __GNUC__ && __GNUC__ >= 4
44#pragma GCC visibility push(default)
45int rb_enc_register(const char *name, rb_encoding *encoding);
46void rb_enc_set_base(const char *name, const char *orig);
47int rb_enc_set_dummy(int index);
48void rb_encdb_declare(const char *name);
49int rb_encdb_replicate(const char *name, const char *orig);
50int rb_encdb_dummy(const char *name);
51int rb_encdb_alias(const char *alias, const char *orig);
52void rb_encdb_set_unicode(int index);
53#pragma GCC visibility pop
54#endif
55
56static ID id_encoding;
58
59#define DEFAULT_ENCODING_LIST_CAPA 128
60static VALUE rb_default_encoding_list;
61static VALUE rb_additional_encoding_list;
62
64 const char *name;
65 rb_encoding *enc;
66 rb_encoding *base;
67};
68
69static struct enc_table {
70 struct rb_encoding_entry *list;
71 int count;
72 int size;
73 st_table *names;
74} global_enc_table;
75
76static rb_encoding *global_enc_ascii,
77 *global_enc_utf_8,
78 *global_enc_us_ascii;
79
/*
 * GLOBAL_ENC_TABLE_ENTER(var) declares `struct enc_table *var` pointing at
 * global_enc_table and then enters the VM lock; GLOBAL_ENC_TABLE_LEAVE()
 * leaves it.  Every use in this file pairs the two within one function.
 */
#define GLOBAL_ENC_TABLE_ENTER(enc_table) struct enc_table *enc_table = &global_enc_table; RB_VM_LOCK_ENTER()
#define GLOBAL_ENC_TABLE_LEAVE() RB_VM_LOCK_LEAVE()

/* Evaluate a single expression between ENTER and LEAVE. */
#define GLOBAL_ENC_TABLE_EVAL(enc_table, expr) do { \
    GLOBAL_ENC_TABLE_ENTER(enc_table); \
    { \
        expr; \
    } \
    GLOBAL_ENC_TABLE_LEAVE(); \
} while (0)
89
90
/*
 * Layout of rb_encoding::ruby_encoding_index: the low 24 bits hold the
 * table index, bit 24 marks a dummy encoding.
 */
#define ENC_DUMMY_FLAG (1<<24)
#define ENC_INDEX_MASK (~(~0U<<24))

/* Table index of `enc` with the flag bits stripped. */
#define ENC_TO_ENCINDEX(enc) (int)((enc)->ruby_encoding_index & ENC_INDEX_MASK)
/* Non-zero when `enc` carries the dummy flag. */
#define ENC_DUMMY_P(enc) ((enc)->ruby_encoding_index & ENC_DUMMY_FLAG)
/* Set the dummy flag on a writable encoding record. */
#define ENC_SET_DUMMY(enc) ((enc)->ruby_encoding_index |= ENC_DUMMY_FLAG)

#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
#define UNSPECIFIED_ENCODING INT_MAX

/* A usable encoding name is non-NULL and at most ENCODING_NAMELEN_MAX bytes. */
#define ENCODING_NAMELEN_MAX 63
#define valid_encoding_name_p(name) ((name) && strlen(name) <= ENCODING_NAMELEN_MAX)
103
104static const rb_data_type_t encoding_data_type = {
105 "encoding",
106 {0, 0, 0,},
107 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
108};
109
110#define is_data_encoding(obj) (RTYPEDDATA_P(obj) && RTYPEDDATA_TYPE(obj) == &encoding_data_type)
111#define is_obj_encoding(obj) (RB_TYPE_P((obj), T_DATA) && is_data_encoding(obj))
112
113int
114rb_data_is_encoding(VALUE obj)
115{
116 return is_data_encoding(obj);
117}
118
119static VALUE
120enc_new(rb_encoding *encoding)
121{
122 VALUE enc = TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, (void *)encoding);
123 rb_obj_freeze(enc);
125 return enc;
126}
127
128static void
129enc_list_update(int index, rb_raw_encoding *encoding)
130{
131 if (index < DEFAULT_ENCODING_LIST_CAPA) {
132 VALUE list = rb_default_encoding_list;
133 if (list && NIL_P(rb_ary_entry(list, index))) {
134 /* initialize encoding data */
135 rb_ary_store(list, index, enc_new(encoding));
136 }
137 }
138 else {
139 RB_VM_LOCK_ENTER();
140 {
141 VALUE list = rb_additional_encoding_list;
142 if (list && NIL_P(rb_ary_entry(list, index))) {
143 /* initialize encoding data */
144 rb_ary_store(list, index - DEFAULT_ENCODING_LIST_CAPA, enc_new(encoding));
145 }
146 }
147 RB_VM_LOCK_LEAVE();
148 }
149}
150
151static VALUE
152enc_list_lookup(int idx)
153{
154 VALUE list, enc;
155
156 if (idx < DEFAULT_ENCODING_LIST_CAPA) {
157 if (!(list = rb_default_encoding_list)) {
158 rb_bug("rb_enc_from_encoding_index(%d): no rb_default_encoding_list", idx);
159 }
160 enc = rb_ary_entry(list, idx);
161 }
162 else {
163 RB_VM_LOCK_ENTER();
164 {
165 if (!(list = rb_additional_encoding_list)) {
166 rb_bug("rb_enc_from_encoding_index(%d): no rb_additional_encoding_list", idx);
167 }
168 enc = rb_ary_entry(list, idx - DEFAULT_ENCODING_LIST_CAPA);
169 }
170 RB_VM_LOCK_LEAVE();
171 }
172
173 if (NIL_P(enc)) {
174 rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
175 }
176 else {
177 return enc;
178 }
179}
180
181static VALUE
182rb_enc_from_encoding_index(int idx)
183{
184 return enc_list_lookup(idx);
185}
186
187VALUE
189{
190 int idx;
191 if (!encoding) return Qnil;
192 idx = ENC_TO_ENCINDEX(encoding);
193 return rb_enc_from_encoding_index(idx);
194}
195
196int
198{
199 return enc ? ENC_TO_ENCINDEX(enc) : 0;
200}
201
202int
204{
205 return ENC_DUMMY_P(enc) != 0;
206}
207
208static int
209check_encoding(rb_encoding *enc)
210{
211 int index = rb_enc_to_index(enc);
212 if (rb_enc_from_index(index) != enc)
213 return -1;
214 if (rb_enc_autoload_p(enc)) {
215 index = rb_enc_autoload(enc);
216 }
217 return index;
218}
219
220static int
221enc_check_encoding(VALUE obj)
222{
223 if (!is_obj_encoding(obj)) {
224 return -1;
225 }
226 return check_encoding(RDATA(obj)->data);
227}
228
229NORETURN(static void not_encoding(VALUE enc));
230static void
231not_encoding(VALUE enc)
232{
233 rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Encoding)",
234 rb_obj_class(enc));
235}
236
237static rb_encoding *
238must_encoding(VALUE enc)
239{
240 int index = enc_check_encoding(enc);
241 if (index < 0) {
242 not_encoding(enc);
243 }
244 return DATA_PTR(enc);
245}
246
247static rb_encoding *
248must_encindex(int index)
249{
250 rb_encoding *enc = rb_enc_from_index(index);
251 if (!enc) {
252 rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
253 index);
254 }
255 if (ENC_TO_ENCINDEX(enc) != (int)(index & ENC_INDEX_MASK)) {
256 rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
257 index, rb_enc_name(enc), ENC_TO_ENCINDEX(enc));
258 }
259 if (rb_enc_autoload_p(enc) && rb_enc_autoload(enc) == -1) {
260 rb_loaderror("failed to load encoding (%s)",
261 rb_enc_name(enc));
262 }
263 return enc;
264}
265
266int
268{
269 int idx;
270 const char *name;
271
272 idx = enc_check_encoding(enc);
273 if (idx >= 0) {
274 return idx;
275 }
276 else if (NIL_P(enc = rb_check_string_type(enc))) {
277 return -1;
278 }
279 if (!rb_enc_asciicompat(rb_enc_get(enc))) {
280 return -1;
281 }
282 if (!(name = rb_str_to_cstr(enc))) {
283 return -1;
284 }
285 return rb_enc_find_index(name);
286}
287
288static const char *
289name_for_encoding(volatile VALUE *enc)
290{
291 VALUE name = StringValue(*enc);
292 const char *n;
293
294 if (!rb_enc_asciicompat(rb_enc_get(name))) {
295 rb_raise(rb_eArgError, "invalid encoding name (non ASCII)");
296 }
297 if (!(n = rb_str_to_cstr(name))) {
298 rb_raise(rb_eArgError, "invalid encoding name (NUL byte)");
299 }
300 return n;
301}
302
303/* Returns encoding index or UNSPECIFIED_ENCODING */
304static int
305str_find_encindex(VALUE enc)
306{
307 int idx = rb_enc_find_index(name_for_encoding(&enc));
308 RB_GC_GUARD(enc);
309 return idx;
310}
311
312static int
313str_to_encindex(VALUE enc)
314{
315 int idx = str_find_encindex(enc);
316 if (idx < 0) {
317 rb_raise(rb_eArgError, "unknown encoding name - %"PRIsVALUE, enc);
318 }
319 return idx;
320}
321
322static rb_encoding *
323str_to_encoding(VALUE enc)
324{
325 return rb_enc_from_index(str_to_encindex(enc));
326}
327
330{
331 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
332 return str_to_encoding(enc);
333}
334
337{
338 int idx;
339 if (enc_check_encoding(enc) >= 0) return RDATA(enc)->data;
340 idx = str_find_encindex(enc);
341 if (idx < 0) return NULL;
342 return rb_enc_from_index(idx);
343}
344
345static int
346enc_table_expand(struct enc_table *enc_table, int newsize)
347{
348 struct rb_encoding_entry *ent;
349 int count = newsize;
350
351 if (enc_table->size >= newsize) return newsize;
352 newsize = (newsize + 7) / 8 * 8;
353 ent = REALLOC_N(enc_table->list, struct rb_encoding_entry, newsize);
354 memset(ent + enc_table->size, 0, sizeof(*ent)*(newsize - enc_table->size));
355 enc_table->list = ent;
356 enc_table->size = newsize;
357 return count;
358}
359
360static int
361enc_register_at(struct enc_table *enc_table, int index, const char *name, rb_encoding *base_encoding)
362{
363 struct rb_encoding_entry *ent = &enc_table->list[index];
364 rb_raw_encoding *encoding;
365
366 if (!valid_encoding_name_p(name)) return -1;
367 if (!ent->name) {
368 ent->name = name = strdup(name);
369 }
370 else if (STRCASECMP(name, ent->name)) {
371 return -1;
372 }
373 encoding = (rb_raw_encoding *)ent->enc;
374 if (!encoding) {
375 encoding = xmalloc(sizeof(rb_encoding));
376 }
377 if (base_encoding) {
378 *encoding = *base_encoding;
379 }
380 else {
381 memset(encoding, 0, sizeof(*ent->enc));
382 }
383 encoding->name = name;
384 encoding->ruby_encoding_index = index;
385 ent->enc = encoding;
386 st_insert(enc_table->names, (st_data_t)name, (st_data_t)index);
387
388 enc_list_update(index, encoding);
389 return index;
390}
391
392static int
393enc_register(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
394{
395 int index = enc_table->count;
396
397 enc_table->count = enc_table_expand(enc_table, index + 1);
398 return enc_register_at(enc_table, index, name, encoding);
399}
400
401static void set_encoding_const(const char *, rb_encoding *);
402static int enc_registered(struct enc_table *enc_table, const char *name);
403
404static rb_encoding *
405enc_from_index(struct enc_table *enc_table, int index)
406{
407 if (UNLIKELY(index < 0 || enc_table->count <= (index &= ENC_INDEX_MASK))) {
408 return 0;
409 }
410 return enc_table->list[index].enc;
411}
412
415{
416 rb_encoding *enc;
417
418 switch (index) {
419 case ENCINDEX_ASCII: return global_enc_ascii;
420 case ENCINDEX_UTF_8: return global_enc_utf_8;
421 case ENCINDEX_US_ASCII: return global_enc_us_ascii;
422 default:
423 GLOBAL_ENC_TABLE_EVAL(enc_table,
424 enc = enc_from_index(enc_table, index));
425 return enc;
426 }
427}
428
429int
430rb_enc_register(const char *name, rb_encoding *encoding)
431{
432 int index;
433
434 GLOBAL_ENC_TABLE_ENTER(enc_table);
435 {
436 index = enc_registered(enc_table, name);
437
438 if (index >= 0) {
439 rb_encoding *oldenc = enc_from_index(enc_table, index);
440 if (STRCASECMP(name, rb_enc_name(oldenc))) {
441 index = enc_register(enc_table, name, encoding);
442 }
443 else if (rb_enc_autoload_p(oldenc) || !ENC_DUMMY_P(oldenc)) {
444 enc_register_at(enc_table, index, name, encoding);
445 }
446 else {
447 rb_raise(rb_eArgError, "encoding %s is already registered", name);
448 }
449 }
450 else {
451 index = enc_register(enc_table, name, encoding);
452 set_encoding_const(name, rb_enc_from_index(index));
453 }
454 }
455 GLOBAL_ENC_TABLE_LEAVE();
456 return index;
457}
458
459int
460enc_registered(struct enc_table *enc_table, const char *name)
461{
462 st_data_t idx = 0;
463
464 if (!name) return -1;
465 if (!enc_table->list) return -1;
466 if (st_lookup(enc_table->names, (st_data_t)name, &idx)) {
467 return (int)idx;
468 }
469 return -1;
470}
471
472void
473rb_encdb_declare(const char *name)
474{
475 GLOBAL_ENC_TABLE_ENTER(enc_table);
476 {
477 int idx = enc_registered(enc_table, name);
478 if (idx < 0) {
479 idx = enc_register(enc_table, name, 0);
480 }
481 set_encoding_const(name, rb_enc_from_index(idx));
482 }
483 GLOBAL_ENC_TABLE_LEAVE();
484}
485
486static void
487enc_check_duplication(struct enc_table *enc_table, const char *name)
488{
489 if (enc_registered(enc_table, name) >= 0) {
490 rb_raise(rb_eArgError, "encoding %s is already registered", name);
491 }
492}
493
494static rb_encoding*
495set_base_encoding(struct enc_table *enc_table, int index, rb_encoding *base)
496{
497 rb_encoding *enc = enc_table->list[index].enc;
498
499 ASSUME(enc);
500 enc_table->list[index].base = base;
501 if (ENC_DUMMY_P(base)) ENC_SET_DUMMY((rb_raw_encoding *)enc);
502 return enc;
503}
504
505/* for encdb.h
506 * Set base encoding for encodings which are not replicas
507 * but not in their own files.
508 */
509void
510rb_enc_set_base(const char *name, const char *orig)
511{
512 GLOBAL_ENC_TABLE_ENTER(enc_table);
513 {
514 int idx = enc_registered(enc_table, name);
515 int origidx = enc_registered(enc_table, orig);
516 set_base_encoding(enc_table, idx, rb_enc_from_index(origidx));
517 }
518 GLOBAL_ENC_TABLE_LEAVE();
519}
520
521/* for encdb.h
522 * Set encoding dummy.
523 */
524int
525rb_enc_set_dummy(int index)
526{
527 rb_encoding *enc;
528
529 GLOBAL_ENC_TABLE_EVAL(enc_table,
530 enc = enc_table->list[index].enc);
531
532 ENC_SET_DUMMY((rb_raw_encoding *)enc);
533 return index;
534}
535
536static int
537enc_replicate(struct enc_table *enc_table, const char *name, rb_encoding *encoding)
538{
539 int idx;
540
541 enc_check_duplication(enc_table, name);
542 idx = enc_register(enc_table, name, encoding);
543 if (idx < 0) rb_raise(rb_eArgError, "invalid encoding name: %s", name);
544 set_base_encoding(enc_table, idx, encoding);
545 set_encoding_const(name, rb_enc_from_index(idx));
546 return idx;
547}
548
549int
550rb_enc_replicate(const char *name, rb_encoding *encoding)
551{
552 int r;
553
554 GLOBAL_ENC_TABLE_EVAL(enc_table,
555 r = enc_replicate(enc_table, name, encoding));
556
557 return r;
558}
559
560/*
561 * call-seq:
562 * enc.replicate(name) -> encoding
563 *
564 * Returns a replicated encoding of _enc_ whose name is _name_.
565 * The new encoding should have the same byte structure of _enc_.
566 * If _name_ is used by another encoding, raise ArgumentError.
567 *
568 */
569static VALUE
570enc_replicate_m(VALUE encoding, VALUE name)
571{
572 int idx = rb_enc_replicate(name_for_encoding(&name), rb_to_encoding(encoding));
573 RB_GC_GUARD(name);
574 return rb_enc_from_encoding_index(idx);
575}
576
577static int
578enc_replicate_with_index(struct enc_table *enc_table, const char *name, rb_encoding *origenc, int idx)
579{
580 if (idx < 0) {
581 idx = enc_register(enc_table, name, origenc);
582 }
583 else {
584 idx = enc_register_at(enc_table, idx, name, origenc);
585 }
586 if (idx >= 0) {
587 set_base_encoding(enc_table, idx, origenc);
588 set_encoding_const(name, rb_enc_from_index(idx));
589 }
590 else {
591 rb_raise(rb_eArgError, "failed to replicate encoding");
592 }
593 return idx;
594}
595
596int
597rb_encdb_replicate(const char *name, const char *orig)
598{
599 int r;
600
601 GLOBAL_ENC_TABLE_ENTER(enc_table);
602 {
603 int origidx = enc_registered(enc_table, orig);
604 int idx = enc_registered(enc_table, name);
605
606 if (origidx < 0) {
607 origidx = enc_register(enc_table, orig, 0);
608 }
609 r = enc_replicate_with_index(enc_table, name, rb_enc_from_index(origidx), idx);
610 }
611 GLOBAL_ENC_TABLE_LEAVE();
612
613 return r;
614}
615
616int
618{
619 int index;
620
621 GLOBAL_ENC_TABLE_ENTER(enc_table);
622 {
623 index = enc_replicate(enc_table, name, rb_ascii8bit_encoding());
624 rb_encoding *enc = enc_table->list[index].enc;
625 ENC_SET_DUMMY((rb_raw_encoding *)enc);
626 }
627 GLOBAL_ENC_TABLE_LEAVE();
628
629 return index;
630}
631
632int
633rb_encdb_dummy(const char *name)
634{
635 int index;
636
637 GLOBAL_ENC_TABLE_ENTER(enc_table);
638 {
639 index = enc_replicate_with_index(enc_table, name,
641 enc_registered(enc_table, name));
642 rb_encoding *enc = enc_table->list[index].enc;
643 ENC_SET_DUMMY((rb_raw_encoding *)enc);
644 }
645 GLOBAL_ENC_TABLE_LEAVE();
646
647 return index;
648}
649
650/*
651 * call-seq:
652 * enc.dummy? -> true or false
653 *
654 * Returns true for dummy encodings.
655 * A dummy encoding is an encoding for which character handling is not properly
656 * implemented.
657 * It is used for stateful encodings.
658 *
659 * Encoding::ISO_2022_JP.dummy? #=> true
660 * Encoding::UTF_8.dummy? #=> false
661 *
662 */
663static VALUE
664enc_dummy_p(VALUE enc)
665{
666 return RBOOL(ENC_DUMMY_P(must_encoding(enc)));
667}
668
669/*
670 * call-seq:
671 * enc.ascii_compatible? -> true or false
672 *
673 * Returns whether ASCII-compatible or not.
674 *
675 * Encoding::UTF_8.ascii_compatible? #=> true
676 * Encoding::UTF_16BE.ascii_compatible? #=> false
677 *
678 */
679static VALUE
680enc_ascii_compatible_p(VALUE enc)
681{
682 return RBOOL(rb_enc_asciicompat(must_encoding(enc)));
683}
684
685/*
686 * Returns non-zero when the encoding is Unicode series other than UTF-7 else 0.
687 */
688int
690{
691 return ONIGENC_IS_UNICODE(enc);
692}
693
694static st_data_t
695enc_dup_name(st_data_t name)
696{
697 return (st_data_t)strdup((const char *)name);
698}
699
700/*
701 * Returns copied alias name when the key is added for st_table,
702 * else returns NULL.
703 */
704static int
705enc_alias_internal(struct enc_table *enc_table, const char *alias, int idx)
706{
707 return st_insert2(enc_table->names, (st_data_t)alias, (st_data_t)idx,
708 enc_dup_name);
709}
710
/*
 * Add `alias` as another name for the encoding at `idx`.  The constant for
 * the alias is defined only when enc_alias_internal() returns 0.
 * Returns `idx`, or -1 for an invalid alias name.
 */
static int
enc_alias(struct enc_table *enc_table, const char *alias, int idx)
{
    if (!valid_encoding_name_p(alias)) {
        return -1;
    }
    if (enc_alias_internal(enc_table, alias, idx) == 0) {
        set_encoding_const(alias, enc_from_index(enc_table, idx));
    }
    return idx;
}
719
720int
721rb_enc_alias(const char *alias, const char *orig)
722{
723 int idx, r;
724
725 GLOBAL_ENC_TABLE_ENTER(enc_table);
726 {
727 enc_check_duplication(enc_table, alias);
728 if ((idx = rb_enc_find_index(orig)) < 0) {
729 r = -1;
730 }
731 else {
732 r = enc_alias(enc_table, alias, idx);
733 }
734 }
735 GLOBAL_ENC_TABLE_LEAVE();
736
737 return r;
738}
739
740int
741rb_encdb_alias(const char *alias, const char *orig)
742{
743 int r;
744
745 GLOBAL_ENC_TABLE_ENTER(enc_table);
746 {
747 int idx = enc_registered(enc_table, orig);
748
749 if (idx < 0) {
750 idx = enc_register(enc_table, orig, 0);
751 }
752 r = enc_alias(enc_table, alias, idx);
753 }
754 GLOBAL_ENC_TABLE_LEAVE();
755
756 return r;
757}
758
759void
760rb_encdb_set_unicode(int index)
761{
763 ASSUME(enc);
764 enc->flags |= ONIGENC_FLAG_UNICODE;
765}
766
767static void
768rb_enc_init(struct enc_table *enc_table)
769{
770 enc_table_expand(enc_table, ENCODING_COUNT + 1);
771 if (!enc_table->names) {
772 enc_table->names = st_init_strcasetable();
773 }
774#define ENC_REGISTER(enc) enc_register_at(enc_table, ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
775 ENC_REGISTER(ASCII);
776 ENC_REGISTER(UTF_8);
777 ENC_REGISTER(US_ASCII);
778 global_enc_ascii = enc_table->list[ENCINDEX_ASCII].enc;
779 global_enc_utf_8 = enc_table->list[ENCINDEX_UTF_8].enc;
780 global_enc_us_ascii = enc_table->list[ENCINDEX_US_ASCII].enc;
781#undef ENC_REGISTER
782#define ENCDB_REGISTER(name, enc) enc_register_at(enc_table, ENCINDEX_##enc, name, NULL)
783 ENCDB_REGISTER("UTF-16BE", UTF_16BE);
784 ENCDB_REGISTER("UTF-16LE", UTF_16LE);
785 ENCDB_REGISTER("UTF-32BE", UTF_32BE);
786 ENCDB_REGISTER("UTF-32LE", UTF_32LE);
787 ENCDB_REGISTER("UTF-16", UTF_16);
788 ENCDB_REGISTER("UTF-32", UTF_32);
789 ENCDB_REGISTER("UTF8-MAC", UTF8_MAC);
790
791 ENCDB_REGISTER("EUC-JP", EUC_JP);
792 ENCDB_REGISTER("Windows-31J", Windows_31J);
793#undef ENCDB_REGISTER
794 enc_table->count = ENCINDEX_BUILTIN_MAX;
795}
796
798rb_enc_get_from_index(int index)
799{
800 return must_encindex(index);
801}
802
803int rb_require_internal_silent(VALUE fname);
804
805static int
806load_encoding(const char *name)
807{
808 VALUE enclib = rb_sprintf("enc/%s.so", name);
809 VALUE debug = ruby_debug;
810 VALUE errinfo;
811 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib) - 3;
812 int loaded;
813 int idx;
814
815 while (s < e) {
816 if (!ISALNUM(*s)) *s = '_';
817 else if (ISUPPER(*s)) *s = (char)TOLOWER(*s);
818 ++s;
819 }
820 enclib = rb_fstring(enclib);
822 errinfo = rb_errinfo();
823 loaded = rb_require_internal_silent(enclib);
824 ruby_debug = debug;
825 rb_set_errinfo(errinfo);
826
827 GLOBAL_ENC_TABLE_ENTER(enc_table);
828 {
829 if (loaded < 0 || 1 < loaded) {
830 idx = -1;
831 }
832 else if ((idx = enc_registered(enc_table, name)) < 0) {
833 idx = -1;
834 }
835 else if (rb_enc_autoload_p(enc_table->list[idx].enc)) {
836 idx = -1;
837 }
838 }
839 GLOBAL_ENC_TABLE_LEAVE();
840
841 return idx;
842}
843
844static int
845enc_autoload_body(struct enc_table *enc_table, rb_encoding *enc)
846{
847 rb_encoding *base = enc_table->list[ENC_TO_ENCINDEX(enc)].base;
848
849 if (base) {
850 int i = 0;
851 do {
852 if (i >= enc_table->count) return -1;
853 } while (enc_table->list[i].enc != base && (++i, 1));
854 if (rb_enc_autoload_p(base)) {
855 if (rb_enc_autoload(base) < 0) return -1;
856 }
857 i = enc->ruby_encoding_index;
858 enc_register_at(enc_table, i & ENC_INDEX_MASK, rb_enc_name(enc), base);
859 ((rb_raw_encoding *)enc)->ruby_encoding_index = i;
860 i &= ENC_INDEX_MASK;
861 return i;
862 }
863 else {
864 return -2;
865 }
866}
867
868int
869rb_enc_autoload(rb_encoding *enc)
870{
871 int i;
872 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_autoload_body(enc_table, enc));
873 if (i == -2) {
874 i = load_encoding(rb_enc_name(enc));
875 }
876 return i;
877}
878
879/* Return encoding index or UNSPECIFIED_ENCODING from encoding name */
880int
881rb_enc_find_index(const char *name)
882{
883 int i;
884 rb_encoding *enc;
885
886 GLOBAL_ENC_TABLE_EVAL(enc_table, i = enc_registered(enc_table, name));
887
888 if (i < 0) {
889 i = load_encoding(name);
890 }
891 else if (!(enc = rb_enc_from_index(i))) {
892 if (i != UNSPECIFIED_ENCODING) {
893 rb_raise(rb_eArgError, "encoding %s is not registered", name);
894 }
895 }
896 else if (rb_enc_autoload_p(enc)) {
897 if (rb_enc_autoload(enc) < 0) {
898 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
899 name);
900 return 0;
901 }
902 }
903 return i;
904}
905
906int
907rb_enc_find_index2(const char *name, long len)
908{
909 char buf[ENCODING_NAMELEN_MAX+1];
910
911 if (len > ENCODING_NAMELEN_MAX) return -1;
912 memcpy(buf, name, len);
913 buf[len] = '\0';
914 return rb_enc_find_index(buf);
915}
916
918rb_enc_find(const char *name)
919{
920 int idx = rb_enc_find_index(name);
921 if (idx < 0) idx = 0;
922 return rb_enc_from_index(idx);
923}
924
925static inline int
926enc_capable(VALUE obj)
927{
928 if (SPECIAL_CONST_P(obj)) return SYMBOL_P(obj);
929 switch (BUILTIN_TYPE(obj)) {
930 case T_STRING:
931 case T_REGEXP:
932 case T_FILE:
933 case T_SYMBOL:
934 return TRUE;
935 case T_DATA:
936 if (is_data_encoding(obj)) return TRUE;
937 default:
938 return FALSE;
939 }
940}
941
942int
944{
945 return enc_capable(obj);
946}
947
948ID
949rb_id_encoding(void)
950{
951 CONST_ID(id_encoding, "encoding");
952 return id_encoding;
953}
954
955static int
956enc_get_index_str(VALUE str)
957{
958 int i = ENCODING_GET_INLINED(str);
959 if (i == ENCODING_INLINE_MAX) {
960 VALUE iv;
961
962#if 0
963 iv = rb_ivar_get(str, rb_id_encoding());
964 i = NUM2INT(iv);
965#else
966 /*
967 * Tentatively, assume ASCII-8BIT, if encoding index instance
968 * variable is not found. This can happen when freeing after
969 * all instance variables are removed in `obj_free`.
970 */
971 iv = rb_attr_get(str, rb_id_encoding());
972 i = NIL_P(iv) ? ENCINDEX_ASCII : NUM2INT(iv);
973#endif
974 }
975 return i;
976}
977
978int
980{
981 int i = -1;
982 VALUE tmp;
983
984 if (SPECIAL_CONST_P(obj)) {
985 if (!SYMBOL_P(obj)) return -1;
986 obj = rb_sym2str(obj);
987 }
988 switch (BUILTIN_TYPE(obj)) {
989 case T_STRING:
990 case T_SYMBOL:
991 case T_REGEXP:
992 i = enc_get_index_str(obj);
993 break;
994 case T_FILE:
995 tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
996 if (NIL_P(tmp)) {
997 tmp = rb_funcallv(obj, rb_intern("external_encoding"), 0, 0);
998 }
999 if (is_obj_encoding(tmp)) {
1000 i = enc_check_encoding(tmp);
1001 }
1002 break;
1003 case T_DATA:
1004 if (is_data_encoding(obj)) {
1005 i = enc_check_encoding(obj);
1006 }
1007 break;
1008 default:
1009 break;
1010 }
1011 return i;
1012}
1013
1014static void
1015enc_set_index(VALUE obj, int idx)
1016{
1017 if (!enc_capable(obj)) {
1018 rb_raise(rb_eArgError, "cannot set encoding on non-encoding capable object");
1019 }
1020
1021 if (idx < ENCODING_INLINE_MAX) {
1022 ENCODING_SET_INLINED(obj, idx);
1023 return;
1024 }
1026 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
1027}
1028
1029void
1030rb_enc_set_index(VALUE obj, int idx)
1031{
1032 rb_check_frozen(obj);
1033 must_encindex(idx);
1034 enc_set_index(obj, idx);
1035}
1036
1037VALUE
1038rb_enc_associate_index(VALUE obj, int idx)
1039{
1040 rb_encoding *enc;
1041 int oldidx, oldtermlen, termlen;
1042
1043/* enc_check_capable(obj);*/
1044 rb_check_frozen(obj);
1045 oldidx = rb_enc_get_index(obj);
1046 if (oldidx == idx)
1047 return obj;
1048 if (SPECIAL_CONST_P(obj)) {
1049 rb_raise(rb_eArgError, "cannot set encoding");
1050 }
1051 enc = must_encindex(idx);
1052 if (!ENC_CODERANGE_ASCIIONLY(obj) ||
1053 !rb_enc_asciicompat(enc)) {
1055 }
1056 termlen = rb_enc_mbminlen(enc);
1057 oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
1058 if (oldtermlen != termlen && RB_TYPE_P(obj, T_STRING)) {
1059 rb_str_change_terminator_length(obj, oldtermlen, termlen);
1060 }
1061 enc_set_index(obj, idx);
1062 return obj;
1063}
1064
1065VALUE
1067{
1068 return rb_enc_associate_index(obj, rb_enc_to_index(enc));
1069}
1070
1072rb_enc_get(VALUE obj)
1073{
1075}
1076
1077static rb_encoding*
1078rb_encoding_check(rb_encoding* enc, VALUE str1, VALUE str2)
1079{
1080 if (!enc)
1081 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
1082 rb_enc_name(rb_enc_get(str1)),
1083 rb_enc_name(rb_enc_get(str2)));
1084 return enc;
1085}
1086
1087static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
1088
1090rb_enc_check_str(VALUE str1, VALUE str2)
1091{
1092 rb_encoding *enc = enc_compatible_str(MUST_STRING(str1), MUST_STRING(str2));
1093 return rb_encoding_check(enc, str1, str2);
1094}
1095
1097rb_enc_check(VALUE str1, VALUE str2)
1098{
1099 rb_encoding *enc = rb_enc_compatible(str1, str2);
1100 return rb_encoding_check(enc, str1, str2);
1101}
1102
1103static rb_encoding*
1104enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
1105{
1106 int isstr1, isstr2;
1107 rb_encoding *enc1 = rb_enc_from_index(idx1);
1108 rb_encoding *enc2 = rb_enc_from_index(idx2);
1109
1110 isstr2 = RB_TYPE_P(str2, T_STRING);
1111 if (isstr2 && RSTRING_LEN(str2) == 0)
1112 return enc1;
1113 isstr1 = RB_TYPE_P(str1, T_STRING);
1114 if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
1115 return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
1116 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
1117 return 0;
1118 }
1119
1120 /* objects whose encoding is the same of contents */
1121 if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
1122 return enc1;
1123 if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
1124 return enc2;
1125
1126 if (!isstr1) {
1127 VALUE tmp = str1;
1128 int idx0 = idx1;
1129 str1 = str2;
1130 str2 = tmp;
1131 idx1 = idx2;
1132 idx2 = idx0;
1133 idx0 = isstr1;
1134 isstr1 = isstr2;
1135 isstr2 = idx0;
1136 }
1137 if (isstr1) {
1138 int cr1, cr2;
1139
1140 cr1 = rb_enc_str_coderange(str1);
1141 if (isstr2) {
1142 cr2 = rb_enc_str_coderange(str2);
1143 if (cr1 != cr2) {
1144 /* may need to handle ENC_CODERANGE_BROKEN */
1145 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
1146 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
1147 }
1148 if (cr2 == ENC_CODERANGE_7BIT) {
1149 return enc1;
1150 }
1151 }
1152 if (cr1 == ENC_CODERANGE_7BIT)
1153 return enc2;
1154 }
1155 return 0;
1156}
1157
1158static rb_encoding*
1159enc_compatible_str(VALUE str1, VALUE str2)
1160{
1161 int idx1 = enc_get_index_str(str1);
1162 int idx2 = enc_get_index_str(str2);
1163
1164 if (idx1 < 0 || idx2 < 0)
1165 return 0;
1166
1167 if (idx1 == idx2) {
1168 return rb_enc_from_index(idx1);
1169 }
1170 else {
1171 return enc_compatible_latter(str1, str2, idx1, idx2);
1172 }
1173}
1174
1176rb_enc_compatible(VALUE str1, VALUE str2)
1177{
1178 int idx1 = rb_enc_get_index(str1);
1179 int idx2 = rb_enc_get_index(str2);
1180
1181 if (idx1 < 0 || idx2 < 0)
1182 return 0;
1183
1184 if (idx1 == idx2) {
1185 return rb_enc_from_index(idx1);
1186 }
1187
1188 return enc_compatible_latter(str1, str2, idx1, idx2);
1189}
1190
1191void
1192rb_enc_copy(VALUE obj1, VALUE obj2)
1193{
1195}
1196
1197
1198/*
1199 * call-seq:
1200 * obj.encoding -> encoding
1201 *
1202 * Returns the Encoding object that represents the encoding of obj.
1203 */
1204
1205VALUE
1207{
1208 int idx = rb_enc_get_index(obj);
1209 if (idx < 0) {
1210 rb_raise(rb_eTypeError, "unknown encoding");
1211 }
1212 return rb_enc_from_encoding_index(idx & ENC_INDEX_MASK);
1213}
1214
1215int
1216rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
1217{
1218 return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1219}
1220
1221int
1222rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
1223{
1224 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1225 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
1226 return MBCLEN_CHARFOUND_LEN(n);
1227 else {
1228 int min = rb_enc_mbminlen(enc);
1229 return min <= e-p ? min : (int)(e-p);
1230 }
1231}
1232
1233int
1234rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
1235{
1236 int n;
1237 if (e <= p)
1238 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
1239 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
1240 if (e-p < n)
1241 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
1242 return n;
1243}
1244
1245int
1246rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
1247{
1248 unsigned int c;
1249 int l;
1250 if (e <= p)
1251 return -1;
1252 if (rb_enc_asciicompat(enc)) {
1253 c = (unsigned char)*p;
1254 if (!ISASCII(c))
1255 return -1;
1256 if (len) *len = 1;
1257 return c;
1258 }
1259 l = rb_enc_precise_mbclen(p, e, enc);
1260 if (!MBCLEN_CHARFOUND_P(l))
1261 return -1;
1262 c = rb_enc_mbc_to_codepoint(p, e, enc);
1263 if (!rb_enc_isascii(c, enc))
1264 return -1;
1265 if (len) *len = l;
1266 return c;
1267}
1268
1269unsigned int
1270rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
1271{
1272 int r;
1273 if (e <= p)
1274 rb_raise(rb_eArgError, "empty string");
1275 r = rb_enc_precise_mbclen(p, e, enc);
1276 if (!MBCLEN_CHARFOUND_P(r)) {
1277 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
1278 }
1279 if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
1280 return rb_enc_mbc_to_codepoint(p, e, enc);
1281}
1282
1283int
1285{
1286 int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
1287 if (n == 0) {
1288 rb_raise(rb_eArgError, "invalid codepoint 0x%x in %s", c, rb_enc_name(enc));
1289 }
1290 return n;
1291}
1292
1293int
1295{
1296 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
1297}
1298
1299int
1301{
1302 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
1303}
1304
1305/*
1306 * call-seq:
1307 * enc.inspect -> string
1308 *
1309 * Returns a string which represents the encoding for programmers.
1310 *
1311 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
1312 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
1313 */
1314static VALUE
1315enc_inspect(VALUE self)
1316{
1317 rb_encoding *enc;
1318
1319 if (!is_data_encoding(self)) {
1320 not_encoding(self);
1321 }
1322 if (!(enc = DATA_PTR(self)) || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
1323 rb_raise(rb_eTypeError, "broken Encoding");
1324 }
1326 "#<%"PRIsVALUE":%s%s%s>", rb_obj_class(self),
1327 rb_enc_name(enc),
1328 (ENC_DUMMY_P(enc) ? " (dummy)" : ""),
1329 rb_enc_autoload_p(enc) ? " (autoload)" : "");
1330}
1331
1332/*
1333 * call-seq:
1334 * enc.name -> string
1335 * enc.to_s -> string
1336 *
1337 * Returns the name of the encoding.
1338 *
1339 * Encoding::UTF_8.name #=> "UTF-8"
1340 */
1341static VALUE
1342enc_name(VALUE self)
1343{
1344 return rb_fstring_cstr(rb_enc_name((rb_encoding*)DATA_PTR(self)));
1345}
1346
1347static int
1348enc_names_i(st_data_t name, st_data_t idx, st_data_t args)
1349{
1350 VALUE *arg = (VALUE *)args;
1351
1352 if ((int)idx == (int)arg[0]) {
1353 VALUE str = rb_fstring_cstr((char *)name);
1354 rb_ary_push(arg[1], str);
1355 }
1356 return ST_CONTINUE;
1357}
1358
1359/*
1360 * call-seq:
1361 * enc.names -> array
1362 *
 * Returns the list of names and aliases of the encoding.
1364 *
1365 * Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK"]
1366 */
1367static VALUE
1368enc_names(VALUE self)
1369{
1370 VALUE args[2];
1371
1372 args[0] = (VALUE)rb_to_encoding_index(self);
1373 args[1] = rb_ary_new2(0);
1374
1375 GLOBAL_ENC_TABLE_EVAL(enc_table,
1376 st_foreach(enc_table->names, enc_names_i, (st_data_t)args));
1377
1378 return args[1];
1379}
1380
1381/*
1382 * call-seq:
1383 * Encoding.list -> [enc1, enc2, ...]
1384 *
1385 * Returns the list of loaded encodings.
1386 *
1387 * Encoding.list
1388 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1389 * #<Encoding:ISO-2022-JP (dummy)>]
1390 *
1391 * Encoding.find("US-ASCII")
1392 * #=> #<Encoding:US-ASCII>
1393 *
1394 * Encoding.list
1395 * #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
1396 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
1397 *
1398 */
1399static VALUE
1400enc_list(VALUE klass)
1401{
1402 VALUE ary = rb_ary_new2(0);
1403
1404 RB_VM_LOCK_ENTER();
1405 {
1406 rb_ary_replace(ary, rb_default_encoding_list);
1407 rb_ary_concat(ary, rb_additional_encoding_list);
1408 }
1409 RB_VM_LOCK_LEAVE();
1410
1411 return ary;
1412}
1413
1414/*
1415 * call-seq:
1416 * Encoding.find(string) -> enc
1417 *
1418 * Search the encoding with specified <i>name</i>.
1419 * <i>name</i> should be a string.
1420 *
1421 * Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1422 *
 * Names which this method accepts are encoding names and aliases,
 * including the following special aliases:
1425 *
1426 * "external":: default external encoding
1427 * "internal":: default internal encoding
1428 * "locale":: locale encoding
1429 * "filesystem":: filesystem encoding
1430 *
 * An ArgumentError is raised when there is no encoding with <i>name</i>.
1432 * Only <code>Encoding.find("internal")</code> however returns nil
1433 * when no encoding named "internal", in other words, when Ruby has no
1434 * default internal encoding.
1435 */
1436static VALUE
1437enc_find(VALUE klass, VALUE enc)
1438{
1439 int idx;
1440 if (is_obj_encoding(enc))
1441 return enc;
1442 idx = str_to_encindex(enc);
1443 if (idx == UNSPECIFIED_ENCODING) return Qnil;
1444 return rb_enc_from_encoding_index(idx);
1445}
1446
1447/*
1448 * call-seq:
1449 * Encoding.compatible?(obj1, obj2) -> enc or nil
1450 *
1451 * Checks the compatibility of two objects.
1452 *
1453 * If the objects are both strings they are compatible when they are
1454 * concatenatable. The encoding of the concatenated string will be returned
1455 * if they are compatible, nil if they are not.
1456 *
1457 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
1458 * #=> #<Encoding:ISO-8859-1>
1459 *
1460 * Encoding.compatible?(
1461 * "\xa1".force_encoding("iso-8859-1"),
1462 * "\xa1\xa1".force_encoding("euc-jp"))
1463 * #=> nil
1464 *
1465 * If the objects are non-strings their encodings are compatible when they
1466 * have an encoding and:
1467 * * Either encoding is US-ASCII compatible
1468 * * One of the encodings is a 7-bit encoding
1469 *
1470 */
1471static VALUE
1472enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
1473{
1474 rb_encoding *enc;
1475
1476 if (!enc_capable(str1)) return Qnil;
1477 if (!enc_capable(str2)) return Qnil;
1478 enc = rb_enc_compatible(str1, str2);
1479 if (!enc) return Qnil;
1480 return rb_enc_from_encoding(enc);
1481}
1482
1483NORETURN(static VALUE enc_s_alloc(VALUE klass));
1484/* :nodoc: */
1485static VALUE
1486enc_s_alloc(VALUE klass)
1487{
1488 rb_undefined_alloc(klass);
1490}
1491
1492/* :nodoc: */
1493static VALUE
1494enc_dump(int argc, VALUE *argv, VALUE self)
1495{
1496 rb_check_arity(argc, 0, 1);
1497 return enc_name(self);
1498}
1499
1500/* :nodoc: */
1501static VALUE
1502enc_load(VALUE klass, VALUE str)
1503{
1504 return str;
1505}
1506
1507/* :nodoc: */
1508static VALUE
1509enc_m_loader(VALUE klass, VALUE str)
1510{
1511 return enc_find(klass, str);
1512}
1513
1516{
1517 return global_enc_ascii;
1518}
1519
1520int
1522{
1523 return ENCINDEX_ASCII;
1524}
1525
1528{
1529 return global_enc_utf_8;
1530}
1531
1532int
1534{
1535 return ENCINDEX_UTF_8;
1536}
1537
1540{
1541 return global_enc_us_ascii;
1542}
1543
1544int
1546{
1547 return ENCINDEX_US_ASCII;
1548}
1549
1550int rb_locale_charmap_index(void);
1551
1552int
1554{
1555 int idx = rb_locale_charmap_index();
1556
1557 if (idx < 0) idx = ENCINDEX_UTF_8;
1558
1559 GLOBAL_ENC_TABLE_ENTER(enc_table);
1560 if (enc_registered(enc_table, "locale") < 0) {
1561# if defined _WIN32
1562 void Init_w32_codepage(void);
1563 Init_w32_codepage();
1564# endif
1565 enc_alias_internal(enc_table, "locale", idx);
1566 }
1567 GLOBAL_ENC_TABLE_LEAVE();
1568
1569 return idx;
1570}
1571
1574{
1576}
1577
1578int
1580{
1581 int idx;
1582
1583 GLOBAL_ENC_TABLE_EVAL(enc_table,
1584 idx = enc_registered(enc_table, "filesystem"));
1585
1586 if (idx < 0)
1587 idx = ENCINDEX_ASCII;
1588 return idx;
1589}
1590
1593{
1595}
1596
1598 int index; /* -2 => not yet set, -1 => nil */
1599 rb_encoding *enc;
1600};
1601
1602static struct default_encoding default_external = {0};
1603
1604static int
1605enc_set_default_encoding(struct default_encoding *def, VALUE encoding, const char *name)
1606{
1607 int overridden = FALSE;
1608
1609 if (def->index != -2)
1610 /* Already set */
1611 overridden = TRUE;
1612
1613 GLOBAL_ENC_TABLE_ENTER(enc_table);
1614 {
1615 if (NIL_P(encoding)) {
1616 def->index = -1;
1617 def->enc = 0;
1618 st_insert(enc_table->names, (st_data_t)strdup(name),
1619 (st_data_t)UNSPECIFIED_ENCODING);
1620 }
1621 else {
1622 def->index = rb_enc_to_index(rb_to_encoding(encoding));
1623 def->enc = 0;
1624 enc_alias_internal(enc_table, name, def->index);
1625 }
1626
1627 if (def == &default_external) {
1628 enc_alias_internal(enc_table, "filesystem", Init_enc_set_filesystem_encoding());
1629 }
1630 }
1631 GLOBAL_ENC_TABLE_LEAVE();
1632
1633 return overridden;
1634}
1635
1638{
1639 if (default_external.enc) return default_external.enc;
1640
1641 if (default_external.index >= 0) {
1642 default_external.enc = rb_enc_from_index(default_external.index);
1643 return default_external.enc;
1644 }
1645 else {
1646 return rb_locale_encoding();
1647 }
1648}
1649
1650VALUE
1652{
1654}
1655
1656/*
1657 * call-seq:
1658 * Encoding.default_external -> enc
1659 *
1660 * Returns default external encoding.
1661 *
1662 * The default external encoding is used by default for strings created from
1663 * the following locations:
1664 *
1665 * * CSV
1666 * * File data read from disk
1667 * * SDBM
1668 * * StringIO
1669 * * Zlib::GzipReader
1670 * * Zlib::GzipWriter
1671 * * String#inspect
1672 * * Regexp#inspect
1673 *
1674 * While strings created from these locations will have this encoding, the
1675 * encoding may not be valid. Be sure to check String#valid_encoding?.
1676 *
1677 * File data written to disk will be transcoded to the default external
1678 * encoding when written, if default_internal is not nil.
1679 *
1680 * The default external encoding is initialized by the -E option.
1681 * If -E isn't set, it is initialized to UTF-8 on Windows and the locale on
1682 * other operating systems.
1683 */
1684static VALUE
1685get_default_external(VALUE klass)
1686{
1687 return rb_enc_default_external();
1688}
1689
1690void
1692{
1693 if (NIL_P(encoding)) {
1694 rb_raise(rb_eArgError, "default external can not be nil");
1695 }
1696 enc_set_default_encoding(&default_external, encoding,
1697 "external");
1698}
1699
1700/*
1701 * call-seq:
1702 * Encoding.default_external = enc
1703 *
1704 * Sets default external encoding. You should not set
1705 * Encoding::default_external in ruby code as strings created before changing
1706 * the value may have a different encoding from strings created after the value
 * was changed. Instead, you should use <tt>ruby -E</tt> to invoke ruby with
1708 * the correct default_external.
1709 *
1710 * See Encoding::default_external for information on how the default external
1711 * encoding is used.
1712 */
1713static VALUE
1714set_default_external(VALUE klass, VALUE encoding)
1715{
1716 rb_warning("setting Encoding.default_external");
1718 return encoding;
1719}
1720
1721static struct default_encoding default_internal = {-2};
1722
1725{
1726 if (!default_internal.enc && default_internal.index >= 0) {
1727 default_internal.enc = rb_enc_from_index(default_internal.index);
1728 }
1729 return default_internal.enc; /* can be NULL */
1730}
1731
1732VALUE
1734{
1735 /* Note: These functions cope with default_internal not being set */
1737}
1738
1739/*
1740 * call-seq:
1741 * Encoding.default_internal -> enc
1742 *
1743 * Returns default internal encoding. Strings will be transcoded to the
1744 * default internal encoding in the following places if the default internal
1745 * encoding is not nil:
1746 *
1747 * * CSV
1748 * * Etc.sysconfdir and Etc.systmpdir
1749 * * File data read from disk
1750 * * File names from Dir
1751 * * Integer#chr
1752 * * String#inspect and Regexp#inspect
1753 * * Strings returned from Readline
1754 * * Strings returned from SDBM
1755 * * Time#zone
1756 * * Values from ENV
1757 * * Values in ARGV including $PROGRAM_NAME
1758 *
1759 * Additionally String#encode and String#encode! use the default internal
1760 * encoding if no encoding is given.
1761 *
1762 * The script encoding (__ENCODING__), not default_internal, is used as the
1763 * encoding of created strings.
1764 *
1765 * Encoding::default_internal is initialized with -E option or nil otherwise.
1766 */
1767static VALUE
1768get_default_internal(VALUE klass)
1769{
1770 return rb_enc_default_internal();
1771}
1772
1773void
1775{
1776 enc_set_default_encoding(&default_internal, encoding,
1777 "internal");
1778}
1779
1780/*
1781 * call-seq:
1782 * Encoding.default_internal = enc or nil
1783 *
1784 * Sets default internal encoding or removes default internal encoding when
1785 * passed nil. You should not set Encoding::default_internal in ruby code as
1786 * strings created before changing the value may have a different encoding
1787 * from strings created after the change. Instead you should use
1788 * <tt>ruby -E</tt> to invoke ruby with the correct default_internal.
1789 *
1790 * See Encoding::default_internal for information on how the default internal
1791 * encoding is used.
1792 */
1793static VALUE
1794set_default_internal(VALUE klass, VALUE encoding)
1795{
1796 rb_warning("setting Encoding.default_internal");
1798 return encoding;
1799}
1800
1801static void
1802set_encoding_const(const char *name, rb_encoding *enc)
1803{
1804 VALUE encoding = rb_enc_from_encoding(enc);
1805 char *s = (char *)name;
1806 int haslower = 0, hasupper = 0, valid = 0;
1807
1808 if (ISDIGIT(*s)) return;
1809 if (ISUPPER(*s)) {
1810 hasupper = 1;
1811 while (*++s && (ISALNUM(*s) || *s == '_')) {
1812 if (ISLOWER(*s)) haslower = 1;
1813 }
1814 }
1815 if (!*s) {
1816 if (s - name > ENCODING_NAMELEN_MAX) return;
1817 valid = 1;
1818 rb_define_const(rb_cEncoding, name, encoding);
1819 }
1820 if (!valid || haslower) {
1821 size_t len = s - name;
1822 if (len > ENCODING_NAMELEN_MAX) return;
1823 if (!haslower || !hasupper) {
1824 do {
1825 if (ISLOWER(*s)) haslower = 1;
1826 if (ISUPPER(*s)) hasupper = 1;
1827 } while (*++s && (!haslower || !hasupper));
1828 len = s - name;
1829 }
1830 len += strlen(s);
1831 if (len++ > ENCODING_NAMELEN_MAX) return;
1832 MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1833 name = s;
1834 if (!valid) {
1835 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1836 for (; *s; ++s) {
1837 if (!ISALNUM(*s)) *s = '_';
1838 }
1839 if (hasupper) {
1840 rb_define_const(rb_cEncoding, name, encoding);
1841 }
1842 }
1843 if (haslower) {
1844 for (s = (char *)name; *s; ++s) {
1845 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1846 }
1847 rb_define_const(rb_cEncoding, name, encoding);
1848 }
1849 }
1850}
1851
1852static int
1853rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1854{
1855 VALUE ary = (VALUE)arg;
1856 VALUE str = rb_fstring_cstr((char *)name);
1857 rb_ary_push(ary, str);
1858 return ST_CONTINUE;
1859}
1860
1861/*
1862 * call-seq:
1863 * Encoding.name_list -> ["enc1", "enc2", ...]
1864 *
1865 * Returns the list of available encoding names.
1866 *
1867 * Encoding.name_list
1868 * #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1869 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1870 * "Windows-31J",
1871 * "BINARY", "CP932", "eucJP"]
1872 *
1873 */
1874
1875static VALUE
1876rb_enc_name_list(VALUE klass)
1877{
1878 VALUE ary;
1879
1880 GLOBAL_ENC_TABLE_ENTER(enc_table);
1881 {
1882 ary = rb_ary_new2(enc_table->names->num_entries);
1883 st_foreach(enc_table->names, rb_enc_name_list_i, (st_data_t)ary);
1884 }
1885 GLOBAL_ENC_TABLE_LEAVE();
1886
1887 return ary;
1888}
1889
1890static int
1891rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1892{
1893 VALUE *p = (VALUE *)arg;
1894 VALUE aliases = p[0], ary = p[1];
1895 int idx = (int)orig;
1896 VALUE key, str = rb_ary_entry(ary, idx);
1897
1898 if (NIL_P(str)) {
1899 rb_encoding *enc = rb_enc_from_index(idx);
1900
1901 if (!enc) return ST_CONTINUE;
1902 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1903 return ST_CONTINUE;
1904 }
1905 str = rb_fstring_cstr(rb_enc_name(enc));
1906 rb_ary_store(ary, idx, str);
1907 }
1908 key = rb_fstring_cstr((char *)name);
1909 rb_hash_aset(aliases, key, str);
1910 return ST_CONTINUE;
1911}
1912
1913/*
1914 * call-seq:
1915 * Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1916 *
1917 * Returns the hash of available encoding alias and original encoding name.
1918 *
1919 * Encoding.aliases
1920 * #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1968"=>"US-ASCII",
1921 * "SJIS"=>"Windows-31J", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1922 *
1923 */
1924
1925static VALUE
1926rb_enc_aliases(VALUE klass)
1927{
1928 VALUE aliases[2];
1929 aliases[0] = rb_hash_new();
1930 aliases[1] = rb_ary_new();
1931
1932 GLOBAL_ENC_TABLE_EVAL(enc_table,
1933 st_foreach(enc_table->names, rb_enc_aliases_enc_i, (st_data_t)aliases));
1934
1935 return aliases[0];
1936}
1937
1938/*
1939 * An Encoding instance represents a character encoding usable in Ruby. It is
1940 * defined as a constant under the Encoding namespace. It has a name and
1941 * optionally, aliases:
1942 *
1943 * Encoding::ISO_8859_1.name
1944 * #=> "ISO-8859-1"
1945 *
1946 * Encoding::ISO_8859_1.names
1947 * #=> ["ISO-8859-1", "ISO8859-1"]
1948 *
1949 * Ruby methods dealing with encodings return or accept Encoding instances as
1950 * arguments (when a method accepts an Encoding instance as an argument, it
1951 * can be passed an Encoding name or alias instead).
1952 *
1953 * "some string".encoding
1954 * #=> #<Encoding:UTF-8>
1955 *
1956 * string = "some string".encode(Encoding::ISO_8859_1)
1957 * #=> "some string"
1958 * string.encoding
1959 * #=> #<Encoding:ISO-8859-1>
1960 *
1961 * "some string".encode "ISO-8859-1"
1962 * #=> "some string"
1963 *
1964 * Encoding::ASCII_8BIT is a special encoding that is usually used for
 * a byte string, not a character string. But as the name suggests, its
1966 * characters in the range of ASCII are considered as ASCII
1967 * characters. This is useful when you use ASCII-8BIT characters with
1968 * other ASCII compatible characters.
1969 *
1970 * == Changing an encoding
1971 *
1972 * The associated Encoding of a String can be changed in two different ways.
1973 *
1974 * First, it is possible to set the Encoding of a string to a new Encoding
1975 * without changing the internal byte representation of the string, with
1976 * String#force_encoding. This is how you can tell Ruby the correct encoding
1977 * of a string.
1978 *
1979 * string
1980 * #=> "R\xC3\xA9sum\xC3\xA9"
1981 * string.encoding
1982 * #=> #<Encoding:ISO-8859-1>
1983 * string.force_encoding(Encoding::UTF_8)
1984 * #=> "R\u00E9sum\u00E9"
1985 *
1986 * Second, it is possible to transcode a string, i.e. translate its internal
1987 * byte representation to another encoding. Its associated encoding is also
1988 * set to the other encoding. See String#encode for the various forms of
1989 * transcoding, and the Encoding::Converter class for additional control over
1990 * the transcoding process.
1991 *
1992 * string
1993 * #=> "R\u00E9sum\u00E9"
1994 * string.encoding
1995 * #=> #<Encoding:UTF-8>
1996 * string = string.encode!(Encoding::ISO_8859_1)
1997 * #=> "R\xE9sum\xE9"
1998 * string.encoding
 *   #=> #<Encoding:ISO-8859-1>
2000 *
2001 * == Script encoding
2002 *
2003 * All Ruby script code has an associated Encoding which any String literal
2004 * created in the source code will be associated to.
2005 *
2006 * The default script encoding is Encoding::UTF_8 after v2.0, but it
2007 * can be changed by a magic comment on the first line of the source
2008 * code file (or second line, if there is a shebang line on the
2009 * first). The comment must contain the word <code>coding</code> or
2010 * <code>encoding</code>, followed by a colon, space and the Encoding
2011 * name or alias:
2012 *
2013 * # encoding: UTF-8
2014 *
2015 * "some string".encoding
2016 * #=> #<Encoding:UTF-8>
2017 *
2018 * The <code>__ENCODING__</code> keyword returns the script encoding of the file
2019 * which the keyword is written:
2020 *
2021 * # encoding: ISO-8859-1
2022 *
2023 * __ENCODING__
2024 * #=> #<Encoding:ISO-8859-1>
2025 *
2026 * <code>ruby -K</code> will change the default locale encoding, but this is
 * not recommended. Ruby source files should declare their script encoding by a
2028 * magic comment even when they only depend on US-ASCII strings or regular
2029 * expressions.
2030 *
2031 * == Locale encoding
2032 *
2033 * The default encoding of the environment. Usually derived from locale.
2034 *
2035 * see Encoding.locale_charmap, Encoding.find('locale')
2036 *
2037 * == Filesystem encoding
2038 *
2039 * The default encoding of strings from the filesystem of the environment.
2040 * This is used for strings of file names or paths.
2041 *
2042 * see Encoding.find('filesystem')
2043 *
2044 * == External encoding
2045 *
2046 * Each IO object has an external encoding which indicates the encoding that
2047 * Ruby will use to read its data. By default Ruby sets the external encoding
2048 * of an IO object to the default external encoding. The default external
2049 * encoding is set by locale encoding or the interpreter <code>-E</code> option.
2050 * Encoding.default_external returns the current value of the external
2051 * encoding.
2052 *
2053 * ENV["LANG"]
2054 * #=> "UTF-8"
2055 * Encoding.default_external
2056 * #=> #<Encoding:UTF-8>
2057 *
2058 * $ ruby -E ISO-8859-1 -e "p Encoding.default_external"
2059 * #<Encoding:ISO-8859-1>
2060 *
2061 * $ LANG=C ruby -e 'p Encoding.default_external'
2062 * #<Encoding:US-ASCII>
2063 *
2064 * The default external encoding may also be set through
2065 * Encoding.default_external=, but you should not do this as strings created
2066 * before and after the change will have inconsistent encodings. Instead use
2067 * <code>ruby -E</code> to invoke ruby with the correct external encoding.
2068 *
2069 * When you know that the actual encoding of the data of an IO object is not
2070 * the default external encoding, you can reset its external encoding with
2071 * IO#set_encoding or set it at IO object creation (see IO.new options).
2072 *
2073 * == Internal encoding
2074 *
2075 * To process the data of an IO object which has an encoding different
2076 * from its external encoding, you can set its internal encoding. Ruby will use
2077 * this internal encoding to transcode the data when it is read from the IO
2078 * object.
2079 *
2080 * Conversely, when data is written to the IO object it is transcoded from the
2081 * internal encoding to the external encoding of the IO object.
2082 *
2083 * The internal encoding of an IO object can be set with
2084 * IO#set_encoding or at IO object creation (see IO.new options).
2085 *
2086 * The internal encoding is optional and when not set, the Ruby default
2087 * internal encoding is used. If not explicitly set this default internal
2088 * encoding is +nil+ meaning that by default, no transcoding occurs.
2089 *
2090 * The default internal encoding can be set with the interpreter option
2091 * <code>-E</code>. Encoding.default_internal returns the current internal
2092 * encoding.
2093 *
2094 * $ ruby -e 'p Encoding.default_internal'
2095 * nil
2096 *
2097 * $ ruby -E ISO-8859-1:UTF-8 -e "p [Encoding.default_external, \
2098 * Encoding.default_internal]"
2099 * [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>]
2100 *
2101 * The default internal encoding may also be set through
2102 * Encoding.default_internal=, but you should not do this as strings created
2103 * before and after the change will have inconsistent encodings. Instead use
2104 * <code>ruby -E</code> to invoke ruby with the correct internal encoding.
2105 *
2106 * == IO encoding example
2107 *
2108 * In the following example a UTF-8 encoded string "R\u00E9sum\u00E9" is transcoded for
2109 * output to ISO-8859-1 encoding, then read back in and transcoded to UTF-8:
2110 *
2111 * string = "R\u00E9sum\u00E9"
2112 *
2113 * open("transcoded.txt", "w:ISO-8859-1") do |io|
2114 * io.write(string)
2115 * end
2116 *
2117 * puts "raw text:"
2118 * p File.binread("transcoded.txt")
2119 * puts
2120 *
2121 * open("transcoded.txt", "r:ISO-8859-1:UTF-8") do |io|
2122 * puts "transcoded text:"
2123 * p io.read
2124 * end
2125 *
2126 * While writing the file, the internal encoding is not specified as it is
2127 * only necessary for reading. While reading the file both the internal and
2128 * external encoding must be specified to obtain the correct result.
2129 *
2130 * $ ruby t.rb
2131 * raw text:
2132 * "R\xE9sum\xE9"
2133 *
2134 * transcoded text:
2135 * "R\u00E9sum\u00E9"
2136 *
2137 */
2138
2139void
2140Init_Encoding(void)
2141{
2142 VALUE list;
2143 int i;
2144
2145 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
2146 rb_define_alloc_func(rb_cEncoding, enc_s_alloc);
2148 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
2149 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
2150 rb_define_method(rb_cEncoding, "name", enc_name, 0);
2151 rb_define_method(rb_cEncoding, "names", enc_names, 0);
2152 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
2153 rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
2154 rb_define_method(rb_cEncoding, "replicate", enc_replicate_m, 1);
2155 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
2156 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
2157 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
2158 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
2159 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
2160
2161 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
2162 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
2163
2164 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
2165 rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
2166 rb_define_singleton_method(rb_cEncoding, "default_internal", get_default_internal, 0);
2167 rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
2168 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0); /* in localeinit.c */
2169
2170 struct enc_table *enc_table = &global_enc_table;
2171
2172 if (DEFAULT_ENCODING_LIST_CAPA < enc_table->count) rb_bug("DEFAULT_ENCODING_LIST_CAPA is too small");
2173
2174 list = rb_additional_encoding_list = rb_ary_new();
2175 RBASIC_CLEAR_CLASS(list);
2177
2178 list = rb_default_encoding_list = rb_ary_new2(DEFAULT_ENCODING_LIST_CAPA);
2179 RBASIC_CLEAR_CLASS(list);
2181
2182 for (i = 0; i < enc_table->count; ++i) {
2183 rb_ary_push(list, enc_new(enc_table->list[i].enc));
2184 }
2185
2186 rb_marshal_define_compat(rb_cEncoding, Qnil, 0, enc_m_loader);
2187}
2188
2189void
2190Init_encodings(void)
2191{
2192 rb_enc_init(&global_enc_table);
2193}
2194
2195/* locale insensitive ctype functions */
2196
2197void
2198rb_enc_foreach_name(int (*func)(st_data_t name, st_data_t idx, st_data_t arg), st_data_t arg)
2199{
2200 GLOBAL_ENC_TABLE_EVAL(enc_table, st_foreach(enc_table->names, func, arg));
2201}
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
int rb_enc_tolower(int c, rb_encoding *enc)
Identical to rb_tolower(), except it additionally takes an encoding.
Definition: encoding.c:1300
int rb_enc_toupper(int c, rb_encoding *enc)
Identical to rb_toupper(), except it additionally takes an encoding.
Definition: encoding.c:1294
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1182
@ RUBY_FL_SHAREABLE
This flag has something to do with Ractor.
Definition: fl_type.h:298
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:837
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:1938
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:1914
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:105
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define T_FILE
Old name of RUBY_T_FILE.
Definition: value_type.h:62
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition: memory.h:397
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ISUPPER
Old name of rb_isupper.
Definition: ctype.h:89
#define SPECIAL_CONST_P
Old name of RB_SPECIAL_CONST_P.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:31
#define T_DATA
Old name of RUBY_T_DATA.
Definition: value_type.h:60
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ISLOWER
Old name of rb_islower.
Definition: ctype.h:90
#define ASSUME
Old name of RBIMPL_ASSUME.
Definition: assume.h:29
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:533
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:66
#define STRCASECMP
Old name of st_locale_insensitive_strcasecmp.
Definition: ctype.h:102
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define INT2NUM
Old name of RB_INT2NUM.
Definition: int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:532
#define T_SYMBOL
Old name of RUBY_T_SYMBOL.
Definition: value_type.h:80
#define ENC_CODERANGE_ASCIIONLY(obj)
Old name of RB_ENC_CODERANGE_ASCIIONLY.
Definition: coderange.h:185
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:651
#define ISALNUM
Old name of rb_isalnum.
Definition: ctype.h:91
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ruby_debug
This variable controls whether the interpreter is in debug mode.
Definition: error.h:470
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3021
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:802
void rb_set_errinfo(VALUE err)
Sets the current exception ($!) to the given value.
Definition: eval.c:1760
void rb_warn(const char *fmt,...)
Identical to rb_warning(), except it reports always regardless of runtime -W flag.
Definition: error.c:418
VALUE rb_errinfo(void)
This is the same as $! in Ruby.
Definition: eval.c:1754
void rb_warning(const char *fmt,...)
Issues a warning.
Definition: error.c:449
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
Encoding relates APIs.
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:203
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1234
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:979
int rb_to_encoding_index(VALUE obj)
Obtains a encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:267
int rb_filesystem_encindex(void)
Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding...
Definition: encoding.c:1579
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1066
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1527
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT a.k.a.
Definition: encoding.c:1515
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes requested to represent the passed code point using the passed encoding.
Definition: encoding.c:1284
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:329
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1592
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:433
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1724
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1192
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itsel...
Definition: encoding.c:1533
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1216
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1072
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:414
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1521
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of the character pointed to by the passed pointer.
Definition: encoding.c:1270
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:689
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:197
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:1030
VALUE rb_locale_charmap(VALUE klass)
Returns a platform-dependent "charmap" of the current locale.
Definition: localeinit.c:91
void rb_enc_set_default_internal(VALUE encoding)
Destructively assigns the passed encoding as the default internal encoding.
Definition: encoding.c:1774
VALUE rb_enc_default_external(void)
Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding.
Definition: encoding.c:1651
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.
Definition: encoding.c:918
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:188
rb_encoding * rb_find_encoding(VALUE obj)
Identical to rb_to_encoding_index(), except the return type.
Definition: encoding.c:336
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:782
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:617
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1637
int rb_locale_encindex(void)
Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1553
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1097
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1222
int rb_enc_capable(VALUE obj)
Queries if the passed object can have its encoding.
Definition: encoding.c:943
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding.
Definition: encoding.c:1733
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.
Definition: encoding.c:1038
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1176
int rb_enc_replicate(const char *name, rb_encoding *src)
Creates a new encoding, using the passed one as a template.
Definition: encoding.c:550
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1573
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:607
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1539
void rb_enc_set_default_external(VALUE encoding)
Destructively assigns the passed encoding as the default external encoding.
Definition: encoding.c:1691
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:881
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:448
int rb_enc_alias(const char *alias, const char *orig)
Registers an "alias" name.
Definition: encoding.c:721
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of the character pointed to by the passed pointer.
Definition: encoding.c:1246
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1545
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:776
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:790
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1206
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1061
void rb_gc_register_mark_object(VALUE object)
Inform the garbage collector that object is a live Ruby object that should not be moved.
Definition: gc.c:8686
VALUE rb_ary_concat(VALUE lhs, VALUE rhs)
Destructively appends the contents of the latter to the end of the former.
Definition: array.c:4790
VALUE rb_ary_replace(VALUE copy, VALUE orig)
Replaces the contents of the former object with the contents of the latter.
Definition: array.c:4415
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:750
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
Definition: array.c:1308
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
Definition: array.c:1679
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
Definition: array.c:1148
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition: error.h:278
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:294
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2903
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1529
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method, if any.
Definition: string.c:2659
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get().
Definition: variable.c:1293
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1575
VALUE rb_ivar_get(VALUE obj, ID name)
Identical to rb_iv_get(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1285
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:782
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:924
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition: variable.c:3253
#define strdup(s)
Just another name of ruby_strdup.
Definition: util.h:176
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition: sprintf.c:1201
void rb_marshal_define_compat(VALUE newclass, VALUE oldclass, VALUE(*dumper)(VALUE), VALUE(*loader)(VALUE, VALUE))
Marshal format compatibility layer.
Definition: marshal.c:148
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:366
#define ALLOCA_N(type, n)
Definition: memory.h:286
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:161
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:71
#define RDATA(obj)
Convenient casting macro.
Definition: rdata.h:63
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:72
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:527
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:483
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:497
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:441
This is the struct that holds necessary info for a struct.
Definition: encoding.c:63
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:375