/*
 * This file handles character conversions.
 *
 * mICQ Copyright (C) © 2001,2002,2003 Rüdiger Kuhlmann
 *
 * mICQ is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 dated June, 1991.
 *
 * mICQ is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
 * License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this package; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 *
 *
 * $Id$
 */

#include "micq.h"
#include <stdlib.h>
#if HAVE_STRING_H
#include <string.h>
#endif
#if HAVE_ERRNO_H
#include <errno.h>
#endif
#if HAVE_CTYPE_H
#include <ctype.h>
#endif
#include "conv.h"
#include "preferences.h"
#include "util_str.h"

/*
 * One slot of the encoding table.  With iconv the two conversion
 * descriptors are cached next to the encoding name.
 */
#ifdef ENABLE_ICONV
#include <iconv.h>
typedef struct { const char *enc; iconv_t to; iconv_t from; } enc_t;
#else
typedef struct { const char *enc; } enc_t;
#endif

/* Number of allocated slots in conv_encs (not the number in use). */
static int conv_nr = 0;
/* Encoding table; the used part is terminated by a slot with enc == NULL. */
static enc_t *conv_encs = NULL;

/*
 * Give an ID for the given encoding name.
 *
 * The name is first normalised through a few well-known aliases, then
 * looked up case-insensitively in the table; unknown names are appended
 * (upper-cased).  The returned ID has ENC_AUTO or'ed in when no usable
 * converter exists for the encoding.  Returns 0 (the "none" slot) if
 * memory cannot be allocated.
 */
UBYTE ConvEnc (const char *enc)
{
    UBYTE nr;

    if (!conv_encs || !conv_nr)
    {
        conv_encs = calloc (15, sizeof *conv_encs);
        if (!conv_encs)
        {
            conv_nr = 0;
            return 0;
        }
        conv_nr = 15;
        conv_encs[0].enc = strdup ("none");
        conv_encs[1].enc = strdup ("UTF-8");
        conv_encs[2].enc = strdup (ICONV_LATIN1_NAME);
        conv_encs[3].enc = strdup (ICONV_LATIN9_NAME);
        conv_encs[4].enc = strdup ("KOI8-U");
        conv_encs[5].enc = strdup ("CP1251");         /* NOT cp-1251, NOT windows* */
        conv_encs[6].enc = strdup (ICONV_UCS2BE_NAME);
        conv_encs[7].enc = strdup ("CP1257");
        conv_encs[8].enc = strdup ("EUC-JP");
        conv_encs[9].enc = strdup ("SHIFT-JIS");
    }

    /* Map common aliases onto the canonical table names. */
    if (!strcasecmp (enc, "WINDOWS-1251") || !strcmp (enc, "CP-1251"))
        enc = "CP1251";
    if (!strcasecmp (enc, "WINDOWS-1257") || !strcmp (enc, "CP-1257"))
        enc = "CP1257";
    if (!strcasecmp (enc, "ISO-8859-1") || !strcasecmp (enc, "ISO8859-1")
        || !strcasecmp (enc, "LATIN1"))
        enc = ICONV_LATIN1_NAME;
    if (!strcasecmp (enc, "ISO-8859-15") || !strcasecmp (enc, "ISO8859-15")
        || !strcasecmp (enc, "LATIN9"))
        enc = ICONV_LATIN9_NAME;
    if (!strcasecmp (enc, "UCS-2BE") || !strcasecmp (enc, "UNICODEBIG"))
        enc = ICONV_UCS2BE_NAME;
#ifndef ENABLE_ICONV
    if (!strncasecmp (enc, "KOI8", 4))
        enc = "KOI8-U";
#endif

    for (nr = 0; conv_encs[nr].enc; nr++)
        if (!strcasecmp (conv_encs[nr].enc, enc))
        {
#ifdef ENABLE_ICONV
            if (conv_encs[nr].to && conv_encs[nr].from
                && conv_encs[nr].to   != (iconv_t)(-1)
                && conv_encs[nr].from != (iconv_t)(-1))
#else
            if (nr <= ENC_MAX_BUILTIN)
#endif
                return nr;
            break;
        }

    /* Keep one spare slot for the NULL terminator written below. */
    if (nr == conv_nr - 1)
    {
        enc_t *grown = realloc (conv_encs, sizeof (enc_t) * (conv_nr + 10));
        if (!grown)
            return 0;
        /* Zero the new tail: other functions index any slot < conv_nr. */
        memset (grown + conv_nr, 0, sizeof (enc_t) * 10);
        conv_nr += 10;
        conv_encs = grown;
    }

    if (!conv_encs[nr].enc)
    {
        char *name = strdup (enc), *p;

        if (!name)
            return 0;
        for (p = name; *p; p++)
            *p = toupper ((unsigned char)*p);
        conv_encs[nr].enc = name;
        conv_encs[nr + 1].enc = NULL;
    }

#ifdef ENABLE_ICONV
    conv_encs[nr].to   = iconv_open ("UTF-8", enc);
    conv_encs[nr].from = iconv_open (enc, "UTF-8");
    if (conv_encs[nr].to == (iconv_t)(-1) || conv_encs[nr].from == (iconv_t)(-1))
        return ENC_AUTO | nr;
    return nr;
#else
    return ENC_AUTO | nr;
#endif
}

/*
 * Give the encoding name for a given ID (the ENC_AUTO flag is ignored).
 * The table is initialised on first use.
 */
const char *ConvEncName (UBYTE enc)
{
    if (!conv_encs)
        ConvEnc ("none");
    if (!conv_encs)
        return "none";          /* table could not be allocated */
    return conv_encs[enc & ~ENC_AUTO].enc;
}

/*
 * Return a copy of inn in which every Conv0xFE character is replaced
 * by '*'.  The result lives in a static buffer that is overwritten by
 * the next call; NULL or empty input yields "".
 */
const char *ConvCrush0xFE (const char *inn)
{
    static char *t = NULL;
    static UDWORD size = 0;
    char *p;

    if (!inn || !*inn)
        return "";

    /* Make sure the static buffer exists, then restart it as empty. */
    t = s_catf (t, &size, "%*s", 100, "");
    *t = '\0';

    t = s_catf (t, &size, "%s", inn);
    for (p = t; *p; p++)
        if (*p == Conv0xFE)
            *p = '*';
    return t;
}

#ifdef ENABLE_UTF8

/*
 * Convert a single unicode code point to UTF-8.
 *
 * Returns a pointer to a static, NUL-terminated buffer (overwritten by
 * the next call).  Values up to 31 bits are encoded with 1 to 6 bytes;
 * anything larger yields "?".
 */
const char *ConvUTF8 (UDWORD x)
{
    static char b[7];

    if      (!(x & 0xffffff80))
    {
        b[0] = x;
        b[1] = '\0';
    }
    else if (!(x & 0xfffff800))
    {
        b[0] = 0xc0 |  (x               >>  6);
        b[1] = 0x80 |  (x &       0x3f);
        b[2] = '\0';
    }
    else if (!(x & 0xffff0000))
    {
        b[0] = 0xe0 | ( x               >> 12);
        b[1] = 0x80 | ((x &      0xfc0) >>  6);
        b[2] = 0x80 |  (x &       0x3f);
        b[3] = '\0';
    }
    else if (!(x & 0xffe00000))     /* was "!(x) & mask": never true */
    {
        b[0] = 0xf0 | ( x               >> 18);
        b[1] = 0x80 | ((x &    0x3f000) >> 12);
        b[2] = 0x80 | ((x &      0xfc0) >>  6);
        b[3] = 0x80 |  (x &       0x3f);
        b[4] = '\0';
    }
    else if (!(x & 0xfc000000))
    {
        b[0] = 0xf8 | ( x               >> 24);
        b[1] = 0x80 | ((x &   0xfc0000) >> 18);
        b[2] = 0x80 | ((x &    0x3f000) >> 12);
        b[3] = 0x80 | ((x &      0xfc0) >>  6);
        b[4] = 0x80 |  (x &       0x3f);
        b[5] = '\0';
    }
    else if (!(x & 0x80000000))
    {
        b[0] = 0xfc | ( x               >> 30);
        b[1] = 0x80 | ((x & 0x3f000000) >> 24);
        b[2] = 0x80 | ((x &   0xfc0000) >> 18);
        b[3] = 0x80 | ((x &    0x3f000) >> 12);
        b[4] = 0x80 | ((x &      0xfc0) >>  6);
        b[5] = 0x80 |  (x &       0x3f);
        b[6] = '\0';
    }
    else
        return "?";
    return b;
}

/*
 * Check whether a NUL-terminated string is structurally valid UTF-8:
 * every lead byte must be followed by exactly as many continuation
 * bytes as its high bits announce.  Returns 1 if so, 0 otherwise.
 */
BOOL ConvIsUTF8 (const char *in)
{
    char c;

    for ( ; *in; in++)
    {
        if (~*in & 0x80)        /* plain 7bit character */
            continue;
        if (~*in & 0x40)        /* continuation byte without lead byte */
            return 0;
        /* Shift the lead byte left once per continuation byte consumed. */
        for (c = *in; (in[1] & 0x80) && (~in[1] & 0x40) && (c & 0x40); in++)
            c *= 2;
        if (c & 0x40)           /* sequence ended too early */
            return 0;
    }
    return 1;
}

/*
 * Store the length of str in *resultlen (if requested) and return str.
 * Used so that early-return paths of ConvFromUTF8 report a length too.
 */
static const char *conv_result (const char *str, size_t *resultlen)
{
    if (resultlen)
        *resultlen = strlen (str);
    return str;
}

#ifdef ENABLE_ICONV

/*
 * Convert inn from encoding enc to UTF-8 using iconv.
 *
 * totalin is the input length in bytes, or (size_t)-1 for a
 * NUL-terminated string.  If keep0xfe is set, 0xfe bytes are treated as
 * field separators: they are copied through unconverted and the text
 * between them is converted piecewise.  Unconvertible input bytes
 * become '?'.  The result lives in a static buffer.
 */
const char *ConvToUTF8 (const char *inn, UBYTE enc, size_t totalin, UBYTE keep0xfe)
{
    static char *t = NULL;
    static UDWORD size = 0;
    size_t inleft, outleft, total;
    char *out, *tmp;
    const char *sep;
    ICONV_CONST char *in;

    if (!inn)
        return "";

    t = s_catf (t, &size, "%*s", 100, "");
    *t = '\0';

    enc &= ~ENC_AUTO;
    if (!conv_nr)
        ConvEnc ("UTF-8");
    /* NOTE(review): the format string is empty in this copy of the file,
     * so enc is unused; the original message text may have been lost. */
    if (enc >= conv_nr || !enc || !conv_encs[enc].enc)
        return s_sprintf ("", enc);

    if (!conv_encs[enc].to || conv_encs[enc].to == (iconv_t)(-1))
    {
        conv_encs[enc].to = iconv_open ("UTF-8", conv_encs[enc].enc);
        if (conv_encs[enc].to == (iconv_t)(-1))
        {
            /* NOTE(review): empty format string, see above. */
            if (enc != ENC_UTF8)
                return s_sprintf ("", enc, conv_encs[enc].enc);
            return keep0xfe ? s_sprintf ("%s", inn) : ConvCrush0xFE (inn);
        }
    }

    /* Reset the descriptor to its initial shift state. */
    iconv (conv_encs[enc].to, NULL, NULL, NULL, NULL);

    in = (ICONV_CONST char *)inn;
    out = t;
    total = (totalin == (size_t)(-1)) ? strlen (inn) : totalin;
    sep = keep0xfe ? memchr (in, 0xfe, total) : NULL;
    inleft = sep ? (size_t)(sep - in) : total;
    outleft = size - 1;         /* always keep room for the final NUL */

    for (;;)
    {
        size_t res  = iconv (conv_encs[enc].to, &in, &inleft, &out, &outleft);
        int    rc   = (res == (size_t)(-1)) ? errno : 0;
        size_t rest = total - (size_t)(in - (ICONV_CONST char *)inn);

        /* Done once a segment converted cleanly and no separator follows;
         * *in is only inspected while input remains. */
        if (res != (size_t)(-1) && !(rest && *in == (char)0xfe))
            break;

        if (outleft < 10 || rc == E2BIG)
        {
            size_t done = out - t;

            tmp = realloc (t, size + 50);
            if (!tmp)
                break;
            t = tmp;
            size += 50;
            outleft += 50;
            out = t + done;
        }
        else if (keep0xfe && rest && *in == (char)0xfe)
        {
            /* Copy the separator and set up the next segment. */
            *out++ = (char)0xfe;
            outleft--;
            in++;
            rest--;
            sep = memchr (in, 0xfe, rest);
            inleft = sep ? (size_t)(sep - in) : rest;
        }
        else /* EILSEQ */
        {
            if (!inleft)
                break;
            *out++ = '?';
            outleft--;
            in++;
            inleft--;
        }
    }
    *out = '\0';
    return t;
}

/*
 * Convert the NUL-terminated UTF-8 string inn to encoding enc using
 * iconv.  0xfe bytes are copied through as separators.  Unconvertible
 * characters become '?'.  If resultlen is non-NULL it receives the
 * number of bytes produced.  The result lives in a static buffer.
 */
const char *ConvFromUTF8 (const char *inn, UBYTE enc, size_t *resultlen)
{
    static char *t = NULL;
    static UDWORD size = 0;
    size_t inleft, outleft;
    char *out, *tmp;
    ICONV_CONST char *in;

    if (!inn)
        return conv_result ("", resultlen);

    t = s_catf (t, &size, "%*s", 100, "");
    *t = '\0';

    enc &= ~ENC_AUTO;
    if (!conv_nr)
        ConvEnc ("UTF-8");
    /* NOTE(review): the format string is empty in this copy of the file,
     * so enc is unused; the original message text may have been lost. */
    if (enc >= conv_nr || !enc || !conv_encs[enc].enc)
        return conv_result (s_sprintf ("", enc), resultlen);

    if (!conv_encs[enc].from || conv_encs[enc].from == (iconv_t)(-1))
    {
        conv_encs[enc].from = iconv_open (conv_encs[enc].enc, "UTF-8");
        if (conv_encs[enc].from == (iconv_t)(-1))
        {
            /* NOTE(review): empty format string, see above. */
            if (enc != ENC_UTF8)
                return conv_result (s_sprintf ("", enc, conv_encs[enc].enc), resultlen);
            return conv_result (s_sprintf ("%s", inn), resultlen);
        }
    }

    /* Reset the descriptor to its initial shift state. */
    iconv (conv_encs[enc].from, NULL, NULL, NULL, NULL);

    in = (ICONV_CONST char *)inn;
    out = t;
    inleft = strchr (in, 0xfe) ? (size_t)(strchr (in, 0xfe) - in) : strlen (in);
    outleft = size - 1;         /* always keep room for the final NUL */

    for (;;)
    {
        size_t res = iconv (conv_encs[enc].from, &in, &inleft, &out, &outleft);
        int    rc  = (res == (size_t)(-1)) ? errno : 0;

        /* The input is NUL-terminated, so *in is always readable here. */
        if (res != (size_t)(-1) && *in != (char)0xfe)
            break;

        if (outleft < 10 || rc == E2BIG)
        {
            size_t done = out - t;

            tmp = realloc (t, size + 50);
            if (!tmp)
                break;
            t = tmp;
            size += 50;
            outleft += 50;
            out = t + done;
        }
        else if (*in == (char)0xfe)
        {
            /* Copy the separator and set up the next segment. */
            *out++ = (char)0xfe;
            outleft--;
            in++;
            inleft = strchr (in, 0xfe) ? (size_t)(strchr (in, 0xfe) - in) : strlen (in);
        }
        else /* EILSEQ */
        {
            if (!inleft)
                break;
            *out++ = '?';
            outleft--;
            in++;
            inleft--;
            while (*in && ((*in & 0xc0) == 0x80)) /* skip continuation bytes */
                in++, inleft--;
        }
    }
    *out = '\0';
    if (resultlen)
        *resultlen = out - t;
    return t;
}

/*
 * Check whether the UTF-8 string in can be represented in encoding enc.
 * Separators and literal question marks are blanked first so that any
 * '?' in the converted result indicates a failed conversion.
 * Returns 1 if everything fits, 0 otherwise (also on allocation failure).
 */
BOOL ConvFits (const char *in, UBYTE enc)
{
    char *inn, *p;
    BOOL fits;

    inn = strdup (in);
    if (!inn)
        return 0;
    for (p = inn; *p; p++)
        if (*p == Conv0xFE || *p == '?')
            *p = ' ';
    /* ConvFromUTF8 returns its own static buffer, so the copy can go. */
    fits = strchr (ConvFromUTF8 (inn, enc, NULL), '?') ? 0 : 1;
    free (inn);
    return fits;
}

#else

/*
 * Decode one multi-byte UTF-8 sequence starting at *pin and advance
 * *pin past the bytes consumed.
 *
 * Returns the code point, '!' if the first byte is not a lead byte
 * (that byte is consumed), or '?' if a continuation byte is missing
 * (the offending byte is NOT consumed, so a terminating NUL is never
 * stepped over).
 */
static UDWORD conv_get_utf8 (const unsigned char **pin)
{
    const unsigned char *in = *pin;
    UDWORD lead = *in++;
    UDWORD val;
    int todo = 1;

    if ((lead & 0xc0) != 0xc0)
    {
        *pin = in;
        return '!';
    }

    /* Each further leading 1 bit announces one more continuation byte. */
    while (lead & 0x20)
    {
        todo++;
        lead <<= 1;
    }
    val = (lead & 0x3f) >> (todo - 1);

    while (todo > 0)
    {
        UDWORD c = *in;

        if ((c & 0xc0) != 0x80)
        {
            *pin = in;
            return '?';
        }
        in++;
        val = (val << 6) | (c & 0x3f);
        todo--;
    }
    *pin = in;
    return val;
}

/*
 * Reverse lookup in one of the 128-entry high-half tables below:
 * returns the byte (0x80..0xff) that maps to code point val, or '?'.
 */
static char conv_table_byte (const UDWORD *table, UDWORD val)
{
    UDWORD i;

    for (i = 0; i < 128; i++)
        if (table[i] == val)
            return (char)(i + 128);
    return '?';
}

/* Append the UTF-8 encoding of code point x to the local buffer t. */
#define PUT_UTF8(x) t = s_cat (t, &size, ConvUTF8 (x))

/* Decode one UTF-8 sequence at in into y, advancing in. */
#define GET_UTF8(in,y) ((y) = conv_get_utf8 (&(in)))

/* Code points for KOI8-U bytes 0x80..0xff, followed by a 0 terminator. */
const UDWORD koi8u_utf8[] = { /* 7bit are us-ascii */
    0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524,
    0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
    0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2022, 0x221a, 0x2248,
    0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7,
    0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457,
    0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x0491, 0x255d, 0x255e,
    /* 0xb4 is U+0404 (UKRAINIAN IE), not U+0403 */
    0x255f, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407,
    0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x0490, 0x256c, 0x00a9,
    0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433,
    0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e,
    0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432,
    0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a,
    0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413,
    0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e,
    0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412,
    0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a,
    0x0
};

/* Code points for CP1251 bytes 0x80..0xff, followed by a 0 terminator. */
const UDWORD win1251_utf8[] = { /* 7bit are us-ascii */
    0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
    0x0088, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
    0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x0098, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
    0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
    0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
    0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
    0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
    0x0
};

/*
 * Convert inn from encoding enc to UTF-8 using the builtin tables.
 *
 * totalin is honoured only for UCS-2BE (where the input may contain
 * zero bytes); (size_t)-1 means "use strlen".  All other encodings are
 * read up to the terminating NUL.  If keep0xfe is set, 0xfe bytes are
 * copied through unchanged.  Unsupported encodings yield '?' for every
 * non-ASCII byte.  The result lives in a static buffer.
 */
const char *ConvToUTF8 (const char *inn, UBYTE enc, size_t totalin, UBYTE keep0xfe)
{
    static char *t = NULL;
    static UDWORD size = 0;
    const unsigned char *in = (const unsigned char *)inn;
    UDWORD ucs, i;
#if 0
    unsigned char x, y;
#endif

    if (!inn)
        return "";

    enc &= ~ENC_AUTO;

    /* obey totalin _only_ for UCS-2BE */
    if (enc == ENC_UCS2BE)
    {
        if (totalin == (size_t)(-1))
            totalin = strlen (inn);
    }
    else
        totalin = 0;

    t = s_catf (t, &size, "%*s", (int)(totalin * 3), "");
    *t = '\0';

    /* Less than one complete UCS-2 code unit: nothing to convert. */
    if (enc == ENC_UCS2BE && totalin < 2)
        return t;

    for ( ; *in || totalin; in++)
    {
        if ((~*in & 0x80) && enc != ENC_UCS2BE)
        {
            t = s_catf (t, &size, "%c", *in);
            continue;
        }
        if (keep0xfe && *in == (unsigned char)Conv0xFE)
        {
            t = s_catf (t, &size, "\xfe");
            /* Keep the UCS-2BE byte budget in sync with the pointer. */
            if (enc == ENC_UCS2BE && --totalin < 2)
                return t;
            continue;
        }
        switch (enc)
        {
            case ENC_UTF8:
                GET_UTF8 (in, i);
                in--;           /* the loop increment steps forward again */
                PUT_UTF8 (i);
                continue;
            case ENC_LATIN1:
                PUT_UTF8 (*in);
                continue;
            case ENC_LATIN9:
                switch (*in)
                {
                    case 0xa4: ucs = 0x20ac; break; /* EURO */
                    case 0xa6: ucs = 0x0160; break; /* SCARON */
                    case 0xa8: ucs = 0x0161; break; /* SMALL SCARON */
                    case 0xb4: ucs = 0x017d; break; /* ZCARON */
                    case 0xb8: ucs = 0x017e; break; /* SMALL ZCARON */
                    case 0xbc: ucs = 0x0152; break; /* OE */
                    case 0xbd: ucs = 0x0153; break; /* SMALL OE */
                    case 0xbe: ucs = 0x0178; break; /* Y DIAERESIS */
                    default:   ucs = *in;    break;
                }
                PUT_UTF8 (ucs);
                continue;
            case ENC_KOI8:
                PUT_UTF8 (koi8u_utf8[*in & 0x7f]);
                continue;
            case ENC_WIN1251:
                PUT_UTF8 (win1251_utf8[*in & 0x7f]);
                continue;
            case ENC_UCS2BE:
                /* At least two bytes are available here. */
                ucs = ((UDWORD)in[0] << 8) | in[1];
                in++;           /* now on the low byte of this unit */
                totalin -= 2;
                if ((ucs & 0xfc00) == 0xd800)
                {
                    /* High surrogate: must be followed by a low one. */
                    UDWORD low;

                    if (totalin < 2)
                    {
                        PUT_UTF8 ('?');
                        return t;
                    }
                    low = ((UDWORD)in[1] << 8) | in[2];
                    if ((low & 0xfc00) != 0xdc00)
                    {
                        PUT_UTF8 ('?');
                        continue;   /* re-examine the following unit */
                    }
                    in += 2;
                    totalin -= 2;
                    ucs = 0x10000 + ((ucs & 0x3ff) << 10) + (low & 0x3ff);
                }
                PUT_UTF8 (ucs);
                if (totalin < 2)
                    return t;
                continue;
#if 0
            case ENC_EUC:
                /* FIXME: No, this is no real UTF-8. We just stuff EUC
                          into the private use area U+Fxxxx */
                PUT_UTF8 (0xf0000 | (*in << 8) | in[1]);
                in++;
                continue;
            case ENC_SJIS:
                x = *in++;
                y = *in;
                if ((x & 0xe0) == 0x80)
                {
                    if (y < 0x9f)
                    {
                        x = 2 * x - (x >= 0xe0 ? 0xe1 : 0x61);
                        y += 0x61 - (y >= 0x7f ? 1 : 0);
                    }
                    else
                    {
                        x = 2 * x - (x >= 0xe0 ? 0xe0 : 0x60);
                        y += 2;
                    }
                }
                PUT_UTF8 (0xf0000 | (x << 8) | y);
                continue;
#endif
            default:
                t = s_cat (t, &size, "?");
        }
    }
    return t;
}

/*
 * Convert the NUL-terminated UTF-8 string inn to encoding enc using
 * the builtin tables.
 *
 * For UCS-2BE the output is big-endian 16 bit units (code points above
 * U+FFFF as surrogate pairs) followed by two zero bytes.  For all other
 * encodings 0xfe bytes are copied through as separators and characters
 * without a mapping become '?'.  If resultlen is non-NULL it receives
 * the number of bytes produced (excluding terminators).  The result
 * lives in a static buffer.
 */
const char *ConvFromUTF8 (const char *inn, UBYTE enc, size_t *resultlen)
{
    static char *t = NULL;
    static UDWORD size = 0;
    const unsigned char *in = (const unsigned char *)inn;
    UDWORD val, i;
#if 0
    unsigned char x, y;
#endif

    if (!inn)
        return conv_result ("", resultlen);

    enc &= ~ENC_AUTO;

    t = s_catf (t, &size, "%*s", (int)strlen (inn), "");

    if (enc == ENC_UCS2BE)
    {
        /* At most two output bytes per input byte, plus the terminator. */
        size_t need = 2 * strlen (inn) + 2;
        char *p, *tmp;

        if (size < need)
        {
            tmp = realloc (t, need);
            if (!tmp)
                return conv_result ("", resultlen);
            t = tmp;
            size = need;
        }

        for (p = t; *in; )
        {
            if (~*in & 0x80)
                val = *(in++);
            else
            {
                val = '?';
                GET_UTF8 (in, val);
            }

            /* Neither lone surrogates nor values beyond U+10FFFF
             * can be expressed in UTF-16. */
            if (val > 0x10ffff || (val & 0xfffff800) == 0xd800)
                val = '?';

            if (val > 0xffff)
            {
                UDWORD v = val - 0x10000;

                *(p++) = 0xd8 | ((v >> 18) & 0x3);
                *(p++) = (v >> 10) & 0xff;
                *(p++) = 0xdc | ((v >> 8) & 0x3);
                *(p++) = v & 0xff;
            }
            else
            {
                *(p++) = (val >> 8) & 0xff;
                *(p++) = val & 0xff;
            }
        }
        if (resultlen)
            *resultlen = p - t;
        *(p++) = 0;
        *(p++) = 0;
        return t;
    }

    for (*t = '\0'; *in; in++)
    {
        if (~*in & 0x80)
        {
            t = s_catf (t, &size, "%c", *in);
            continue;
        }
        if (*in == 0xfe) /* we _do_ allow 0xFE here, it's the ICQ separator character */
        {
            t = s_catf (t, &size, "\xfe");
            continue;
        }
        val = '?';
        GET_UTF8 (in, val);
        in--;                   /* the loop increment steps forward again */
        if (val == '?')
        {
            t = s_catf (t, &size, "?");
            continue;
        }
        switch (enc)
        {
            case ENC_UTF8:
                PUT_UTF8 (val);
                continue;
            case ENC_LATIN1:
                if (!(val & 0xffffff00))
                    t = s_catf (t, &size, "%c", (UBYTE)val);
                else
                    t = s_catf (t, &size, "?");
                continue;
            case ENC_LATIN9:
                if (!(val & 0xffffff00))
                    t = s_catf (t, &size, "%c", (UBYTE)val);
                else
                    switch (val)
                    {
                        case 0x20ac: t = s_catf (t, &size, "\xa4"); continue; /* EURO */
                        case 0x0160: t = s_catf (t, &size, "\xa6"); continue; /* SCARON */
                        case 0x0161: t = s_catf (t, &size, "\xa8"); continue; /* SMALL SCARON */
                        case 0x017d: t = s_catf (t, &size, "\xb4"); continue; /* ZCARON */
                        case 0x017e: t = s_catf (t, &size, "\xb8"); continue; /* SMALL ZCARON */
                        case 0x0152: t = s_catf (t, &size, "\xbc"); continue; /* OE */
                        case 0x0153: t = s_catf (t, &size, "\xbd"); continue; /* SMALL OE */
                        case 0x0178: t = s_catf (t, &size, "\xbe"); continue; /* Y DIAERESIS */
                        default:
                            t = s_catf (t, &size, "?");
                    }
                continue;
            case ENC_KOI8:
                t = s_catf (t, &size, "%c", (UBYTE)conv_table_byte (koi8u_utf8, val));
                continue;
            case ENC_WIN1251:
                t = s_catf (t, &size, "%c", (UBYTE)conv_table_byte (win1251_utf8, val));
                continue;
            /* case ENC_UCS2BE: handled above */
#if 0
            case ENC_EUC:
                if ((val & 0xffff0000) != 0xf0000)
                {
                    t = s_catf (t, &size, "?");
                    continue;
                }
                t = s_catf (t, &size, "%c%c", (val & 0xff00) >> 8, val & 0xff);
                continue;
            case ENC_SJIS:
                if ((val & 0xffff0000) != 0xf0000)
                {
                    t = s_catf (t, &size, "?");
                    continue;
                }
                x = (val & 0xff00) >> 8;
                y = val & 0xff;
                if (x & 1)
                {
                    x = x / 2 + (x < 0xdf ? 0x31 : 0x71);
                    y -= 0x61 - (y < 0xe0 ? 0 : 1);
                }
                else
                {
                    x = x / 2 + (x < 0xdf ? 0x30 : 0x70);
                    y -= 2;
                }
                continue;
#endif
            default:
                t = s_cat (t, &size, "?");
        }
    }
    if (resultlen)
        *resultlen = strlen (t);
    return t;
}

#endif /* ENABLE_ICONV */
#endif /* ENABLE_UTF8 */