简介
这几天一直在研究中文的简体和繁体之间的转换问题,网上查了一下资料,在此进行整理和备份。
繁体中文常用GBK码和BIG5码两种编码(其中GBK是GB2312的超集,同时收录了简体字和繁体字),简体中文一般使用的是GB2312编码。
这些编码之间的转换基本都是使用下列3个函数:LCMapString、WideCharToMultiByte和MultiByteToWideChar,其中还会牵涉到UNICODE码(在Windows下即以wchar_t表示的UTF-16宽字符)和UTF-8码这两种编码。
GB2312编码与GBK编码可以直接使用LCMapString转换,GB2312编码/GBK编码与BIG5编码则无法直接转换,必须使用UNICODE作为中间编码进行中转。
另外UTF-8编码是网络常用编码,如XML文件和网页基本都是使用这种编码,所以在此也一并研究了一下。
下面是我将GB2312/GBK/BIG5/UNICODE/UTF-8这5种编码之间的转换编写到一个函数的代码。
代码
1 int Convert(void *sstr, int scp, void **dstr, int dcp) 2 { 3 #define CP_GBK 936 4 #define CP_BIG5 950 5 #define CP_UTF8 65001 6 7 enum { _unicode, _utf8, _gb2312, _gbk, _big5 }; 8 enum { _wc2mb, _mb2wc, _sc2tc, _tc2sc }; 9 10 LCID lcid; 11 void *src; 12 void *dest; 13 int cch; 14 int scp0; 15 int act; 16 UINT cp; 17 18 if (((scp < _unicode) || (scp > _big5)) || 19 ((dcp < _unicode) || (dcp > _big5))) 20 return -1; 21 22 src = NULL; 23 dest = sstr; 24 cch = 0; 25 scp0 = scp; 26 27 while (scp != dcp) 28 { 29 src = dest; 30 switch (scp) 31 { 32 case _unicode: 33 switch (dcp) 34 { 35 case _utf8: 36 scp = _utf8; 37 act = _wc2mb; 38 cp = CP_UTF8; 39 break; 40 case _gb2312: 41 scp = ((scp0 == _big5) ? _gbk : _gb2312); 42 act = _wc2mb; 43 cp = CP_GBK; 44 break; 45 case _gbk: 46 scp = _gbk; 47 act = _wc2mb; 48 cp = CP_GBK; 49 break; 50 case _big5: 51 scp = _big5; 52 act = _wc2mb; 53 cp = CP_BIG5; 54 break; 55 } 56 break; 57 case _utf8: 58 switch (dcp) 59 { 60 case _unicode: 61 case _gb2312: 62 case _gbk: 63 case _big5: 64 scp = _unicode; 65 act = _mb2wc; 66 cp = CP_UTF8; 67 break; 68 } 69 break; 70 case _gb2312: 71 switch (dcp) 72 { 73 case _unicode: 74 case _utf8: 75 scp = _unicode; 76 act = _mb2wc; 77 cp = CP_GBK; 78 break; 79 case _gbk: 80 case _big5: 81 scp = _gbk; 82 act = _sc2tc; 83 break; 84 } 85 break; 86 case _gbk: 87 switch (dcp) 88 { 89 case _unicode: 90 case _utf8: 91 case _big5: 92 scp = _unicode; 93 act = _mb2wc; 94 cp = CP_GBK; 95 break; 96 case _gb2312: 97 scp = _gb2312; 98 act = _tc2sc; 99 break; 100 } 101 break; 102 case _big5: 103 switch (dcp) 104 { 105 case _unicode: 106 case _utf8: 107 case _gb2312: 108 case _gbk: 109 scp = _unicode; 110 act = _mb2wc; 111 cp = CP_BIG5; 112 break; 113 } 114 break; 115 } 116 117 switch (act) 118 { 119 case _wc2mb: 120 cch = WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, NULL, 0, NULL, NULL); 121 dest = malloc(cch * sizeof(char)); 122 WideCharToMultiByte(cp, 0, (wchar_t *)src, -1, (char *)dest, cch, NULL, NULL); 123 
break; 124 case _mb2wc: 125 cch = MultiByteToWideChar(cp, 0, (char *)src, -1, NULL, 0); 126 dest = malloc(cch * sizeof(wchar_t)); 127 MultiByteToWideChar(cp, 0, (char *)src, -1, (wchar_t *)dest, cch); 128 break; 129 case _sc2tc: 130 lcid = GetSystemDefaultLCID(); 131 cch = LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, NULL, 0); 132 dest = malloc(cch * sizeof(char)); 133 LCMapString(lcid, LCMAP_TRADITIONAL_CHINESE, (char *)src, -1, (char *)dest, cch); 134 break; 135 case _tc2sc: 136 lcid = GetSystemDefaultLCID(); 137 cch = LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, NULL, 0); 138 dest = malloc(cch * sizeof(char)); 139 LCMapString(lcid, LCMAP_SIMPLIFIED_CHINESE, (char *)src, -1, (char *)dest, cch); 140 break; 141 } 142 143 if (src && (src != sstr)) 144 { 145 free(src); 146 } 147 } 148 149 if (dstr) 150 { 151 *dstr = dest; 152 } 153 else 154 { 155 free(dest); 156 } 157 158 return cch; 159 }
参数说明
sstr:[in] 源字符串的首地址。由于可能是char *或wchar_t *两种数据类型,所以这里我设置为了void *类型。
scp:[in] 源字符串的编码方式,0:UNICODE编码、1:UTF-8编码、2:GB2312编码、3:GBK编码、4:BIG5编码。
dstr:[out] 指向目标字符串地址的指针。由于可能是char **或wchar_t **两种数据类型,所以这里我设置为了void **类型;转换得到的目标字符串由函数内部使用malloc分配,使用完毕后需要调用free释放。
dcp:[in] 目标字符串的编码方式,取值范围与scp相同。
返回值:编码参数非法时返回-1;scp与dcp不同时返回转换后目标字符串的字符数(包含结尾的'\0')。
函数使用
由于编码方式的数值比较难记忆,所以我将任意两种编码之间的转换定义成了如下的宏
/*
 * Convenience wrappers around Convert(), one per ordered pair of encodings.
 *
 * Encoding codes passed to Convert():
 *   0 = UNICODE (wchar_t), 1 = UTF-8, 2 = GB2312, 3 = GBK, 4 = BIG5
 *
 * `src` is the source string and `dest` is the address of the pointer that
 * receives the converted string. Each wrapper evaluates to Convert()'s
 * return value.
 */

/* Shared expansion: casts both pointer arguments and forwards the codes. */
#define CONVERT_BETWEEN_(src, from, dest, to) \
    Convert((void *)(src), (from), (void **)(dest), (to))

/* From UNICODE (0) */
#define UnicodeToUtf8(src, dest)    CONVERT_BETWEEN_(src, 0, dest, 1)
#define UnicodeToGb2312(src, dest)  CONVERT_BETWEEN_(src, 0, dest, 2)
#define UnicodeToGbk(src, dest)     CONVERT_BETWEEN_(src, 0, dest, 3)
#define UnicodeToBig5(src, dest)    CONVERT_BETWEEN_(src, 0, dest, 4)

/* From UTF-8 (1) */
#define Utf8ToUnicode(src, dest)    CONVERT_BETWEEN_(src, 1, dest, 0)
#define Utf8ToGb2312(src, dest)     CONVERT_BETWEEN_(src, 1, dest, 2)
#define Utf8ToGbk(src, dest)        CONVERT_BETWEEN_(src, 1, dest, 3)
#define Utf8ToBig5(src, dest)       CONVERT_BETWEEN_(src, 1, dest, 4)

/* From GB2312 (2) */
#define Gb2312ToUnicode(src, dest)  CONVERT_BETWEEN_(src, 2, dest, 0)
#define Gb2312ToUtf8(src, dest)     CONVERT_BETWEEN_(src, 2, dest, 1)
#define Gb2312ToGbk(src, dest)      CONVERT_BETWEEN_(src, 2, dest, 3)
#define Gb2312ToBig5(src, dest)     CONVERT_BETWEEN_(src, 2, dest, 4)

/* From GBK (3) */
#define GbkToUnicode(src, dest)     CONVERT_BETWEEN_(src, 3, dest, 0)
#define GbkToUtf8(src, dest)        CONVERT_BETWEEN_(src, 3, dest, 1)
#define GbkToGb2312(src, dest)      CONVERT_BETWEEN_(src, 3, dest, 2)
#define GbkToBig5(src, dest)        CONVERT_BETWEEN_(src, 3, dest, 4)

/* From BIG5 (4) */
#define Big5ToUnicode(src, dest)    CONVERT_BETWEEN_(src, 4, dest, 0)
#define Big5ToUtf8(src, dest)       CONVERT_BETWEEN_(src, 4, dest, 1)
#define Big5ToGb2312(src, dest)     CONVERT_BETWEEN_(src, 4, dest, 2)
#define Big5ToGbk(src, dest)        CONVERT_BETWEEN_(src, 4, dest, 3)
测试代码如下:
/*
 * Round-trip demo: convert a GB2312 literal to BIG5 and back, printing both
 * results.
 *
 * NOTE(review): the literal below is passed as GB2312 bytes, so this assumes
 * the compiler's execution character set is GB2312/GBK (code page 936) -
 * confirm the source-file encoding and compiler settings.
 *
 * Returns 0 when both conversions succeed, 1 otherwise.
 */
int main(void)
{
    /* Start from NULL so the free() calls below are safe on every path. */
    char *p0 = NULL;
    char *p1 = NULL;
    int status = 1;

    if (Gb2312ToBig5("中华人民共和国", &p0) > 0 && p0 != NULL) {
        printf("%s\n", p0);

        if (Big5ToGb2312(p0, &p1) > 0 && p1 != NULL) {
            printf("%s\n", p1);
            status = 0;
        }
    }

    free(p0);
    free(p1);

    return status;
}
时间: 2024-10-26 05:23:54