中文字符转拼音源码

中文字符转拼音源码
2011-11-21      0个评论      
收藏    我要投稿

注意:以下代码只对 GB2312 编码的输入有效,且拼音表仅覆盖按拼音排序的一级汉字(编码 0xB0A1–0xD7F9)。

/*
 * Boundary code values at which the pinyin reading changes.
 *
 * Each entry is a two-byte character code packed as (lead byte << 8) | trail
 * byte. The values are listed in ascending order, which get_pin relies on for
 * its binary search. Entry i is paired with str_pin[i]: get_pin returns
 * str_pin[i] for the largest i whose code_pin[i] does not exceed the code
 * being looked up.
 */

static const unsigned short code_pin[] = {

0xb0a1,0xb0a3,0xb0b0,0xb0b9,0xb0bc,0xb0c5,0xb0d7,0xb0df,0xb0ee,0xb0fa,0xb1ad,0xb1bc,0xb1c0,0xb1c6,

0xb1de,0xb1ea,0xb1ee,0xb1f2,0xb1f8,0xb2a3,0xb2b8,0xb2c1,0xb2c2,0xb2cd,0xb2d4,0xb2d9,0xb2de,0xb2e3,

0xb2e5,0xb2f0,0xb2f3,0xb2fd,0xb3ac,0xb3b5,0xb3bb,0xb3c5,0xb3d4,0xb3e4,0xb3e9,0xb3f5,0xb4a7,0xb4a8,

0xb4af,0xb4b5,0xb4ba,0xb4c1,0xb4c3,0xb4cf,0xb4d5,0xb4d6,0xb4da,0xb4dd,0xb4e5,0xb4e8,0xb4ee,0xb4f4,

0xb5a2,0xb5b1,0xb5b6,0xb5c2,0xb5c5,0xb5cc,0xb5df,0xb5ef,0xb5f8,0xb6a1,0xb6aa,0xb6ab,0xb6b5,0xb6bc,

0xb6cb,0xb6d1,0xb6d5,0xb6de,0xb6ea,0xb6f7,0xb6f8,0xb7a2,0xb7aa,0xb7bb,0xb7c6,0xb7d2,0xb7e1,0xb7f0,

0xb7f1,0xb7f2,0xb8c1,0xb8c3,0xb8c9,0xb8d4,0xb8dd,0xb8e7,0xb8f8,0xb8f9,0xb8fb,0xb9a4,0xb9b3,0xb9bc,

0xb9ce,0xb9d4,0xb9d7,0xb9e2,0xb9e5,0xb9f5,0xb9f8,0xb9fe,0xbaa1,0xbaa8,0xbabb,0xbabe,0xbac7,0xbad9,

0xbadb,0xbadf,0xbae4,0xbaed,0xbaf4,0xbba8,0xbbb1,0xbbb6,0xbbc4,0xbbd2,0xbbe7,0xbbed,0xbbf7,0xbcce,

0xbcdf,0xbda9,0xbdb6,0xbdd2,0xbded,0xbea3,0xbebc,0xbebe,0xbecf,0xbee8,0xbeef,0xbef9,0xbfa6,0xbfaa,

0xbfaf,0xbfb5,0xbfbc,0xbfc0,0xbfcf,0xbfd3,0xbfd5,0xbfd9,0xbfdd,0xbfe4,0xbfe9,0xbfed,0xbfef,0xbff7,

0xc0a4,0xc0a8,0xc0ac,0xc0b3,0xc0b6,0xc0c5,0xc0cc,0xc0d5,0xc0d7,0xc0e2,0xc0e5,0xc1a9,0xc1aa,0xc1b8,

0xc1c3,0xc1d0,0xc1d5,0xc1e1,0xc1ef,0xc1fa,0xc2a5,0xc2ab,0xc2bf,0xc2cd,0xc2d3,0xc2d5,0xc2dc,0xc2e8,

0xc2f1,0xc2f7,0xc3a2,0xc3a8,0xc3b4,0xc3b5,0xc3c5,0xc3c8,0xc3d0,0xc3de,0xc3e7,0xc3ef,0xc3f1,0xc3f7,

0xc3fd,0xc3fe,0xc4b1,0xc4b4,0xc4c3,0xc4ca,0xc4cf,0xc4d2,0xc4d3,0xc4d8,0xc4d9,0xc4db,0xc4dc,0xc4dd,

0xc4e8,0xc4ef,0xc4f1,0xc4f3,0xc4fa,0xc4fb,0xc5a3,0xc5a7,0xc5ab,0xc5ae,0xc5af,0xc5b0,0xc5b2,0xc5b6,

0xc5b7,0xc5be,0xc5c4,0xc5ca,0xc5d2,0xc5d7,0xc5de,0xc5e7,0xc5e9,0xc5f7,0xc6aa,0xc6ae,0xc6b2,0xc6b4,

0xc6b9,0xc6c2,0xc6cb,0xc6da,0xc6fe,0xc7a3,0xc7b9,0xc7c1,0xc7d0,0xc7d5,0xc7e0,0xc7ed,0xc7ef,0xc7f7,

0xc8a6,0xc8b1,0xc8b9,0xc8bb,0xc8bf,0xc8c4,0xc8c7,0xc8c9,0xc8d3,0xc8d5,0xc8d6,0xc8e0,0xc8e3,0xc8ed,

0xc8ef,0xc8f2,0xc8f4,0xc8f6,0xc8f9,0xc8fd,0xc9a3,0xc9a6,0xc9aa,0xc9ad,0xc9ae,0xc9af,0xc9b8,0xc9ba,

0xc9ca,0xc9d2,0xc9dd,0xc9e9,0xc9f9,0xcaa6,0xcad5,0xcadf,0xcba2,0xcba4,0xcba8,0xcbaa,0xcbad,0xcbb1,

0xcbb5,0xcbb9,0xcbc9,0xcbd1,0xcbd4,0xcbe1,0xcbe4,0xcbef,0xcbf2,0xcbfa,0xcca5,0xccae,0xccc0,0xcccd,

0xccd8,0xccd9,0xccdd,0xccec,0xccf4,0xccf9,0xccfc,0xcda8,0xcdb5,0xcdb9,0xcdc4,0xcdc6,0xcdcc,0xcdcf,

0xcdda,0xcde1,0xcde3,0xcdf4,0xcdfe,0xcec1,0xcecb,0xcece,0xced7,0xcef4,0xcfb9,0xcfc6,0xcfe0,0xcff4,

0xd0a8,0xd0bd,0xd0c7,0xd0d6,0xd0dd,0xd0e6,0xd0f9,0xd1a5,0xd1ab,0xd1b9,0xd1c9,0xd1ea,0xd1fb,0xd2ac,

0xd2bb,0xd2f0,0xd3a2,0xd3b4,0xd3b5,0xd3c4,0xd3d9,0xd4a7,0xd4bb,0xd4c5,0xd4d1,0xd4d4,0xd4db,0xd4df,

0xd4e2,0xd4f0,0xd4f4,0xd4f5,0xd4f6,0xd4fa,0xd5aa,0xd5b0,0xd5c1,0xd5d0,0xd5da,0xd5e4,0xd5f4,0xd6a5,

0xd6d0,0xd6db,0xd6e9,0xd7a5,0xd7a7,0xd7a8,0xd7ae,0xd7b5,0xd7bb,0xd7bd,0xd7c8,0xd7d7,0xd7de,0xd7e2,

0xd7ea,0xd7ec,0xd7f0,0xd7f2 };

/*
 * Pinyin syllables, without tone marks, index-parallel to code_pin:
 * str_pin[i] is the reading returned for codes that fall in the interval
 * starting at code_pin[i]. The letter "v" stands in for "ü" (see "lv", "nv").
 */

static const char *str_pin[] = {

"a","ai","an","ang","ao","ba","bai","ban","bang","bao","bei","ben","beng","bi","bian","biao",

"bie","bin","bing","bo","bu","ca","cai","can","cang","cao","ce","ceng","cha","chai","chan",

"chang","chao","che","chen","cheng","chi","chong","chou","chu","chuai","chuan","chuang","chui",

"chun","chuo","ci","cong","cou","cu","cuan","cui","cun","cuo","da","dai","dan","dang","dao",

"de","deng","di","dian","diao","die","ding","diu","dong","dou","du","duan","dui","dun","duo",

"e","en","er","fa","fan","fang","fei","fen","feng","fo","fou","fu","ga","gai","gan","gang",

"gao","ge","gei","gen","geng","gong","gou","gu","gua","guai","guan","guang","gui","gun",

"guo","ha","hai","han","hang","hao","he","hei","hen","heng","hong","hou","hu","hua","huai",

"huan","huang","hui","hun","huo","ji","jia","jian","jiang","jiao","jie","jin","jing",

"jiong","jiu","ju","juan","jue","jun","ka","kai","kan","kang","kao","ke","ken","keng",

"kong","kou","ku","kua","kuai","kuan","kuang","kui","kun","kuo","la","lai","lan","lang",

"lao","le","lei","leng","li","lia","lian","liang","liao","lie","lin","ling","liu","long","lou",

"lu","lv","luan","lue","lun","luo","ma","mai","man","mang","mao","me","mei","men","meng",

"mi","mian","miao","mie","min","ming","miu","mo","mou","mu","na","nai","nan","nang","nao","ne",

"nei","nen","neng","ni","nian","niang","niao","nie","nin","ning","niu","nong","nu","nv","nuan",

"nue","nuo","o","ou","pa","pai","pan","pang","pao","pei","pen","peng","pi","pian","piao","pie",

"pin","ping","po","pu","qi","qia","qian","qiang","qiao","qie","qin","qing","qiong","qiu","qu",

"quan","que","qun","ran","rang","rao","re","ren","reng","ri","rong","rou","ru","ruan","rui",

"run","ruo","sa","sai","san","sang","sao","se","sen","seng","sha","shai","shan","shang","shao",

"she","shen","sheng","shi","shou","shu","shua","shuai","shuan","shuang","shui","shun","shuo",

"si","song","sou","su","suan","sui","sun","suo","ta","tai","tan","tang","tao","te","teng",

"ti","tian","tiao","tie","ting","tong","tou","tu","tuan","tui","tun","tuo","wa","wai","wan",

"wang","wei","wen","weng","wo","wu","xi","xia","xian","xiang","xiao","xie","xin","xing",

"xiong","xiu","xu","xuan","xue","xun","ya","yan","yang","yao","ye","yi","yin","ying","yo",

"yong","you","yu","yuan","yue","yun","za","zai","zan","zang","zao","ze","zei","zen","zeng",

"zha","zhai","zhan","zhang","zhao","zhe","zhen","zheng","zhi","zhong","zhou","zhu","zhua",

"zhuai","zhuan","zhuang","zhui","zhun","zhuo","zi","zong","zou","zu","zuan","zui","zun","zuo"};

/*
 * Number of entries in code_pin. get_pin uses the same index for str_pin,
 * so str_pin is expected to hold the same number of entries.
 */

/* size_t must be declared here: the file's other #include lines come later. */
#include <stddef.h>

/* Divide by the element size rather than a hard-coded type so the count
   stays correct if the element type of code_pin ever changes. */
static const size_t SIZE_ARRAY = sizeof(code_pin) / sizeof(code_pin[0]);

#include <iostream>

#include <string>

using std::string;

using std::cout;

using std::endl;

/*
 * Look up the pinyin syllable of a single GB2312 Chinese character.
 *
 * char_zh: the two-byte code packed as (lead byte << 8) | trail byte.
 *
 * Returns a pointer to a static, NUL-terminated string taken from str_pin.
 * The table only covers the pinyin-sorted block of GB2312 (level-1 hanzi,
 * from code_pin[0] up to 0xd7f9). For any code outside that block, or with a
 * trail byte outside 0xa1..0xfe, an empty string is returned instead of a
 * misleading syllable; the result is never a null pointer, so callers may
 * append it unconditionally.
 */
const char* get_pin(unsigned short char_zh)
{
    /* Last code point of the pinyin-sorted block; later characters are
       ordered by radical, so the boundary table says nothing about them. */
    static const unsigned short LAST_SORTED_CODE = 0xd7f9;

    const unsigned trail = char_zh & 0xffu;

    if (SIZE_ARRAY == 0 || char_zh < code_pin[0] || char_zh > LAST_SORTED_CODE
        || trail < 0xa1u || trail > 0xfeu)
    {
        return "";
    }

    /* Find the last boundary that does not exceed char_zh.
       Invariant: code_pin[low] <= char_zh, and either high == SIZE_ARRAY
       or code_pin[high] > char_zh. */
    size_t low = 0;
    size_t high = SIZE_ARRAY;
    while (high - low > 1)
    {
        const size_t mid = low + (high - low) / 2;
        if (code_pin[mid] <= char_zh)
        {
            low = mid;
        }
        else
        {
            high = mid;
        }
    }

    return str_pin[low];
}

/*
 * Convert a mixed Chinese/ASCII string to pinyin with no separators.
 *
 * input: bytes in which every byte below 0x80 is a one-byte character and
 *        every byte at or above 0x80 is the lead byte of a two-byte
 *        character (the file's header states GB2312 is expected).
 *
 * Returns a new string in which one-byte characters are copied unchanged and
 * each two-byte character is replaced by whatever get_pin returns for it.
 * A lead byte that is the last byte of the input has no trail byte to pair
 * with and is dropped rather than being combined with the string terminator.
 */
string str_to_pin(string const &input)
{
    string result;
    const string::size_type length = input.length();

    for (string::size_type i = 0; i < length; ++i)
    {
        /* Work on unsigned bytes so the 0x80 test does not depend on
           whether plain char is signed. */
        const unsigned char lead = static_cast<unsigned char>(input[i]);

        if (lead < 0x80)
        {
            result.append(1, static_cast<char>(lead));
            continue;
        }

        if (i + 1 >= length)
        {
            break; /* truncated two-byte character at end of input */
        }

        const unsigned char trail = static_cast<unsigned char>(input[++i]);
        const unsigned short char_zh =
            static_cast<unsigned short>((lead << 8) | trail);
        result.append(get_pin(char_zh));
    }

    return result;
}

/*
 * Convert a mixed Chinese/ASCII string to pinyin, separating syllables with
 * single spaces.
 *
 * input: bytes in which every byte below 0x80 is a one-byte character and
 *        every byte at or above 0x80 is the lead byte of a two-byte
 *        character (the file's header states GB2312 is expected).
 *
 * Returns a new string built as follows:
 *   - a two-byte character is replaced by get_pin's result, preceded by a
 *     space unless it is the very first character of the input;
 *   - a one-byte character is copied unchanged, preceded by a space when the
 *     byte just before it is at or above 0x80 (i.e. it follows a two-byte
 *     character).
 * A lead byte that is the last byte of the input is dropped.
 */
string str_to_pin_space(string const &input)
{
    string result;
    const string::size_type length = input.length();

    for (string::size_type i = 0; i < length; ++i)
    {
        const unsigned char lead = static_cast<unsigned char>(input[i]);

        if (lead < 0x80)
        {
            /* Compare the previous byte as unsigned: testing plain char
               against 0 only works where char is signed. */
            if (i > 0 && static_cast<unsigned char>(input[i - 1]) >= 0x80)
            {
                result.append(1, ' ');
            }
            result.append(1, static_cast<char>(lead));
            continue;
        }

        if (i + 1 >= length)
        {
            break; /* truncated two-byte character at end of input */
        }

        if (i > 0)
        {
            result.append(1, ' ');
        }

        const unsigned char trail = static_cast<unsigned char>(input[++i]);
        const unsigned short char_zh =
            static_cast<unsigned short>((lead << 8) | trail);
        result.append(get_pin(char_zh));
    }

    return result;
}

/*
 * Convert a string of Chinese characters, letters and digits to pinyin,
 * placing one space between consecutive runs of different kinds.
 *
 * input: must contain only those three kinds of characters; the caller is
 *        responsible for that. Bytes are classified as follows:
 *          - up to '9'            : "digit" class (this also catches every
 *                                   byte below '0', such as a space);
 *          - 'A' through 'z'      : "letter" class;
 *          - anything else        : lead byte of a two-byte character.
 *        Other ASCII punctuation therefore violates the precondition and
 *        would be consumed as a lead byte.
 *
 * Returns a new string that always starts with one space, followed by the
 * converted text:
 *   - a digit-class byte is preceded by a space when the previous byte is
 *     not digit-class;
 *   - a letter-class byte is preceded by a space when the previous byte is
 *     digit-class or at or above 0x80;
 *   - a two-byte character is replaced by get_pin's result, preceded by a
 *     space unless it is the first character of the input.
 * A lead byte that is the last byte of the input is dropped.
 */
string str_to_pin_space_(string const &input)
{
    string result(1, ' ');
    const string::size_type length = input.length();

    for (string::size_type i = 0; i < length; ++i)
    {
        /* All comparisons use unsigned bytes so the spacing does not depend
           on whether plain char is signed on the target platform. */
        const unsigned char cur = static_cast<unsigned char>(input[i]);
        const unsigned char prev =
            i > 0 ? static_cast<unsigned char>(input[i - 1]) : 0;

        if (cur <= '9')
        {
            if (i > 0 && prev > '9')
            {
                result.append(1, ' ');
            }
            result.append(1, static_cast<char>(cur));
        }
        else if (cur >= 'A' && cur <= 'z')
        {
            if (i > 0 && (prev >= 0x80 || prev <= '9'))
            {
                result.append(1, ' ');
            }
            result.append(1, static_cast<char>(cur));
        }
        else
        {
            if (i + 1 >= length)
            {
                break; /* truncated two-byte character at end of input */
            }

            if (i > 0)
            {
                result.append(1, ' ');
            }

            const unsigned char trail = static_cast<unsigned char>(input[++i]);
            const unsigned short char_zh =
                static_cast<unsigned short>((cur << 8) | trail);
            result.append(get_pin(char_zh));
        }
    }

    return result;
}

/*
 * Demo driver: converts one sample string mixing letters, digits and Chinese
 * characters with str_to_pin_space_ and prints the result on its own line.
 * NOTE(review): the literal holds non-ASCII bytes; presumably this file must
 * be saved in GB2312 for the conversion to produce pinyin - confirm.
 */
int main()
{
    const string converted =
        str_to_pin_space_("zhong hua人民dfd啊a152左边35gdaf共和国");

    cout << converted << endl;

    return 0;
}

摘自 学无止境的blog

时间: 2024-09-29 12:34:11

中文字符转拼音源码的相关文章

使用扫描软件扫描含有中文字符的二维码显示乱码?

在使用中琅领跑条码标签打印软件制作并打印二维码时,有些朋友也会遇到这种问题:使用扫描软件扫描含有中文字符的二维码时,扫描界面显示为一串问号(即乱码).是什么原因造成扫描中文乱码的呢?又该如何解决呢?今天就来简单介绍一下. 首先,通过中琅领跑条码标签打印软件左侧工具栏的"绘制二维条码"按钮,在画布空白区域绘制出一个二维码. 在二维码上点击鼠标右键选择"属性",在弹出的属性窗口中,选择"数据源"选项,在左侧"数据对象"栏中,先点击

ctrl c 中文字符到 vnc 里,中文字符已经被转码

为了测试程序对多语言字符的支持情况,我找来一段中文和北欧的文字,希望把这些文字上传到elasticsearch,并能正确显示. 首先测试了北欧文字,一切OK. 但是中文复制到 VNC 客户端(Linux)后却是问号,因为Linux本来就打不出中文,所以显示乱码我也没在意,我觉得中文的编码无非就是一坨二进制的东西,我又没有改变什么,显示问号只是 linux 无法解析而已.跑了下程序,然后到elasticsearch查询结果,中文部分依然显示的是问号. 接下来就几个想法,首先是,程序在某处应该设置c

WP8_UTF8 to GB2312转码 (url网址中带中文字符的处理)

直接使用例如:http://www.abc.php?name=中文符 ,客户端调用,在服务端修改后,会出现乱码, 而windows phone 又不能直接支持gb2312, 经过大量分析和验证,发现 凡事 经过"从汉字转换到16进制"后, 即可成功实现转码,代码如下: string text = "中文符"; string url1 = "http://www.abc.php?name="; string url2 = url1 + Utf8To

浅析pinyin4j源码 简单利用pinyin4j对中文字符进行自然排序

pinyin4j项目  官网地址 http://pinyin4j.sourceforge.net/ 我们先把资源下载下来,连同源码和jar包一起放入工程.如下图: 接下来在demo包下,我们写一个测试类,简单使用pinyin4j对中文字符进行自然排序 新建一个ConvertTest.java package demo; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; im

浅析pinyin4j源码 简单利用pinyin4j对中文字符进行自然排序(转)

pinyin4j项目  官网地址 http://pinyin4j.sourceforge.net/ 我们先把资源下载下来,连同源码和jar包一起放入工程.如下图: 接下来在demo包下,我们写一个测试类,简单使用pinyin4j对中文字符进行自然排序 新建一个ConvertTest.java package demo; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; im

使用 URLDecoder 和 URLEncoder 对中文字符进行编码和解码

摘要: URLDecoder 和 URLEncoder 用于完成普通字符串 和 application/x-www-form-urlencoded MIME 字符串之间的相互转换.在本文中,我们以使用URLDecoder解决GET请求中文乱码问题为场景说明 URLDecoder/URLEncoder 的用法,并给出了 application/x-www-form-urlencoded MIME 字符串的编码规则. 一. URLDecoder/URLEncoder 使用场景概述 URLDecode

c#中文转全拼或首拼

参考:http://www.jb51.net/article/42217.htmhttp://blog.csdn.net/cstester/article/details/4758172 ChineseToPinyinHelper.cs: using System; using System.Collections.Generic; using System.Text; using System.Text.RegularExpressions; namespace FisherComom { /

JAVA中文字符编码问题

JAVA的中文字符乱码问题一直很让人头疼.特别是在WEB应用中.网上的分析文章和解决方案都很多,但总是针对某些特定情况的.很多次遇到乱码问 题后, 经过极为辛苦的调试和搜索资料后终于解决,满以为自己已经掌握了对付这些字符乱码怪兽的诀窍.可当过段时间,换了个应用或换了个环境,又会碰到那讨厌的火 星文,并再次无所适从.于是下决心好好整理一下中文字符编码问题,以方便自己记忆,也为其他程序员兄弟们提供一份参考. 首先要了解JAVA处理字符的原理.JAVA使用UNICODE来存储字符数据,处理字符时通常有

匹配中文字符的正则表达式: [/u4e00-/u9fa5]

原文:匹配中文字符的正则表达式: [/u4e00-/u9fa5] 这里是几个主要非英文语系字符范围(google上找到的): 2E80-33FFh:中日韩符号区.收容康熙字典部首.中日韩辅助部首.注音符号.日本假名.韩文音符,中日韩的符号.标点.带圈或带括符文数字.月份,以及日本的假名组合.单位.年号.月份.日期.时间等. 3400-4DFFh:中日韩认同表意文字扩充A区,总计收容6,582个中日韩汉字. 4E00-9FFFh:中日韩认同表意文字区,总计收容20,902个中日韩汉字. A000-