相信一定有不少的程序開發人員時常會遇到字符編碼的問題,而這個問題也是非常讓人頭痛的。因為這些都是潛在的錯誤,要找出這些錯誤也得要有這方面的開發經驗才行。特別是在處理xml文檔時
,該問題的出現就更加的頻繁了,有一次用java寫服務器端程序,用vc寫客戶端與之交互。交互的協議都是用xml寫的。結果在通訊時老是發現數據接受不正確。納悶!于是用抓取網絡數據包工具抓取數據,后來才發現原來是java上xml的頭是這樣的<?xml
version="1.0"
encoding="UTF-8"?>,而vc上默認的是GB2312。所以一遇到漢字數據就不正確了。去網上找資料,這方面的文章好象特別少,針對像這樣的問題,下面我介紹一下我自己寫的一個轉換程序。當然,程序很簡單。如果有畫蛇添足的地方,還望各位高手一笑了之。 如果您對UTF-8、Unicode、GB2312等還是很陌生的話,請查看http://www./books/UTF-8-Unicode.html,我這里就不浪費口舌了。下面介紹一下WinAPI的兩個函數:WideCharToMultiByte、MultiByteToWideChar。
函數原型:
// Win32 API prototype, quoted here for reference.
int WideCharToMultiByte(
UINT CodePage, // code page
DWORD dwFlags, // performance and mapping flags
LPCWSTR lpWideCharStr, // wide-character string
int cchWideChar, // number of chars in string
LPSTR lpMultiByteStr, // buffer for new string
int cbMultiByte, // size of buffer
LPCSTR lpDefaultChar, // default for unmappable chars
LPBOOL lpUsedDefaultChar // set when default char used
); // Converts wide characters into multiple narrow (multibyte) characters.
int MultiByteToWideChar(
UINT CodePage, // code page
DWORD dwFlags, // character-type options
LPCSTR lpMultiByteStr, // string to map
int cbMultiByte, // number of bytes in string
LPWSTR lpWideCharStr, // wide-character buffer
int cchWideChar // size of buffer
);//將多個(gè)窄字符轉(zhuǎn)換成寬字符 需要用到的一些函數(shù):CString CXmlProcess::HexToBin(CString string)//將16進(jìn)制數(shù)轉(zhuǎn)換成2進(jìn)制
{
if( string == "0") return "0000";
if( string == "1") return "0001";
if( string == "2") return "0010";
if( string == "3") return "0011";
if( string == "4") return "0100";
if( string == "5") return "0101";
if( string == "6") return "0110";
if( string == "7") return "0111";
if( string == "8") return "1000";
if( string == "9") return "1001";
if( string == "a") return "1010";
if( string == "b") return "1011";
if( string == "c") return "1100";
if( string == "d") return "1101";
if( string == "e") return "1110";
if( string == "f") return "1111";
return "";
}
// Converts a 4-character binary string into a single lowercase hexadecimal digit.
//
// Parameters:
//   BinString - one of the sixteen patterns "0000" .. "1111".
// Returns:
//   The matching digit "0" .. "f", or an empty string for any other input.
CString CXmlProcess::BinToHex(CString BinString)
{
    // The two tables are parallel: kBitPatterns[n] is the pattern of digit kHexDigits[n].
    static const char* const kBitPatterns[16] = {
        "0000", "0001", "0010", "0011", "0100", "0101", "0110", "0111",
        "1000", "1001", "1010", "1011", "1100", "1101", "1110", "1111"
    };
    static const char* const kHexDigits[16] = {
        "0", "1", "2", "3", "4", "5", "6", "7",
        "8", "9", "a", "b", "c", "d", "e", "f"
    };

    for (int nibble = 0; nibble < 16; ++nibble)
    {
        if (BinString == kBitPatterns[nibble])
            return kHexDigits[nibble];
    }
    return "";
}
// Converts a string of binary digits into its integer value.
//
// Parameters:
//   string - characters '0' and '1', most significant bit first.
// Returns:
//   The numeric value of the bit string; 0 for an empty string.
//
// The previous implementation weighted character i with 2^(7-i), which is
// only right for strings of exactly eight characters. Accumulating from the
// left gives the same result for eight characters and the correct value for
// every other length that fits in an int.
int CXmlProcess::BinToInt(CString string)
{
    int value = 0;
    const int length = string.GetLength();
    for (int i = 0; i < length; ++i)
    {
        // Shift what has been read so far by one bit and append the next digit.
        value = 2 * value + (static_cast<int>(string.GetAt(i)) - '0');
    }
    return value;
}

// UTF-8 to GB2312: first convert UTF-8 to Unicode, then convert the Unicode
// value to GB2312 with WideCharToMultiByte.
// Decodes one three-byte UTF-8 sequence into a single UTF-16 code unit.
//
// Parameters:
//   ustart - pointer to the first byte of the sequence; three bytes are read
//            (ustart[0], ustart[1], ustart[2]). The marker bits of the bytes
//            are masked off and not validated.
// Returns:
//   A one-element array allocated with new[] holding the decoded value; the
//   caller releases it with delete[]. Returns NULL when ustart is NULL.
//
// The previous version returned the address of a local char[2], which is
// invalid as soon as the function returns, and it built the value through
// "%x" formatted strings, which breaks for bytes below 0x10.
WCHAR* CXmlProcess::UTF_8ToUnicode(char *ustart) // UTF-8 -> Unicode
{
    if (ustart == NULL)
        return NULL;

    // Work on unsigned values so that bytes >= 0x80 are not sign-extended.
    const unsigned int lead  = static_cast<unsigned char>(ustart[0]);
    const unsigned int mid   = static_cast<unsigned char>(ustart[1]);
    const unsigned int trail = static_cast<unsigned char>(ustart[2]);

    // 1110xxxx 10yyyyyy 10zzzzzz  ->  xxxxyyyy yyzzzzzz
    const unsigned int code = ((lead & 0x0Fu) << 12) | ((mid & 0x3Fu) << 6) | (trail & 0x3Fu);

    WCHAR *unicode = new WCHAR[1];
    unicode[0] = static_cast<WCHAR>(code);
    return unicode;
}
char * CXmlProcess::UnicodeToGB2312(unsigned short uData) //把Unicode 轉(zhuǎn)換成 GB2312
{
char *buffer ;
buffer = new char[sizeof(WCHAR)];
WideCharToMultiByte(CP_ACP,NULL,&uData,1,buffer,sizeof(WCHAR),NULL,NULL);
return buffer;
}
GB2312轉(zhuǎn)換成UTF-8:先把GB2312通過函數(shù)MultiByteToWideChar轉(zhuǎn)換成Unicode.然后再把Unicode通過拆開Unicode后拼裝成UTF-8,。 WCHAR * CXmlProcess::Gb2312ToUnicode(char *gbBuffer) //GB2312 轉(zhuǎn)換成 Unicode
{
WCHAR *uniChar;
uniChar = new WCHAR[1];
::MultiByteToWideChar(CP_ACP,MB_PRECOMPOSED,gbBuffer,2,uniChar,1);
return uniChar;
}
char * CXmlProcess::UnicodeToUTF_8(WCHAR *UniChar) // Unicode 轉(zhuǎn)換成UTF-8
{
char *buffer;
CString strOne;
CString strTwo;
CString strThree;
CString strFour;
CString strAnd;
buffer = new char[3];
int hInt,lInt;
hInt = (int)((*UniChar)/256);
lInt = (*UniChar)%256;
CString string ;
string.Format("%x",hInt);
strTwo = HexToBin(string.Right(1));
string = string.Left(string.GetLength() - 1);
strOne = HexToBin(string.Right(1));
string.Format("%x",lInt);
strFour = HexToBin(string.Right(1));
string = string.Left(string.GetLength() -1);
strThree = HexToBin(string.Right(1));
strAnd = strOne +strTwo + strThree + strFour;
strAnd.Insert(0,"1110");
strAnd.Insert(8,"10");
strAnd.Insert(16,"10");
strOne = strAnd.Left(8);
strAnd = strAnd.Right(16);
strTwo = strAnd.Left(8);
strThree = strAnd.Right(8);
*buffer = (char)BinToInt(strOne);
buffer[1] = (char)BinToInt(strTwo);
buffer[2] = (char)BinToInt(strThree);
return buffer;
} 例子:將GB2312轉(zhuǎn)換成UTF-8的調(diào)用: char * CXmlProcess::translateCharToUTF_8(char *xmlStream, int len)
{
int newCharLen =0 ;
int oldCharLen = 0;
int revCharLen = len;
char* newCharBuffer;
char* finalCharBuffer;
char *buffer ;
CString string;
buffer = new char[sizeof(WCHAR)];
newCharBuffer = new char[int(1.5*revCharLen)];//設(shè)置最大的一個(gè)緩沖區(qū)
while(oldCharLen < revCharLen)
{
if( *(xmlStream + oldCharLen) >= 0)
{
*(newCharBuffer+newCharLen) = *(xmlStream +oldCharLen);
newCharLen ++;
oldCharLen ++;
}//如果是英文直接復(fù)制就可以
else
{
WCHAR *pbuffer = this->Gb2312ToUnicode(xmlStream+oldCharLen);
buffer = this->UnicodeToUTF_8(pbuffer);
*(newCharBuffer+newCharLen) = *buffer;
*(newCharBuffer +newCharLen +1) = *(buffer + 1);
*(newCharBuffer +newCharLen +2) = *(buffer + 2);
newCharLen += 3;
oldCharLen += 2;
}
}
newCharBuffer[newCharLen] = ''\0'';
CString string1 ;
string1.Format("%s",newCharBuffer);
finalCharBuffer = new char[newCharLen+1];
memcpy(finalCharBuffer,newCharBuffer,newCharLen+1);
return finalCharBuffer;
}
程序都非常的簡單,由于實在太窮。已經吃了兩天的方便面。所以現在頭昏,程序的詳細說明就不寫了。程序員到了像我這樣的地步也真是少見。工資低沒有辦法。哎!!!
|
/*
 * String encoding conversion: GBK to UTF-8 (ANSI build). [email protected]
 *
 * strGBK: '\0'-terminated GBK string; may be NULL.
 * Returns a malloc()-allocated UTF-8 string that the caller must free(),
 * or NULL when strGBK is NULL, a conversion fails, or memory runs out.
 */
char *gbk2utf8(const char *strGBK)
{
    wchar_t *wide;
    char *utf8;
    int wideCount;
    int utf8Count;

    if (strGBK == NULL)
        return NULL;

    /* Ask for the required size first (input length -1: '\0'-terminated). */
    wideCount = MultiByteToWideChar(CP_GBK, 0, strGBK, -1, NULL, 0);
    if (wideCount < 1)
        return NULL;

    wide = (wchar_t *) malloc(sizeof(wchar_t) * wideCount);
    if (wide == NULL)
        return NULL;

    /* From here on every failure path has to release the wide buffer. */
    wideCount = MultiByteToWideChar(CP_GBK, 0, strGBK, -1, wide, wideCount);
    if (wideCount >= 1)
    {
        utf8Count = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        if (utf8Count >= 1)
        {
            utf8 = (char *) malloc(sizeof(char) * utf8Count);
            if (utf8 != NULL)
            {
                utf8Count = WideCharToMultiByte(CP_UTF8, 0, wide, -1, utf8, utf8Count, NULL, NULL);
                free(wide);
                if (utf8Count < 1)
                {
                    free(utf8);
                    return NULL;
                }
                return utf8;
            }
        }
    }

    free(wide);
    return NULL;
}
( xmwen 發(fā)表于 2009-11-3 19:38:00)
[ 原創(chuàng)文檔 本文適合中級(jí)讀者 已閱讀34485次 ]
搞笑,這種害人害己的文章還有這么多人訪問。
作者光知道 WideCharToMultiByte 可以把 Unicode 轉成 GB2312 就不知道也可以把 Unicode 轉換為 UTF-8 嗎?
其實這是一個很簡單的程序,都被作者搞復雜了。
要實現 GB2312 (其實是GBK)轉換為 UTF-8 其實很簡單,先用 MultiByteToWideChar 把 GB2312 轉換為 Unicode,再用 WideCharToMultiByte 把 Unicode 轉換為 UTF-8 就可以了。
UTF-8 轉換為 GB2312 是個相反的過程,先用 MultiByteToWideChar 把 UTF-8 轉換為 Unicode,再用 WideCharToMultiByte 把 Unicode 轉換為 GB2312 就可以了。
( 雁過留聲 發(fā)表于 2007-1-11 9:11:00)
translateCharToUTF_8的編碼不對(duì), 請(qǐng)作者檢查一下, 如: "你是我的好朋友" 轉(zhuǎn)換成了;"浣犳槸鎴戠殑濂芥i脲弸鍚?" 正確的應(yīng)是: "浣犳槸鎴戠殑濂芥湅鍙嬪悧" 對(duì)于有的編碼還能對(duì)... 交流一下:[email protected] (
kudoo 發(fā)表于 2006-8-20 19:46:00)
shines在2005-2-6,提供了一段程序,里面有 buffersize = WideCharToMultiByte(CP_UTF8, MB_PRECOMPOSED, unicode, wide_size, NULL, 0, NULL, 0); buffer = new char[buffersize+1];
但是,我在調試的時候發現:buffersize似乎已經預先留了‘\0’的位置,或者是不是我出錯了 比如:“i love you,愛”GB2312是需要14個字節 UTF8是需要15個字節,返回時候就是這些了啊, 我的地址是:[email protected], 誰能回答以下,感謝!!
( robin_fox_nan 發(fā)表于 2006-3-19 20:20:00)
暈.格式?jīng)]有了 原文請(qǐng)看 http://www./viewfull.asp?id=33 ( 鬼龍之舞 發(fā)表于
2005-8-25 16:13:00)
支持樓主!是因?yàn)槟阄也艑懗鰜淼?不管是在體積還是在速度,相信都比樓主的強(qiáng)一點(diǎn),如果不考慮移植性的話 感謝樓主!!
; Converts a zero-terminated UTF-8 string into 16-bit code units.
;   lpszBuf_OUT - destination buffer, loaded into edi
;   lpszUTF8_IN - source string, loaded into esi
; Every byte with the top bit set is handled as the first byte of a
; three-byte sequence; other sequence lengths are not distinguished.
; NOTE(review): stosw is assumed to run with the direction flag clear - confirm.
UTF8toUnicode proc uses esi edi lpszBuf_OUT,lpszUTF8_IN
    mov esi,lpszUTF8_IN
    mov edi,lpszBuf_OUT
    .while TRUE
        mov al,[esi]
        .if sbyte ptr al <0
            ; High output byte: low 4 bits of byte 0, then bits 5..2 of byte 1.
            mov al,[esi]
            and al,00001111b
            shl al,4
            mov [edi+1],al
            mov al,[esi+1]
            and al,00111100b
            shr al,2
            or [edi+1],al
            ; Low output byte: low 2 bits of byte 1, then low 6 bits of byte 2.
            mov al,[esi+1]
            and al,11b
            shl al,6
            mov [edi+0],al
            mov al,[esi+2]
            and al,00111111b
            or [edi+0],al
            ; Two bytes written, three bytes read.
            add edi,2
            add esi,3
        .elseif al
            ; Non-zero byte without the top bit: zero-extend to 16 bits and store.
            xor ah,ah
            stosw
            inc esi
        .else
            ; Zero byte: write a 16-bit terminator and leave the loop.
            mov WORD ptr [edi],0
            .break
        .endif
    .endw
    ret
UTF8toUnicode endp
( 鬼龍之舞 發(fā)表于 2005-8-25 16:11:00)
; Converts a zero-terminated string of 16-bit code units into UTF-8 bytes.
;   lpBuf_OUT   - destination buffer, loaded into edi
;   lpszUTF8_IN - source buffer, loaded into esi and read one 16-bit word at
;                 a time (despite the name it is the wide-character input)
; Any word whose high byte is non-zero is written as a three-byte sequence.
; NOTE(review): stosw is assumed to run with the direction flag clear - confirm.
; NOTE(review): both stosw branches write two bytes (al and ah); for a byte
; stream stosb looks intended - confirm.
; NOTE(review): words 0080h..00FFh take the "ah==0" branch and are stored
; unencoded - confirm whether that is intended.
UnicodetoUTF8 proc uses esi edi lpBuf_OUT,lpszUTF8_IN
    mov esi,lpszUTF8_IN
    mov edi,lpBuf_OUT
    .while TRUE
        mov ax,[esi]
        .if ax==0
            ; End of input: store the zero word and leave the loop.
            stosw
            .break
        .elseif ah==0
            ; High byte is zero: store ax as it is.
            add esi,2
            stosw
        .else
            ; Byte 0: 1110 followed by the top 4 bits of the high input byte.
            mov al,[esi+1]
            shr al,4
            or al,11100000b
            mov [edi+0],al
            ; Byte 1: 10, low 4 bits of the high byte, top 2 bits of the low byte.
            mov al,[esi+1]
            and al,00001111b
            shl al,2
            or al,10000000b
            mov ah,[esi+0]
            shr ah,6
            or al,ah
            mov [edi+1],al
            ; Byte 2: 10 followed by the low 6 bits of the low input byte.
            mov al,[esi+0]
            and al,00111111b
            or al,10000000b
            mov [edi+2],al
            ; Three bytes written, one word read.
            add edi,3
            add esi,2
        .endif
    .endw
    ret
UnicodetoUTF8 endp
( 鬼龍之舞 發(fā)表于 2005-8-25 16:11:00) 幫忙弄以下 ( zztop5384 發(fā)表于 2005-4-18 10:35:00)
int WideCharToMultiByte( UINT CodePage, // code page DWORD dwFlags, // performance and mapping flags LPCWSTR lpWideCharStr, // wide-character string int cchWideChar, // number of chars in string LPSTR lpMultiByteStr, // buffer for new string int cbMultiByte, // size of buffer LPCSTR lpDefaultChar, // default for unmappable chars LPBOOL lpUsedDefaultChar // set when default char used ); //將寬字符轉(zhuǎn)換成多個(gè)窄字符
這些只是函數原型,并沒有具體實現
( zztop5384 發(fā)表于 2005-4-18 10:27:00)
//對(duì)不起,,少加了個(gè)擴(kuò)號(hào) WCHAR* CXmlProcess::UTF_8ToUnicode(char *pText) { char uchar[2]; WCHAR *unicode;
char_one = pText[0]; char_two = pText[1]); char_three = pText[2];
uchar[1] = ((pText[0] & 0x0F) << 4) + ((pText[1] >> 2) & 0x0F); uchar[0] = ((pText[1] & 0x03) << 6) + (pText[2] & 0x3F);
unicode = (WCHAR *)uchar; return unicode;
} |
|