一个支持多平台的字符编码转换还挺麻烦的。 使用 Android NDK 自带的 iconv 时,代码能编译链接成功,但是无法实现 gbk/utf8 编码的正常转换。需要自己编译一个 libiconv。
#include "kminwindef.h"
#include "kIrrCompileConfig.h"
#ifdef _KIRR_WINDOWS_
#ifdef _KIRR_ANDROID_PLATFORM_
#ifdef _KIRR_WINDOWS_API_
// 常用的几个。
#ifdef _MSC_VER /* Visual Studio */
#ifdef __ANDROID__
// 不常用不建议采用的。
#ifdef _WIN32 // _WINDOWS
android 编译 iconv 库,实现 gbk 与 utf8 之间的转换。 https://www.gnu.org/software/libiconv/ doc
这玩意需要先跑 $ sudo sh ./configure
Android 配置不了,配置生成 linux 环境下的 config.h 文件,反正 Android 和 Ubuntu 也差不多。
找了台 Ubuntu 64 位 系统配置,配置好后入库。
这里是标准做法:Building iconv for Android 编写 ios 和 android 共用的 c/c++ 库时 使用 iconv 的问题,貌似苹果版本
./configure --host=$ANDROID_HOST --with-sysroot=$ANDROID_SYSROOT
hawkhai@ubuntu:/Users/apple/Desktop/libiconv-1.16$ sudo sh ./configure
hawkhai@ubuntu:/Users/apple/Desktop/libiconv-1.16$
再搞一个 cmake 去编译它。
# Builds a static library named "iconv" from three libiconv source files.
cmake_minimum_required(VERSION 3.4.1)
# Header search paths, all relative to the directory containing this file.
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/libcharset
${CMAKE_CURRENT_SOURCE_DIR}/lib
${CMAKE_CURRENT_SOURCE_DIR}/libcharset/include
${CMAKE_CURRENT_SOURCE_DIR}/srclib
)
# Compiler flags and preprocessor definitions passed on the command line
# when compiling the sources below.
add_definitions(-frtti -fexceptions
-Wno-multichar
-DANDROID
-DLIBDIR="c"
-DBUILDING_LIBICONV
-DIN_LIBRARY)
# The static library target and the sources it is compiled from.
add_library(
iconv
STATIC
libcharset/lib/localcharset.c
lib/iconv.c
lib/relocatable.c
)
E:/libiconv-1.16/libcharset/lib/localcharset.c:696: error: undefined reference to 'nl_langinfo'
修改目录文件 libcharset/config.h 的宏定义,禁止该宏定义:
/* Define if you have <langinfo.h> and nl_langinfo(CODESET). */
#define HAVE_LANGINFO_CODESET 0
简单。WideCharToMultiByte() 和 MultiByteToWideChar()。
#pragma once
#include <stddef.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>
#define _ENABLE_PER_THREAD_LOCALE 0x1
#define _DISABLE_PER_THREAD_LOCALE 0x2
// https://www.gnu.org/software/libc/manual/html_node/Setting-the-Locale.html
// https://blog.weghos.com/skia/Skia/src/gpu/GrAutoLocaleSetter.h.html
// Scoped locale switch: the constructor installs `locale` for `category` and
// the destructor restores whatever was active before.
//
// - MSVC: turns on per-thread locale via _configthreadlocale(), then uses
//   setlocale(); the previous per-thread setting is restored on destruction.
// - Elsewhere: uses newlocale()/uselocale()/freelocale().
//
// If the requested locale cannot be created/installed, getLocaleInfo()
// returns a null value and the active locale is left untouched.
class LocaleGuard {
private:
#ifdef _MSC_VER
    char* m_localeSaved = nullptr;   // copy of the locale name active before the switch
    char* m_localeNew = nullptr;     // copy of the locale name reported after the switch
    int m_preLocaleStatus = _DISABLE_PER_THREAD_LOCALE;
#else
    locale_t fOldLocale = 0;         // locale returned by uselocale() when switching
    locale_t fLocale = 0;            // locale object created by newlocale()

    // newlocale() takes an LC_*_MASK bit mask, not an LC_* category value.
    // Unknown categories fall back to LC_ALL_MASK.
    static int categoryToMask(int category) {
        switch (category) {
        case LC_COLLATE:  return LC_COLLATE_MASK;
        case LC_CTYPE:    return LC_CTYPE_MASK;
        case LC_MONETARY: return LC_MONETARY_MASK;
        case LC_NUMERIC:  return LC_NUMERIC_MASK;
        case LC_TIME:     return LC_TIME_MASK;
#ifdef LC_MESSAGES
        case LC_MESSAGES: return LC_MESSAGES_MASK;
#endif
        case LC_ALL:
        default:          return LC_ALL_MASK;
        }
    }
#endif
    int m_category = LC_CTYPE; // LC_ALL LC_CTYPE

public:
    // @param locale    locale name understood by the platform
    // @param category  LC_* category to switch (defaults to LC_CTYPE)
    LocaleGuard(const char* locale, int category = LC_CTYPE) {
        m_category = category;
#ifdef _MSC_VER
        m_preLocaleStatus = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
        const char* localeOld = setlocale(m_category, NULL);
        if (localeOld) {
            // setlocale() returns a pointer it may overwrite later, so keep a copy.
            m_localeSaved = strdup(localeOld);
        }
        const char* localeNew = setlocale(m_category, locale);
        if (localeNew) {
            m_localeNew = strdup(localeNew);
        }
#else
        fLocale = newlocale(categoryToMask(m_category), locale, static_cast<locale_t>(0));
        if (fLocale) {
            fOldLocale = uselocale(fLocale);
        } else {
            fOldLocale = static_cast<locale_t>(0);
        }
#endif
    }

    // The guard owns heap strings / a locale object; copying would release them twice.
    LocaleGuard(const LocaleGuard&) = delete;
    LocaleGuard& operator=(const LocaleGuard&) = delete;

#ifdef _MSC_VER
    // Value returned by _configthreadlocale() in the constructor.
    const int getPreLocaleStatus() const {
        return m_preLocaleStatus;
    }
    // Locale name that was active before the switch (may be null).
    const char* getPreLocaleInfo() const {
        return m_localeSaved;
    }
    // Locale name reported by setlocale() for the new locale (null on failure).
    const char* getLocaleInfo() const {
        return m_localeNew;
    }
#else
    // Locale that was in use before the switch (null if the switch failed).
    locale_t getPreLocaleInfo() const {
        return fOldLocale;
    }
    // Locale object installed by this guard (null if newlocale() failed).
    locale_t getLocaleInfo() const {
        return fLocale;
    }
#endif

    virtual ~LocaleGuard() {
#ifdef _MSC_VER
        if (m_localeSaved) {
            setlocale(m_category, m_localeSaved);
            free(m_localeSaved);
            m_localeSaved = nullptr;
        }
        if (m_localeNew) {
            free(m_localeNew);
            m_localeNew = nullptr;
        }
        // -1 is treated as "the constructor's _configthreadlocale() call failed".
        if (m_preLocaleStatus != -1) {
            _configthreadlocale(m_preLocaleStatus);
        }
#else
        if (fLocale) {
            // Switch back before freeing: a locale must not be freed while in use.
            uselocale(fOldLocale);
            freelocale(fLocale);
            fLocale = static_cast<locale_t>(0);
        }
#endif
    }
};
#pragma once
#include <iostream>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#ifdef _MSC_VER
#include <windows.h>
#endif
//
// Code Page Default Values.
// Please Use Unicode, either UTF-16 (as in WCHAR) or UTF-8 (code page CP_ACP)
//
#define CP_ACP ((unsigned) 0) // default to ANSI code page
#define CP_OEMCP 1 // default to OEM code page
#define CP_MACCP 2 // default to MAC code page
#define CP_THREAD_ACP 3 // current thread's ANSI code page
#define CP_SYMBOL 42 // SYMBOL translations
#define CP_UTF7 65000 // UTF-7 translation
#define CP_UTF8 65001 // UTF-8 translation
std::wstring strToWstr(const std::string& text, unsigned codepage = CP_UTF8);
std::string wstrToStr(const std::wstring& text, unsigned codepage = CP_UTF8);
//setlocale(LC_CTYPE, "chinese-traditional"); // 认为输入的 MBS 是 Big5 编码
//setlocale(LC_CTYPE, "chinese-simplified"); // 设置输出的 MBS 为 GBK 编码
#define LOCALE_CHINESE_CHINA_936 "Chinese_China.936"
//#define LOCALE_CHINESE_SIMPLIFIED "chinese-simplified"
#define LOCALE_ZH_CN_UTF8 "zh_CN.utf8"
std::wstring strToWstr(const std::string& text, char const* locale);
std::string wstrToStr(const std::wstring& text, char const* locale);
#include "vmbswcs.h"
#include "vlocale.h"
#include <assert.h>
#include <string.h>
#ifdef __ANDROID__
#include "iconv.h"
#endif
// Converts a multibyte string to a wide string.
//
// @param text      input bytes encoded in `codepage`
// @param codepage  Windows-style code page id (CP_UTF8 by default)
// @return the converted string; on MSVC an empty string when conversion fails
// @throws const char*  on non-MSVC builds for any code page other than
//                      CP_ACP / CP_UTF8
std::wstring strToWstr(const std::string& text, unsigned codepage/*CP_UTF8*/)
{
#ifdef _MSC_VER
    // First call only measures; second call converts into the sized buffer.
    const int srcLen = static_cast<int>(text.size());
    const int length = ::MultiByteToWideChar(codepage, 0, text.c_str(), srcLen, NULL, 0);
    if (length > 0) {
        std::wstring str;
        str.resize(length);
        if (0 != ::MultiByteToWideChar(codepage, 0, text.c_str(), srcLen, &str[0], length))
            return str;
    }
    return L"";
#else
    // No Win32 API here: map the code page onto a locale name and delegate
    // to the locale-based overload.
    if (CP_ACP == codepage) {
        return strToWstr(text, LOCALE_CHINESE_CHINA_936);
    }
    if (CP_UTF8 == codepage) {
        return strToWstr(text, LOCALE_ZH_CN_UTF8);
    }
    throw "not implement yet.";
#endif
}
// Converts a wide string to a multibyte string.
//
// @param text      input wide string
// @param codepage  Windows-style code page id of the result (CP_UTF8 by default)
// @return the converted string; on MSVC an empty string when conversion fails
// @throws const char*  on non-MSVC builds for any code page other than
//                      CP_ACP / CP_UTF8
std::string wstrToStr(const std::wstring& text, unsigned codepage/*CP_UTF8*/)
{
#ifdef _MSC_VER
    // First call only measures; second call converts into the sized buffer.
    const int srcLen = static_cast<int>(text.size());
    const int length = ::WideCharToMultiByte(codepage, 0, text.c_str(), srcLen, NULL, 0, NULL, NULL);
    if (length > 0) {
        std::string str;
        str.resize(length);
        if (0 != ::WideCharToMultiByte(codepage, 0, text.c_str(), srcLen, &str[0], length, NULL, NULL))
            return str;
    }
    return "";
#else
    // No Win32 API here: map the code page onto a locale name and delegate
    // to the locale-based overload.
    if (CP_ACP == codepage) {
        return wstrToStr(text, LOCALE_CHINESE_CHINA_936);
    }
    if (CP_UTF8 == codepage) {
        return wstrToStr(text, LOCALE_ZH_CN_UTF8);
    }
    throw "not implement yet.";
#endif
}
#ifdef __ANDROID__
// Converts `inlen` bytes at `inbuf` from `from_charset` to `to_charset` using
// iconv, writing the result to `outbuf`.
//
// The output buffer is zero-filled first and its last byte is never written,
// so on return `outbuf` always holds a NUL-terminated string.
//
// @param from_charset  iconv name of the source encoding
// @param to_charset    iconv name of the target encoding
// @param inbuf         input bytes (may be null only when inlen == 0)
// @param inlen         number of input bytes
// @param outbuf        destination buffer
// @param outlen        size of `outbuf` in bytes, including room for the NUL
// @return 0 on success; -1 on bad arguments, when the conversion descriptor
//         cannot be opened, or when iconv() reports an error (invalid input,
//         output buffer too small, ...)
int code_convert(
    const char *from_charset,
    const char *to_charset,
    const char *inbuf,
    size_t inlen,
    char *outbuf,
    size_t outlen) {
    if (outbuf == nullptr || outlen == 0) {
        return -1;
    }
    memset(outbuf, 0, outlen);
    if (from_charset == nullptr || to_charset == nullptr) {
        return -1;
    }
    if (inbuf == nullptr && inlen != 0) {
        return -1;
    }
    // https://blog.csdn.net/Sky_qing/article/details/8995201
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)(-1)) {
        return -1;
    }

    const char *src = inbuf;
    char *dst = outbuf;
    size_t srcLeft = inlen;
    size_t dstLeft = outlen - 1;  // keep one byte for the terminator
    const size_t failed = (size_t)(-1);

    size_t rc = 0;
    if (srcLeft > 0) {
        // iconv() takes char** for historical reasons; it does not modify the input bytes.
        rc = iconv(cd, const_cast<char**>(&src), &srcLeft, &dst, &dstLeft);
    }
    if (rc != failed) {
        // Flush any pending shift sequence of a stateful target encoding.
        rc = iconv(cd, nullptr, nullptr, &dst, &dstLeft);
    }
    iconv_close(cd);
    return (rc == failed) ? -1 : 0;
}
/* UTF-8 -> GBK: forwards to code_convert() and returns its result unchanged. */
int code_u2g(const char *inbuf, size_t inlen, char *outbuf, size_t outlen) {
    return code_convert("UTF-8", "GBK", inbuf, inlen, outbuf, outlen);
}
/* GBK -> UTF-8: forwards to code_convert() and returns its result unchanged. */
int code_g2u(const char *inbuf, size_t inlen, char *outbuf, size_t outlen) {
    return code_convert("GBK", "UTF-8", inbuf, inlen, outbuf, outlen);
}
#endif
// Converts a wide string to a multibyte string in the encoding of `locale`.
//
// - Android + LOCALE_CHINESE_CHINA_936: converts to UTF-8 first, then to GBK
//   through code_u2g().
// - MSVC + one of the two known locale names: forwards to the code-page overload.
// - Otherwise: switches LC_CTYPE to `locale` for the duration of the call and
//   uses wcstombs().
//
// @param text    wide string to convert (processed up to its first NUL)
// @param locale  locale name, e.g. LOCALE_ZH_CN_UTF8
// @return the converted string, or "" if `locale` is null or the conversion fails
std::string wstrToStr(const std::wstring& text, char const* locale)
{
    if (locale == nullptr) {
        return "";
    }
#ifdef __ANDROID__
    if (strcmp(LOCALE_CHINESE_CHINA_936, locale) == 0) {
        const std::string utf8 = wstrToStr(text, LOCALE_ZH_CN_UTF8);
        // Same sizing as before (2x + 1); std::string owns the buffer so
        // nothing leaks on an early return or exception.
        std::string gbk(utf8.length() * 2 + 1, '\0');
        if (code_u2g(utf8.c_str(), utf8.length(), &gbk[0], gbk.size()) != 0) {
            return "";
        }
        gbk.resize(strlen(gbk.c_str()));
        return gbk;
    }
#endif
#ifdef _MSC_VER
    if (strcmp(LOCALE_CHINESE_CHINA_936, locale) == 0) {
        return wstrToStr(text, CP_ACP);
    }
    if (strcmp(LOCALE_ZH_CN_UTF8, locale) == 0) {
        return wstrToStr(text, CP_UTF8);
    }
#endif
    LocaleGuard temp(locale, LC_CTYPE);
    // The string getters and getPreLocaleStatus() only exist in the MSVC
    // variant of LocaleGuard, so the trace is limited to MSVC builds.
#if defined(DEBUG_FAKELIB) && defined(_MSC_VER)
    printf("Locale information %s %d -> %s\n", temp.getPreLocaleInfo(), temp.getPreLocaleStatus(), temp.getLocaleInfo());
#endif
    // Measuring pass: returns (size_t)-1 if a character cannot be converted.
    const size_t needed = wcstombs(NULL, text.c_str(), 0);
    if (needed == (size_t)(-1)) {
        return "";
    }
    std::string out(needed + 1, '\0');
    const size_t written = wcstombs(&out[0], text.c_str(), out.size());
    if (written == (size_t)(-1)) {
        return "";
    }
#ifdef DEBUG_FAKELIB
    assert(written == needed);
#endif
    out.resize(written);
    return out;
}
// Converts a multibyte string encoded per `locale` to a wide string.
//
// - Android + LOCALE_CHINESE_CHINA_936: converts GBK to UTF-8 through
//   code_g2u(), then decodes the UTF-8.
// - MSVC + one of the two known locale names: forwards to the code-page overload.
// - Otherwise: switches LC_CTYPE to `locale` for the duration of the call and
//   uses mbstowcs().
//
// @param text    bytes to convert (processed up to the first NUL)
// @param locale  locale name, e.g. LOCALE_ZH_CN_UTF8
// @return the converted string, or L"" if `locale` is null or the conversion fails
std::wstring strToWstr(const std::string& text, char const* locale)
{
    if (locale == nullptr) {
        return L"";
    }
#ifdef __ANDROID__
    if (strcmp(LOCALE_CHINESE_CHINA_936, locale) == 0) {
        // Same sizing as before (2x + 1); std::string owns the buffer so
        // nothing leaks on an early return or exception.
        std::string utf8(text.length() * 2 + 1, '\0');
        if (code_g2u(text.c_str(), text.length(), &utf8[0], utf8.size()) != 0) {
            return L"";
        }
        return strToWstr(utf8.c_str(), LOCALE_ZH_CN_UTF8);
    }
#endif
#ifdef _MSC_VER
    if (strcmp(LOCALE_CHINESE_CHINA_936, locale) == 0) {
        return strToWstr(text, CP_ACP);
    }
    if (strcmp(LOCALE_ZH_CN_UTF8, locale) == 0) {
        return strToWstr(text, CP_UTF8);
    }
#endif
    LocaleGuard temp(locale, LC_CTYPE);
    // The string getters and getPreLocaleStatus() only exist in the MSVC
    // variant of LocaleGuard, so the trace is limited to MSVC builds.
#if defined(DEBUG_FAKELIB) && defined(_MSC_VER)
    printf("Locale information %s %d -> %s\n", temp.getPreLocaleInfo(), temp.getPreLocaleStatus(), temp.getLocaleInfo());
#endif
    // Measuring pass: returns (size_t)-1 on an invalid multibyte sequence.
    const size_t needed = mbstowcs(NULL, text.c_str(), 0);
    if (needed == (size_t)(-1)) {
        return L"";
    }
    std::wstring out(needed + 1, L'\0');
    const size_t written = mbstowcs(&out[0], text.c_str(), out.size());
    if (written == (size_t)(-1)) {
        return L"";
    }
#ifdef DEBUG_FAKELIB
    assert(written == needed);
#endif
    out.resize(written);
    return out;
}
#include "vlocale.h"
#include "vmbswcs.h"
// Prints `tag` followed by every wide character of `pstr` as a hex literal,
// e.g. " wchar_t\t{ 0x48, 0x65, 0 }".
//
// Only printf() is used: mixing printf() and wprintf() on the same stream is
// not portable, because a stream's orientation is fixed by its first use.
//
// @param tag   label printed right-aligned in an 8-character column
// @param pstr  NUL-terminated wide string to dump
void myprint(const char* tag, const wchar_t* pstr)
{
    printf("%8s\t{ ", tag);
    // Walk to the terminator instead of calling wcslen() on every iteration.
    for (const wchar_t* p = pstr; *p != 0; ++p) {
        printf("%#.x, ", static_cast<unsigned int>(*p));
    }
    printf("0 }\n");
}
// Prints `tag` followed by every byte of `pstr` as a hex literal,
// e.g. "  CP_ACP\t { 0x48, 0xca, 0 }".
//
// @param tag   label printed right-aligned in an 8-character column
// @param pstr  NUL-terminated byte string to dump
void myprint(const char* tag, const char* pstr)
{
    printf("%8s\t { ", tag);
    // Walk to the terminator instead of calling strlen() on every iteration.
    for (const char* p = pstr; *p != 0; ++p) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        printf("%#.x, ", static_cast<unsigned int>(static_cast<unsigned char>(*p)));
    }
    printf("0 }\n");
}
// Smoke test: runs the four locale-based conversions on hard-coded sample
// data. The results are not inspected; the function always returns 0.
int maintest()
{
    // const wchar_t* pwchello = L"Hello 世界!";
    // The same sample text as wide characters, as bytes for the 936 locale,
    // and as UTF-8 bytes.
    wchar_t wideSample[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x4e16, 0x754c, 0xff01, 0, };
    unsigned char acpSample[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xca, 0xc0, 0xbd, 0xe7, 0xa3, 0xa1, 0, };
    unsigned char utf8Sample[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c, 0xef, 0xbc, 0x81, 0, };

    // wide -> multibyte
    std::string acpOut = wstrToStr(wideSample, LOCALE_CHINESE_CHINA_936);
    std::string utf8Out = wstrToStr(wideSample, LOCALE_ZH_CN_UTF8);

    // multibyte -> wide
    const char* acpBytes = reinterpret_cast<const char*>(acpSample);
    const char* utf8Bytes = reinterpret_cast<const char*>(utf8Sample);
    std::wstring wideFromAcp = strToWstr(acpBytes, LOCALE_CHINESE_CHINA_936);
    std::wstring wideFromUtf8 = strToWstr(utf8Bytes, LOCALE_ZH_CN_UTF8);
    return 0;
}
// Demo driver: converts a sample wide string with both the code-page and the
// locale overloads, dumps every result as hex, then waits for a key press.
int main()
{
    maintest();

    const wchar_t* wideHello = L"Hello 世界!";

    // Code-page based conversions.
    std::string viaUtf8Page = wstrToStr(wideHello, CP_UTF8);
    std::string viaAnsiPage = wstrToStr(wideHello, CP_ACP);
    myprint("wchar_t", wideHello);
    myprint("CP_ACP", viaAnsiPage.c_str());
    myprint("CP_UTF8", viaUtf8Page.c_str());

    // Locale based conversions: wide -> multibyte.
    std::string viaGbkLocale = wstrToStr(wideHello, LOCALE_CHINESE_CHINA_936);
    myprint("wcstombs", viaGbkLocale.c_str());
    std::string viaUtf8Locale = wstrToStr(wideHello, LOCALE_ZH_CN_UTF8);
    myprint("wcstombs", viaUtf8Locale.c_str());

    // Locale based conversions: multibyte -> wide.
    std::wstring backFromAnsi = strToWstr(viaAnsiPage.c_str(), LOCALE_CHINESE_CHINA_936);
    myprint("mbstowcs", backFromAnsi.c_str());
    std::wstring backFromUtf8 = strToWstr(viaUtf8Page.c_str(), LOCALE_ZH_CN_UTF8);
    myprint("mbstowcs", backFromUtf8.c_str());

    getchar();
    return 0;
}
wchar_t { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x4e16, 0x754c, 0xff01, 0 }
CP_ACP { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xca, 0xc0, 0xbd, 0xe7, 0xa3, 0xa1, 0 }
CP_UTF8 { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c, 0xef, 0xbc, 0x81, 0 }
Locale information C 2 -> Chinese_China.936
wcstombs { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xca, 0xc0, 0xbd, 0xe7, 0xa3, 0xa1, 0 }
Locale information C 2 -> zh_CN.utf8
wcstombs { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c, 0xef, 0xbc, 0x81, 0 }
Locale information C 2 -> Chinese_China.936
mbstowcs { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x4e16, 0x754c, 0xff01, 0 }
Locale information C 2 -> zh_CN.utf8
mbstowcs { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x4e16, 0x754c, 0xff01, 0 }
悲剧。mbstowcs() 和 wcstombs()。多线程中这玩意悲剧。
size_t mbstowcs(wchar_t *wcstr, const char *mbstr, size_t count);
// 通过这玩意调节编码:"zh_CN.utf8" / "Chinese_China.936"
// Android 不支持 "Chinese_China.936"
setlocale(LC_CTYPE, "zh_CN.utf8");
Windows 特有 API:_configthreadlocale
。
setlocale(LC_CTYPE, "chs");
// 使 setlocale 只针对当前线程起作用
_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
// 使 setlocale 对所有线程的设置都有用(默认值)
_configthreadlocale(_DISABLE_PER_THREAD_LOCALE);
setlocale // 只能在当前进程各自的运行时库里生效
Android Java,还可以通过 JNI 去搞。 new String(str.getBytes(), "UTF-8");
// Converts a Java string to a malloc()-allocated, NUL-terminated C string by
// calling String.getBytes("utf-8") through JNI.
//
// @param env   JNI environment of the calling thread
// @param jstr  Java string to convert
// @return newly allocated buffer the caller must free(); NULL when the string
//         is empty, an argument is null, or any JNI call / allocation fails
char* jstringTostring(JNIEnv* env, jstring jstr)
{
    if (env == NULL || jstr == NULL) {
        return NULL;
    }
    char* rtn = NULL;
    jstring strencode = env->NewStringUTF("utf-8");
    jclass strClass = env->FindClass("java/lang/String");
    jbyteArray barr = NULL;
    if (strencode != NULL && strClass != NULL) {
        jmethodID getBytesId = env->GetMethodID(strClass, "getBytes", "(Ljava/lang/String;)[B");
        if (getBytesId != NULL) {
            barr = (jbyteArray)env->CallObjectMethod(jstr, getBytesId, strencode);
        }
    }
    if (barr != NULL) {
        jsize alen = env->GetArrayLength(barr);
        // Second argument is an optional jboolean* "isCopy" out-parameter.
        jbyte* ba = env->GetByteArrayElements(barr, NULL);
        if (ba != NULL) {
            if (alen > 0) {
                rtn = (char*)malloc(alen + 1);
                if (rtn != NULL) {
                    memcpy(rtn, ba, alen);
                    rtn[alen] = 0;
                }
            }
            // The elements were only read, so skip the copy-back.
            env->ReleaseByteArrayElements(barr, ba, JNI_ABORT);
        }
    }
    // Drop local references explicitly so repeated calls from one native
    // frame do not accumulate them.
    if (barr != NULL) {
        env->DeleteLocalRef(barr);
    }
    if (strClass != NULL) {
        env->DeleteLocalRef(strClass);
    }
    if (strencode != NULL) {
        env->DeleteLocalRef(strencode);
    }
    return rtn;
}
// Builds a Java String from a NUL-terminated C string by calling
// new String(byte[], "utf-8") through JNI.
//
// @param env  JNI environment of the calling thread
// @param pat  NUL-terminated bytes to decode
// @return the new Java string, or NULL if an argument is null or a JNI call fails
jstring stringToJstring(JNIEnv* env, const char* pat)
{
    if (env == NULL || pat == NULL) {
        return NULL;
    }
    const jsize len = (jsize)strlen(pat);
    jstring result = NULL;
    jstring encoding = env->NewStringUTF("utf-8");
    // FindClass takes the class name ("java/lang/String"), not the
    // field-signature form ("Ljava/lang/String;").
    jclass strClass = env->FindClass("java/lang/String");
    jbyteArray bytes = NULL;
    if (encoding != NULL && strClass != NULL) {
        jmethodID initId = env->GetMethodID(strClass, "<init>", "([BLjava/lang/String;)V");
        bytes = env->NewByteArray(len);
        if (initId != NULL && bytes != NULL) {
            env->SetByteArrayRegion(bytes, 0, len, (const jbyte*)pat);
            result = (jstring)env->NewObject(strClass, initId, bytes, encoding);
        }
    }
    // Drop the temporaries; the returned string keeps its own local reference.
    if (bytes != NULL) {
        env->DeleteLocalRef(bytes);
    }
    if (strClass != NULL) {
        env->DeleteLocalRef(strClass);
    }
    if (encoding != NULL) {
        env->DeleteLocalRef(encoding);
    }
    return result;
}
locale 按照所涉及到的文化传统的各个方面分成 12 个大类,这 12 个大类分别是:
Locale 是软件在运行时的语言环境,它包括语言 (Language),地域 (Territory) 和字符集 (Codeset)。
一个 locale 的书写格式为:语言[_地域[.字符集]]
。
zh_CN.GB2312
= 中文_中华人民共和国.国标 2312 字符集
。zh_TW.BIG5
= 中文_台湾.大五码字符集
。转换描述符不能在多线程中同时使用。 GBK,UTF-8。
#include <iconv.h>
iconv_t iconv_open(const char *tocode, const char *fromcode);
size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
int iconv_close(iconv_t cd);
这个库有意思。CMake-based MinGW-w64 Cross Toolchain 里面包含:ANGLE、FFmpeg 等各种库,遗憾没有 Android 版本。