Added routines to handle conversion of Java strings between UTF8 and UCS2
representations.
This commit is contained in:
@@ -26,23 +26,73 @@ static inline JavaArray *newCharArray(Uint32 length)
|
||||
return (new (mem) JavaArray(Array::obtain(tkChar), length));
|
||||
}
|
||||
|
||||
/* Count the number of bytes it would take to encode the given Unicode
|
||||
* string using UTF-8. Add in the extra byte for the terminating NUL.
|
||||
*/
|
||||
static int
|
||||
countUtf8Chars(const uint16 *ucs2, int ucs2len)
|
||||
{
|
||||
int utf8len = 1; // Need one character for terminating NUL
|
||||
|
||||
/* Return the UTF representation of this string. This routine allocates
|
||||
for (int i = ucs2len-1; i >= 0; i--) {
|
||||
uint16 u = ucs2[i];
|
||||
if (u < 0x80)
|
||||
utf8len += 1;
|
||||
else if (u < 0x800)
|
||||
utf8len += 2;
|
||||
else
|
||||
utf8len += 3;
|
||||
}
|
||||
return utf8len;
|
||||
}
|
||||
|
||||
/* Convert a Unicode (UCS-2) string to UTF-8 encoding. The length of
|
||||
* the destination string, in bytes, is given by the utf8len argument.
|
||||
* A NUL character is appended to the destination string, if possible.
|
||||
* Returns: the actual length of the resulting string, in bytes.
|
||||
*/
|
||||
static int
|
||||
convertUnicodeToUtf8(char *utf8, const uint16* ucs2, int utf8len)
|
||||
{
|
||||
char* start_utf8 = utf8;
|
||||
char* lastchar = utf8 + utf8len - 1;
|
||||
|
||||
while (utf8 < lastchar) {
|
||||
uint16 u = *ucs2++;
|
||||
if (u < 0x80) {
|
||||
*utf8++ = (char)u;
|
||||
} else if (u < 0x800) {
|
||||
if (utf8 >= (lastchar - 1))
|
||||
break;
|
||||
*utf8++ = 0xc0 | ((u >> 6) & 0x1f);
|
||||
*utf8++ = 0x80 | (u & 0x3f);
|
||||
} else {
|
||||
if (utf8 >= (lastchar - 2))
|
||||
break;
|
||||
*utf8++ = 0xe0 | ((u >> 12) & 0x0f);
|
||||
*utf8++ = 0x80 | ((u >> 6) & 0x3f);
|
||||
*utf8++ = 0x80 | (u & 0x3f);
|
||||
}
|
||||
}
|
||||
if (utf8 <= lastchar)
|
||||
*utf8 = 0;
|
||||
|
||||
return utf8 - start_utf8;
|
||||
}
|
||||
|
||||
/* Return the UTF8 representation of this string. This routine allocates
|
||||
* enough memory for the conversion; this memory can be freed using
|
||||
* JavaString::freeUtf()
|
||||
*/
|
||||
char *JavaString::convertUtf()
|
||||
{
|
||||
/* XXX Fixme For now, we just copy the string over byte by byte... */
|
||||
const int16 *chars = getStr();
|
||||
char *copy = new char[count+1];
|
||||
const uint16 *chars = getStr();
|
||||
int utf8len = countUtf8Chars(chars, count);
|
||||
char *utf8 = new char[utf8len];
|
||||
|
||||
int32 i;
|
||||
for (i = 0; i < count; i++)
|
||||
copy[i] = (char) chars[i];
|
||||
convertUnicodeToUtf8(utf8, chars, utf8len);
|
||||
|
||||
copy[i] = 0;
|
||||
return copy;
|
||||
return utf8;
|
||||
}
|
||||
|
||||
void JavaString::freeUtf(char *str)
|
||||
@@ -50,29 +100,96 @@ void JavaString::freeUtf(char *str)
|
||||
delete [] str;
|
||||
}
|
||||
|
||||
/* Count the number of Unicode characters in a NUL-terminated
|
||||
* UTF8 string. Don't count the final NUL character.
|
||||
*/
|
||||
static int
|
||||
countUnicodeChars(const char *utf8)
|
||||
{
|
||||
signed char c;
|
||||
int length = 0;
|
||||
|
||||
// Unicode characters are encoded as 1, 2, or 3 bytes in a UCS-2 string
|
||||
while (c = *utf8) {
|
||||
length++;
|
||||
|
||||
if (c >= 0) {
|
||||
// Characters in the range of 0..0x7f are encoded using one byte
|
||||
// b0xxxxxxx
|
||||
utf8++;
|
||||
} else if ((c & 0xe0) == 0xc0) {
|
||||
// Characters in the range 0x80..0x7ff are encoded using two bytes
|
||||
// b110xxxxx b10yyyyyy
|
||||
utf8 += 2;
|
||||
} else {
|
||||
// Characters in the range 0x800..0xffff are encoded using three bytes
|
||||
// b1110xxxx b10yyyyyy b10zzzzzz
|
||||
PR_ASSERT((c & 0xf0) == 0xe0);
|
||||
utf8 += 3;
|
||||
}
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
/* Convert a UTF-8 encoded string to Unicode (UCS-2) representation. The
|
||||
* length of the destination string, in 16-bit characters, is given by the
|
||||
* ucs2 argument. The result is *not* NUL-terminated.
|
||||
* Returns: the actual length of the resulting string, in characters.
|
||||
*/
|
||||
static int
|
||||
convertUTF8ToUnicode(uint16 *ucs2, const char *utf8, int ucs2len)
|
||||
{
|
||||
signed char c;
|
||||
int length = 0;
|
||||
|
||||
// Unicode characters are encoded as 1, 2, or 3 bytes in a UCS-2 string
|
||||
while ((c = *utf8) != 0) {
|
||||
length++;
|
||||
if (length > ucs2len)
|
||||
return ucs2len;
|
||||
|
||||
if (c >= 0) {
|
||||
// Characters in the range of 0..0x7f are encoded using one byte
|
||||
// b0xxxxxxx
|
||||
*ucs2 = c;
|
||||
utf8++;
|
||||
} else if ((c & 0xe0) == 0xc0) {
|
||||
// Characters in the range 0x80..0x7ff are encoded using two bytes
|
||||
// b110xxxxx b10yyyyyy
|
||||
*ucs2 = ((c & 0x1f) << 6) | (utf8[1] & 0x3f);
|
||||
utf8 += 2;
|
||||
} else {
|
||||
// Characters in the range 0x800..0xffff are encoded using three bytes
|
||||
// b1110xxxx b10yyyyyy b10zzzzzz
|
||||
PR_ASSERT((c & 0xf0) == 0xe0);
|
||||
*ucs2 = ((c & 0x0f) << 12) | ((utf8[1] & 0x3f) << 6) | (utf8[2] & 0x3f);
|
||||
utf8 += 3;
|
||||
}
|
||||
ucs2++;
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
/* Create a new JavaString from a char array that represents the string in UTF-8
|
||||
* format.
|
||||
*/
|
||||
JavaString::JavaString(const char *str) : JavaObject(*strType)
|
||||
{
|
||||
count = PL_strlen(str);
|
||||
count = countUnicodeChars(str);
|
||||
|
||||
offset = 0;
|
||||
|
||||
/* Let's keep the string zero-terminated anyway */
|
||||
value = (JavaArray *) newCharArray(count+1);
|
||||
int16 *chars = const_cast<int16 *>(getStr());
|
||||
value = (JavaArray *) newCharArray(count);
|
||||
uint16 *chars = const_cast<uint16 *>(getStr());
|
||||
|
||||
for (int32 i = 0; i < count; i++)
|
||||
chars[i] = str[i];
|
||||
|
||||
chars[count] = 0;
|
||||
convertUTF8ToUnicode(chars, str, count);
|
||||
}
|
||||
|
||||
/* print a textual representation of this string */
|
||||
void JavaString::dump()
|
||||
{
|
||||
const int16 *chars = getStr();
|
||||
const uint16 *chars = getStr();
|
||||
|
||||
for (int16 i = 0; i < count; i++)
|
||||
putchar(chars[i]);
|
||||
|
||||
Reference in New Issue
Block a user