tubestation/extensions/xmlterm/lineterm/unistring.c

/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "MPL"); you may not use this file
 * except in compliance with the MPL. You may obtain a copy of
 * the MPL at http://www.mozilla.org/MPL/
 *
 * Software distributed under the MPL is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the MPL for the specific language governing
 * rights and limitations under the MPL.
 *
 * The Original Code is lineterm.
 *
 * The Initial Developer of the Original Code is Ramalingam Saravanan.
 * Portions created by Ramalingam Saravanan <svn@xmlterm.org> are
 * Copyright (C) 1999 Ramalingam Saravanan. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the
 * terms of the GNU General Public License (the "GPL"), in which case
 * the provisions of the GPL are applicable instead of
 * those above. If you wish to allow use of your version of this
 * file only under the terms of the GPL and not to allow
 * others to use your version of this file under the MPL, indicate
 * your decision by deleting the provisions above and replace them
 * with the notice and other provisions required by the GPL.
 * If you do not delete the provisions above, a recipient
 * may use your version of this file under either the MPL or the
 * GPL.
 */

/* unistring.c: Unicode string operations implementation */

/* public declarations */
#include "unistring.h"

/* private declarations */

/** Encodes Unicode string US with NUS characters into UTF8 string S with
 * upto NS characters, returning the number of REMAINING Unicode characters
 * and the number of ENCODED Utf8 characters
 */
void ucstoutf8(const UNICHAR* us, int nus, char* s, int ns,
               int* remaining, int* encoded)
{
  int j, k;

  j = 0;
  k = 0;
  while ((j < ns) && (k < nus)) {
    UNICHAR uch = us[k++];

    if (uch < 0x0080) {
      s[j++] = uch;

    } else if (uch < 0x0800) {
      if (j >= ns-1) break;
      s[j++] = ((uch & 0x07C0) >>  6) | 0xC0;
      s[j++] =  (uch & 0x003F)        | 0x80;

    } else {
      if (j >= ns-2) break;
      s[j++] = ((uch & 0xF000) >> 12) | 0xE0;
      s[j++] = ((uch & 0x0FC0) >>  6) | 0x80;
      s[j++] =  (uch & 0x003F)        | 0x80;
    }
  }

  if (remaining)
    *remaining = nus - k;

  if (encoded)
    *encoded = j;
}


/** Decodes UTF8 string S with NS characters to Unicode string US with
 * upto NUS characters, returning the number of REMAINING Utf8 characters
 * and the number of DECODED Unicode characters.
 * If skipNUL is non-zero, NUL input characters are skipped.
 * returns 0 if successful,
 *        -1 if an error occurred during decoding
 */
int utf8toucs(const char* s, int ns, UNICHAR* us, int nus,
              int skipNUL, int* remaining, int* decoded)
{
  int j, k;
  int retcode = 0;

  j = 0;
  k = 0;
  while ((j < ns) && (k < nus)) {
    char ch = s[j];

    if (0x80 & ch) {
      if (0x40 & ch) {
        if (0x20 & ch) {
          /* consume 3 */
          if (j >= ns-2) break;

          if ( (s[j+1] & 0x40) || !(s[j+1] & 0x80) ||
               (s[j+2] & 0x40) || !(s[j+2] & 0x80) ) {
            retcode = -1;
          }

          us[k++] =   ((ch     & 0x0F) << 12)
                    | ((s[j+1] & 0x3F) << 6)
                    | ( s[j+2] & 0x3F);

          j += 3;

        } else {
          /* consume 2 */
          if (j >= ns-1) break;

          if ( (s[j+1] & 0x40) || !(s[j+1] & 0x80) ) {
            retcode = -1;
          }

          us[k++] =   ((ch     & 0x1F) << 6)
                    | ( s[j+1] & 0x3F);
          j += 2;
        }

      } else {
        /* consume 1 (error) */
        retcode = -1;
        j++;
      }

    } else {
      /* consume 1 */
      if (ch || !skipNUL) {
        us[k++] = ch;
      }
      j++;
    }
  }

  if (remaining)
    *remaining = ns - j;

  if (decoded)
    *decoded = k;

  return retcode;
}


/** Prints Unicode string US with NUS characters to file stream STREAM,
 * escaping non-printable ASCII characters and all non-ASCII characters
 */
void ucsprint(FILE* stream, const UNICHAR* us, int nus)
{
  static const char hexDigits[17] = "0123456789abcdef";
  UNICHAR uch;
  int k;

  for (k=0; k<nus; k++) {
    uch = us[k];

    if (uch < (UNICHAR)U_SPACE) {
      /* ASCII control character */
      fprintf(stream, "^%c", (char) uch+U_ATSIGN);

    } else if (uch == (UNICHAR)U_CARET) {
      /* Caret */
      fprintf(stream, "^^");

    } else if (uch < (UNICHAR)U_DEL) {
      /* Printable ASCII character */
      fprintf(stream, "%c", (char) uch);

    } else {
      /* DEL or non-ASCII character */
      char esc_str[8]="&#0000;";
      int j;
      for (j=5; j>1; j--) {
        esc_str[j] = hexDigits[uch%16];
        uch = uch / 16;
      }
      fprintf(stream, "%s", esc_str);
    }
  }
}


/** Copy exactly n characters from plain character source string to UNICHAR
 * destination string, ignoring source characters past a null character and
 * padding the destination with null characters if necessary.
 */
UNICHAR* ucscopy(register UNICHAR* dest, register const char* srcplain,
                 size_t n)
{
  register UNICHAR ch;
  register const UNICHAR* destmx = dest + n;

  /* Copy characters from source to destination, stopping at NUL */
  while (dest < destmx) {
    *dest++ = (ch = *srcplain++);
    if (ch == U_NUL)
      break;
  }

  /* Pad with NULs, if necessary */
  while (dest < destmx)
    *dest++ = U_NUL;

  return dest;
}


#ifndef USE_WCHAR
/** Locates first occurrence of character within string and returns pointer
 * to it if found, else returning null pointer. (character may be NUL)
 */
UNICHAR* ucschr(register const UNICHAR* str, register const UNICHAR chr)
{
  do {
    if (*str == chr)
      return (UNICHAR*) str;
  } while (*str++ != U_NUL);

  return NULL;
}


/** Locates last occurrence of character within string and returns pointer
 * to it if found, else returning null pointer. (character may be NUL)
 */
UNICHAR* ucsrchr(register const UNICHAR* str, register const UNICHAR chr)
{
  const UNICHAR* retstr = NULL;
  do {
    if (*str == chr)
      retstr = str;
  } while (*str++ != U_NUL);

  return (UNICHAR*) retstr;
}


/** Compare all characters between string1 and string2, returning
 * a zero value if all characters are equal, or returning
 * character1 - character2 for the first character that is different
 * between the two strings.
 * (Characters following a null character are not compared.)
 */
int ucscmp(register const UNICHAR* str1, register const UNICHAR* str2)
{
  register UNICHAR ch1, ch2;

  do {
    if ((ch1 = *str1++) != (ch2 = *str2++))
      return ch1 - ch2;

  } while (ch1 != U_NUL);

  return 0;
}


/** Compare upto n characters between string1 and string2, returning
 * a zero value if all compared characters are equal, or returning
 * character1 - character2 for the first character that is different
 * between the two strings.
 * (Characters following a null character are not compared.)
 */
int ucsncmp(register const UNICHAR* str1, register const UNICHAR* str2,
            size_t n)
{
  register UNICHAR ch1, ch2;
  register const UNICHAR* str1mx = str1 + n;

  while (str1 < str1mx) {
    if ((ch1 = *str1++) != (ch2 = *str2++))
      return ch1 - ch2;

    if (ch1 == U_NUL)
      break;
  }

  return 0;
}


/** Copy exactly n characters from source to destination, ignoring source
 * characters past a null character and padding the destination with null
 * characters if necessary.
 */
UNICHAR* ucsncpy(register UNICHAR* dest, register const UNICHAR* src,
                 size_t n)
{
  register UNICHAR ch;
  register const UNICHAR* destmx = dest + n;

  /* Copy characters from source to destination, stopping at NUL */
  while (dest < destmx) {
    *dest++ = (ch = *src++);
    if (ch == U_NUL)
      break;
  }

  /* Pad with NULs, if necessary */
  while (dest < destmx)
    *dest++ = U_NUL;

  return dest;
}


/** Returns string length
 */
size_t ucslen(const UNICHAR* str)
{
  register const UNICHAR* strcp = str;

  while (*strcp++ != U_NUL);

  return strcp - str - 1;
}


/** Locates substring within string and returns pointer to it if found,
 * else returning null pointer. If substring has zero length, then full
 * string is returned.
 */
UNICHAR* ucsstr(register const UNICHAR* str, const UNICHAR* substr)
{
  register UNICHAR subch1, ch;

  /* If null substring, return string */
  if (*substr == U_NUL)
    return (UNICHAR*) str;

  /* First character of non-null substring */
  subch1 = *substr;

  if ((ch = *str) == U_NUL)
    return NULL;

  do {

    if (ch == subch1) {
      /* First character matches; check if rest of substring matches */
      register const UNICHAR* strcp = str;
      register const UNICHAR* substrcp = substr;
      do {
        substrcp++;
        strcp++;
        if (*substrcp == U_NUL)
          return (UNICHAR*) str;
      } while (*substrcp == *strcp);
    }

  } while ((ch = *(++str)) != U_NUL);

  return NULL;
}


/** Returns length of longest initial segment of string that contains
 * only the specified characters.
 */
size_t ucsspn(const UNICHAR* str, const UNICHAR* chars)
{
  register UNICHAR strch, ch;
  register const UNICHAR* charscp;
  register const UNICHAR* strcp = str;

  while ((strch = *strcp++) != U_NUL) {
    charscp = chars;

    /* Check that it is one of the specified characters */
    while ((ch = *charscp++) != U_NUL) {
      if (strch == ch)
        break;
    }
    if (ch == U_NUL)
      return (size_t) (strcp - str - 1);
  }

  return (size_t) (strcp - str - 1);
}


/** Returns length of longest initial segment of string that does not
 * contain any of the specified characters.
 */
size_t ucscspn(const UNICHAR* str, const UNICHAR* chars)
{
  register UNICHAR strch, ch;
  register const UNICHAR* charscp;
  register const UNICHAR* strcp = str;

  while ((strch = *strcp++) != U_NUL) {
    charscp = chars;

    /* Check that it is not one of the specified characters */
    while ((ch = *charscp++) != U_NUL) {
      if (strch == ch)
        return (size_t) (strcp - str - 1);
    }
  }

  return (size_t) (strcp - str - 1);
}
#endif  /* !USE_WCHAR */