Unicode

Unicode in web passwords

When we enter passwords, the text field shows only stars (or some other masking character), one for each character entered. If I use an input method that produces Unicode (especially for Indian languages), I notice that the number of stars changes in a different pattern than the characters entered (sometimes the number even decreases). A careful inspection showed that one star corresponds to one Unicode code point, and that format characters are not counted.

I tried the Malayalam word സരീക്ഷ്യൂമന്‍ using an input method tool called Keyman, which allows one to enter the word by typing the Roman character sequence “sareekshyooman”.

I used Windows XP Professional (SP 2) and IE 6.0. Here is what I found.

Transliteration Unicode # stars Unicode characters Explanation
s സ് 2 D38 D4D sa (D38), virama (D4D)
sa സ 1 D38 sa (D38)
sar സര്‍ 3 D38 D30 D4D 200D sa (D38), ra(D30), virama (D4D).
There may be a ZWJ(200D) also, but this doesn’t produce a star.
sare സരെ 3 D38 D30 D46 sa (D38), ra(D30), e(D46)
saree സരീ 3 D38 D30 D40 sa (D38), ra(D30), ee(D40)
sareek സരീക് 5 D38 D30 D40 D15 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D)
sareeks സരീക്സ് 7 D38 D30 D40 D15 D4D D38 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sa (D38), virama (D4D)
sareeksh സരീക്ഷ് 7 D38 D30 D40 D15 D4D D37 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D)
sareekshy സരീക്ഷ്യ് 9 D38 D30 D40 D15 D4D D37 D4D D2F D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), virama(D4D)
sareekshyo സരീക്ഷ്യൊ 9 D38 D30 D40 D15 D4D D37 D4D D2F D4A sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), o(D4A)
sareekshyoo സരീക്ഷ്യൂ 9 D38 D30 D40 D15 D4D D37 D4D D2F D42 sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42)
sareekshyoom സരീക്ഷ്യൂം 10 D38 D30 D40 D15 D4D D37 D4D D2F D42 D02 sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), anuswaram(D02). Can be ma (D2E), virama(D4D) instead of anuswaram(D02).
sareekshyooma സരീക്ഷ്യൂമ 10 D38 D30 D40 D15 D4D D37 D4D D2F D42 D2E sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), ma (D2E)
sareekshyooman സരീക്ഷ്യൂമന്‍ 12 D38 D30 D40 D15 D4D D37 D4D D2F D42 D2E D28 D4D 200D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), ma (D2E), na (D28), virama(D4D).
There may be a ZWJ (200D) here as well, but it does not produce a star.

Unicode

Comments (0)

Permalink

Conversion between codepoints and UTF

I was browsing Wikipedia for information on UTF-16 and UTF-8 this morning, and from the specifications, I wrote some C++ routines to convert from a code point to UTF-8 and UTF-16 and back. These are not perfect, but they work for the test data I tried.

Code:

/*
 * utf.cxx
 *
 * (Some basic conversion routines for UTF-8 and UTF-16)
 * Author: Umesh Nair, Feb 2007
 *
 */

#include <sys/types.h>

#include <algorithm>
#include <cassert>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <vector>

using std::cout;
using std::endl;

// One UTF-8 code unit. u_int8_t would be sufficient to hold it;
// the wider type is used so that the values display nicely when
// streamed (an 8-bit type is typically printed as a character).
typedef u_int16_t OneByte;  

// One UTF-16 code unit.
typedef u_int16_t TwoByte;

// A Unicode code point.
typedef u_int32_t FourByte;

// UTF is a variable-length encoding, so a vector is used to hold
// the code units that together encode one code point.
typedef std::vector<OneByte> utf8vec;
typedef std::vector<TwoByte> utf16vec;

// Converts a code point to UTF-16, appending the code unit(s) to utf.
//
// cp  : the code point to encode.
// utf : output vector; one unit is appended for a BMP code point,
//       a lead/trail surrogate pair for a supplementary one.
//
// Returns true on success. Returns false, leaving utf untouched, if
// cp is not encodable: it lies in the surrogate range D800..DFFF or
// is greater than 10FFFF.
bool
codePointToUTF16(FourByte cp, utf16vec& utf)
{
   // A surrogate code point written as a single unit would be
   // indistinguishable from half of a surrogate pair.
   if (cp >= 0xD800 && cp <= 0xDFFF) {
      return false;
   }

   // A surrogate pair carries 20 bits on top of 0x10000, so nothing
   // beyond 0x10FFFF can be represented.
   if (cp > 0x10FFFF) {
      return false;
   }

   if (cp < 0x10000) {
      // BMP: the code point is its own code unit
      utf.push_back(static_cast<TwoByte>(cp));
   } else {
      // Supplementary planes: split the 20-bit offset from 0x10000
      // into the high 10 bits (lead) and the low 10 bits (trail).
      const FourByte offset = cp - 0x10000;
      utf.push_back(static_cast<TwoByte>(0xD800 + (offset >> 10)));
      utf.push_back(static_cast<TwoByte>(0xDC00 + (offset & 0x3FF)));
   }

   return true;
}

// Converts UTF-16 to codepoint
bool
UTF16ToCodePoint(const utf16vec& utf, FourByte& cp)
{
   if (utf.size() == 1) {
      // BMP
      cp = utf[0];
   } else {
      TwoByte lead = utf[0];
      TwoByte trail = utf[1];

      // Error check omitted
      cp = (lead << 10) + trail
         - ((0xD800 << 10) + 0xDC00 - 0x10000);
   }

   return true;
}

// Converts a code point to UTF-8, appending the byte(s) to utf,
// lead byte first.
//
// cp  : the code point to encode.
// utf : output vector; between one and four bytes are appended.
//
// Returns true on success. Returns false, leaving utf untouched, if
// cp is not encodable: it lies in the surrogate range D800..DFFF or
// is greater than 10FFFF.
bool
codePointToUTF8(FourByte cp, utf8vec& utf)
{
   if ((cp >= 0xD800 && cp <= 0xDFFF) || cp > 0x10FFFF) {
      return false;
   }

   if (cp < 0x80) {
      // 0xxxxxxx
      utf.push_back(static_cast<OneByte>(cp));
   } else if (cp < 0x800) {
      // 110xxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xC0 | (cp >> 6)));
      utf.push_back(static_cast<OneByte>(0x80 | (cp & 0x3F)));
   } else if (cp < 0x10000) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xE0 | (cp >> 12)));
      utf.push_back(static_cast<OneByte>(0x80 | ((cp >> 6) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 | (cp & 0x3F)));
   } else {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xF0 | (cp >> 18)));
      utf.push_back(static_cast<OneByte>(0x80 | ((cp >> 12) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 | ((cp >> 6) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 | (cp & 0x3F)));
   }

   return true;
}

// Converts UTF-8 to codepoint, does validity check as well
bool
UTF8ToCodePoint(const utf8vec& utf, FourByte& cp)
{
   utf8vec::const_iterator it;

   FourByte lead = utf[0];

   size_t nBytes = utf.size();

   if (nBytes == 4) {
      if ((lead & 0xF8) != 0xF0) {
         return false;
      }
      lead = ((lead - 0xF0) << 18);
   } else if (nBytes == 3) {
      if ((lead& 0xF0) != 0xE0) {
         return false;
      }
      lead = ((lead - 0xE0) << 12);
   } else if (nBytes == 2) {
      if ((lead & 0xE0) != 0xC0) {
         return false;
      }
      lead = ((lead - 0xC0) << 6);
   } else if (nBytes == 1) {
      if ((lead & 0x80) != 0) {
         return false;
      }
      lead = lead;
   } else {
      assert(false);
   }

   FourByte value = 0;
   if (nBytes > 1) {
      for (it = utf.begin() + 1; it != utf.end(); ++it) {
         value <<= 6;
         FourByte bitmask = (*it & 0x3F);
         value += bitmask;
      }
   }
   cp = lead + value;

   return true;
}

// Round-trips one code point through UTF-16 and UTF-8 and prints
// every step to standard output in hexadecimal.
//
// cp : the code point to test.
//
// For each encoding, the code units are printed followed by the
// code point decoded back from them; a failed conversion in either
// direction is reported with an error line instead.
void
testACodePoint(FourByte cp)
{
   cout << "\n\nTESTING Code point "
      << std::hex << cp << endl;

   utf16vec utf16;

   if (codePointToUTF16(cp, utf16)) {
      cout << "UTF-16 : ";
      // ostream_iterator needs its element type spelled out; it
      // cannot be deduced from the constructor arguments.
      std::copy(utf16.begin(), utf16.end(),
         std::ostream_iterator<TwoByte>(cout, " "));
      cout << endl;

      FourByte cp16 = 0;
      if (UTF16ToCodePoint(utf16, cp16)) {
         cout << "Code point : "
            << std::hex << cp16 << endl;
      } else {
         cout << "Error in converting from UTF-16." << endl;
      }

   } else {
      cout << "Error in converting to UTF-16." << endl;
   }

   utf8vec utf8;

   if (codePointToUTF8(cp, utf8)) {
      cout << "UTF-8 : ";
      std::copy(utf8.begin(), utf8.end(),
         std::ostream_iterator<OneByte>(cout, " "));
      cout << endl;

      FourByte cp8 = 0;
      if (UTF8ToCodePoint(utf8, cp8)) {
         cout << "Code point : "
            << std::hex << cp8 << endl;
      } else {
         cout << "Error in converting from UTF-8." << endl;
      }

   } else {
      cout << "Error in converting to UTF-8." << endl;
   }
}

// Test program
int main()
{
   testACodePoint(0x10000);
   testACodePoint(0x10FFFD);;
   testACodePoint(0x64321);;
   testACodePoint(0x05D0);;
}

Output:


[ 85 ] $ a.out

TESTING Code point 10000
UTF-16 : d800 dc00
Code point : 10000
UTF-8 : f0 90 80 80
Code point : 10000

TESTING Code point 10fffd
UTF-16 : dbff dffd
Code point : 10fffd
UTF-8 : f4 8f bf bd
Code point : 10fffd

TESTING Code point 64321
UTF-16 : d950 df21
Code point : 64321
UTF-8 : f1 a4 8c a1
Code point : 64321

TESTING Code point 5d0
UTF-16 : 5d0
Code point : 5d0
UTF-8 : d7 90
Code point : 5d0

C/C++
Programming
Unicode

Comments (0)

Permalink