February 2007

Unicode in web passwords

When we enter passwords, the text field shows only stars (or some other masking character), one for each character entered. If I use an input method that produces Unicode (especially for Indian languages), I notice that the number of stars does not follow the number of characters typed; sometimes it even decreases. A careful inspection showed that one star corresponds to one Unicode code point, and that format characters are not counted.

I tried the Malayalam word സരീക്ഷ്യൂമന്‍ using an input method tool called Keyman, which lets one enter the word by typing the roman character sequence “sareekshyooman”.

I used Windows XP Professional (SP 2) and IE 6.0. Here is what I found.

Transliteration Unicode # stars Unicode characters Explanation
s സ് 2 D38 D4D sa (D38), virama (D4D)
sa സ 1 D38 sa (D38)
sar സര്‍ 3 D38 D30 D4D 200D sa (D38), ra(D30), virama (D4D).
There may be a ZWJ(200D) also, but this doesn’t produce a star.
sare സരെ 3 D38 D30 D46 sa (D38), ra(D30), e(D46)
saree സരീ 3 D38 D30 D40 sa (D38), ra(D30), ee(D40)
sareek സരീക് 5 D38 D30 D40 D15 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D)
sareeks സരീക്സ് 7 D38 D30 D40 D15 D4D D38 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sa (D38), virama (D4D)
sareeksh സരീക്ഷ് 7 D38 D30 D40 D15 D4D D37 D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D)
sareekshy സരീക്ഷ്യ് 9 D38 D30 D40 D15 D4D D37 D4D D2F D4D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), virama(D4D)
sareekshyo സരീക്ഷ്യൊ 9 D38 D30 D40 D15 D4D D37 D4D D2F D4A sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), o(D4A)
sareekshyoo സരീക്ഷ്യൂ 9 D38 D30 D40 D15 D4D D37 D4D D2F D42 sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42)
sareekshyoom സരീക്ഷ്യൂം 10 D38 D30 D40 D15 D4D D37 D4D D2F D42 D02 sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), anuswaram(D02). Can be ma (D2E), virama(D4D) instead of anuswaram(D02).
sareekshyooma സരീക്ഷ്യൂമ 10 D38 D30 D40 D15 D4D D37 D4D D2F D42 D2E sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), ma (D2E)
sareekshyooman സരീക്ഷ്യൂമന്‍ 12 D38 D30 D40 D15 D4D D37 D4D D2F D42 D2E D28 D4D 200D sa (D38), ra(D30), ee(D40), ka(D15), virama(D4D), sha (D37), virama (D4D), ya (D2F), oo(D42), ma (D2E), na (D28) virama(D4D).
There may be ZWJ(200D) also here, but that doesn’t appear.

Unicode

Comments (0)

Permalink

Conversion between codepoints and UTF

I was browsing Wikipedia for information on UTF-16 and UTF-8 this morning, and from the specifications, I wrote some C++ routines to convert from a codepoint to UTF-8 and UTF-16 and back. These are not perfect, but they work for the test data I tried.

Code:

/*
 * utf.cxx
 *
 * (Some basic conversion routines for UTF-8 and UTF-16)
 * Author: Umesh Nair, Feb 2007
 *
 */

#include <sys/types.h>
#include <algorithm>
#include <cassert>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <vector>

using std::cout;
using std::endl;

// Element type used to hold one UTF-8 code unit.
// u_int8_t is sufficient for the values; a 16-bit type was chosen for
// displaying it nicely (presumably so that streaming a unit prints a
// number rather than a character -- TODO confirm).
typedef u_int16_t OneByte;  

// Element type used to hold one UTF-16 code unit.
typedef u_int16_t TwoByte;

// Type used to hold a code point.
typedef u_int32_t FourByte;

// UTF is a variable-length encoding, so a vector is used
// to hold the code units that encode one code point.
typedef std::vector<OneByte> utf8vec;
typedef std::vector<TwoByte> utf16vec;

// Converts a code point to UTF-16, appending the code unit(s) to utf.
//
// cp  : the code point to encode.
// utf : output vector; one unit is appended for a BMP code point, a
//       lead/trail surrogate pair for a supplementary one.
//
// Returns true on success.  Returns false, leaving utf untouched, when
// cp is not a Unicode scalar value: a surrogate code point
// (0xD800-0xDFFF) or a value above 0x10FFFF cannot be represented in
// UTF-16.
bool
codePointToUTF16(FourByte cp, utf16vec& utf)
{
   if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      return false;
   }

   if (cp < 0x10000) {
      // BMP: the code point is its own code unit.
      utf.push_back(static_cast<TwoByte>(cp));
   } else {
      // Supplementary plane: spread the 20 bits of (cp - 0x10000)
      // over a lead and a trail surrogate, 10 bits each.
      const FourByte offset = cp - 0x10000;
      utf.push_back(static_cast<TwoByte>(0xD800 + (offset >> 10)));
      utf.push_back(static_cast<TwoByte>(0xDC00 + (offset & 0x3FF)));
   }

   return true;
}

// Converts UTF-16 to codepoint
bool
UTF16ToCodePoint(const utf16vec& utf, FourByte& cp)
{
   if (utf.size() == 1) {
      // BMP
      cp = utf[0];
   } else {
      TwoByte lead = utf[0];
      TwoByte trail = utf[1];

      // Error check omitted
      cp = (lead << 10) + trail
         - ((0xD800 << 10) + 0xDC00 - 0x10000);
   }

   return true;
}

// Converts a code point to UTF-8, appending the byte(s) to utf in
// transmission order (lead byte first).
//
// cp  : the code point to encode.
// utf : output vector; 1 to 4 elements are appended, one byte each.
//
// Returns true on success.  Returns false, leaving utf untouched, when
// cp is a surrogate code point (0xD800-0xDFFF) or exceeds 0x10FFFF;
// neither has a well-formed UTF-8 encoding.
bool
codePointToUTF8(FourByte cp, utf8vec& utf)
{
   if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
      return false;
   }

   if (cp < 0x80) {
      // 0xxxxxxx
      utf.push_back(static_cast<OneByte>(cp));
   } else if (cp < 0x800) {
      // 110xxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xC0 + (cp >> 6)));
      utf.push_back(static_cast<OneByte>(0x80 + (cp & 0x3F)));
   } else if (cp < 0x10000) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xE0 + (cp >> 12)));
      utf.push_back(static_cast<OneByte>(0x80 + ((cp >> 6) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 + (cp & 0x3F)));
   } else {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      utf.push_back(static_cast<OneByte>(0xF0 + (cp >> 18)));
      utf.push_back(static_cast<OneByte>(0x80 + ((cp >> 12) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 + ((cp >> 6) & 0x3F)));
      utf.push_back(static_cast<OneByte>(0x80 + (cp & 0x3F)));
   }

   return true;
}

// Converts UTF-8 to codepoint, does validity check as well
bool
UTF8ToCodePoint(const utf8vec& utf, FourByte& cp)
{
   utf8vec::const_iterator it;

   FourByte lead = utf[0];

   size_t nBytes = utf.size();

   if (nBytes == 4) {
      if ((lead & 0xF8) != 0xF0) {
         return false;
      }
      lead = ((lead - 0xF0) << 18);
   } else if (nBytes == 3) {
      if ((lead& 0xF0) != 0xE0) {
         return false;
      }
      lead = ((lead - 0xE0) << 12);
   } else if (nBytes == 2) {
      if ((lead & 0xE0) != 0xC0) {
         return false;
      }
      lead = ((lead - 0xC0) << 6);
   } else if (nBytes == 1) {
      if ((lead & 0x80) != 0) {
         return false;
      }
      lead = lead;
   } else {
      assert(false);
   }

   FourByte value = 0;
   if (nBytes > 1) {
      for (it = utf.begin() + 1; it != utf.end(); ++it) {
         value <<= 6;
         FourByte bitmask = (*it & 0x3F);
         value += bitmask;
      }
   }
   cp = lead + value;

   return true;
}

// Round-trips one code point through UTF-16 and UTF-8 and prints every
// intermediate result to cout.
//
// cp : the code point to test.
//
// All numbers are printed in hexadecimal; std::hex is left set on cout
// when the function returns.
void
testACodePoint(FourByte cp)
{
   cout << "\n\nTESTING Code point "
      << std::hex << cp << endl;

   // Only read after the corresponding decode call succeeded.
   FourByte cp8, cp16;

   utf16vec utf16;

   if (codePointToUTF16(cp, utf16)) {
      cout << "UTF-16 : ";
      // ostream_iterator needs its element type spelled out; it cannot
      // be deduced from the constructor arguments.
      std::copy(utf16.begin(), utf16.end(),
         std::ostream_iterator<TwoByte>(cout, " "));
      cout << endl;
      if (UTF16ToCodePoint(utf16, cp16)) {
         cout << "Code point : "
            << std::hex << cp16 << endl;
      } else {
         cout << "Error in converting from UTF-16." << endl;
      }

   } else {
      cout << "Error in converting to UTF-16." << endl;
   }

   utf8vec utf8;

   if (codePointToUTF8(cp, utf8)) {
      cout << "UTF-8 : ";
      std::copy(utf8.begin(), utf8.end(),
         std::ostream_iterator<OneByte>(cout, " "));
      cout << endl;
      if (UTF8ToCodePoint(utf8, cp8)) {
         cout << "Code point : "
            << std::hex << cp8 << endl;
      } else {
         cout << "Error in converting from UTF-8." << endl;
      }

   } else {
      cout << "Error in converting to UTF-8." << endl;
   }
}

// Test program
int main()
{
   testACodePoint(0x10000);
   testACodePoint(0x10FFFD);;
   testACodePoint(0x64321);;
   testACodePoint(0x05D0);;
}

Output:


[ 85 ] $ a.out

TESTING Code point 10000
UTF-16 : d800 dc00
Code point : 10000
UTF-8 : f0 90 80 80
Code point : 10000

TESTING Code point 10fffd
UTF-16 : dbff dffd
Code point : 10fffd
UTF-8 : f4 8f bf bd
Code point : 10fffd

TESTING Code point 64321
UTF-16 : d950 df21
Code point : 64321
UTF-8 : f1 a4 8c a1
Code point : 64321

TESTING Code point 5d0
UTF-16 : 5d0
Code point : 5d0
UTF-8 : d7 90
Code point : 5d0

C/C++
Programming
Unicode

Comments (0)

Permalink

Calculating number of combinations

The well-known formula for calculating the number of combinations of n objects taken r at a time is

$$\binom{n}{r} = \frac{n!}{r!\,(n-r)!}$$

but this requires calculating three factorials which may be expensive and may cause overflow. A better way is to use the other definition

$$\binom{n}{r} = \frac{n\,(n-1)\,(n-2)\cdots(n-r+1)}{1\cdot 2\cdot 3\cdots r}$$

and doing multiplication and division alternately. An example is the C++ code below:

// Returns the number of combinations of n objects taken r at a time.
//
// n : number of objects.
// r : number of objects chosen.
//
// Returns 0 for requests that have no combinations (n < 0, r < 0 or
// r > n), and 1 for r == 0 or r == n.  The result is only meaningful
// when the true value fits in an int.
int nCr (int n, int r)
{
   // Handle invalid values
   if (n < 0 || r < 0 || r > n) { return 0; }

   // nCr = nC(n-r); pick the smaller of the two to shorten the loop
   if (n - r < r) { r = n - r; }

   // Multiply and divide alternately.  After step k the running value
   // is C(n-r+k, k), so every division is exact.  A wider intermediate
   // keeps the product from overflowing before the division brings it
   // back into range.
   long long ncr = 1;
   for (int k = 1; k <= r; ++k) {
      ncr = ncr * (n - r + k) / k;
   }
   return static_cast<int>(ncr);
}

C/C++
Mathematics

Comments (0)

Permalink

Surviving with Emacs/VM mail client where everybody else uses MS Outlook

Using Emacs and VM for e-mail in a company that has a lot of MS Outlook employees can be a difficult task. Microsoft Outlook handles e-mails in a non-standard way in many cases. Reading e-mails properly and making your e-mails properly read by others can be tough tasks. This post addresses some issues.

Removing HTML part from text messages

Outlook always includes an HTML portion of the message as well, even if the message was composed as “Plain text”. This function removes the HTML part, preserving only the text portion.

(defun upn-mail-remove-html ()
  "Remove the HTML portion from the current reply buffer.
Search the buffer, ignoring case, for the first \"<html\" tag and
delete everything from there through the following \"</html>\" tag.
Do nothing when either tag is missing.  Point is preserved."
  (interactive)
  ;; NOTE(review): the tag strings in the published listing were lost
  ;; (the search string was empty and `beg' was never bound); the
  ;; \"<html\" ... \"</html>\" pair is a reconstruction -- confirm.
  ;; Let-binding `case-fold-search' restores the old value on any exit.
  (let ((case-fold-search t))
    (save-excursion
      (goto-char (point-min))
      (when (search-forward "<html" nil t)
        (let ((beg (match-beginning 0)))
          (when (search-forward "</html>" nil t)
            (delete-region beg (point))))))))

The above function can be conveniently added in the supercite’s post-hook so that it will be done after supercite prepares the reply buffer.

(add-hook 'sc-post-hook 'upn-mail-remove-html)

While all other mail clients show a name as “<FirstName> <LastName>”, Outlook shows it as “<LastName>, <FirstName>”, which breaks VM’s handling of names and prevents it from generating the right supercite prefixes. The following constants and function extract the first name and last name from a name in the Outlook format.

;; The following one matches the name from MS Outlook.
;; Group 1 is a run of characters other than comma and double quote,
;; followed by a comma and at least one space or tab; group 2 is another
;; such run.  `upn-get-ms-outlook-name' treats group 1 as the last name
;; and group 2 as the first name.
(defconst upn-ms-outlook-name-regexp "\\([^,\"]+\\),[ \t]+\\([^,\"]+\\)")

;; The following one matches the name from all "sensible" mailers.
;; Groups 1 and 2 are runs of characters other than double quote, `<'
;; and `>', separated by at least one space or tab.
;; `upn-get-normal-name' treats group 1 as the first name and group 2
;; as the last name.
(defconst upn-normal-name-regexp "\\([^\"<>]+\\)[ \t]+\\([^\"<>]+\\)")

(defun upn-get-ms-outlook-name (fromline)
  "Parse FROMLINE as a name in MS Outlook format (\"Last, First\").
Return a list (FIRST LAST) when FROMLINE matches
`upn-ms-outlook-name-regexp', or nil if it doesn't match."
  (when (string-match upn-ms-outlook-name-regexp fromline)
    ;; Outlook puts the last name before the comma, so the groups are
    ;; swapped to get first name, then last name.
    (let ((last-name (match-string 1 fromline))
          (first-name (match-string 2 fromline)))
      (list first-name last-name))))

This function extracts the same information from a name in the normal format.

(defun upn-get-normal-name (fromline)
  "Parse FROMLINE as a name in the normal format (\"First Last\").
Return a list (FIRST LAST) when FROMLINE matches
`upn-normal-name-regexp', or nil if it doesn't match."
  (when (string-match upn-normal-name-regexp fromline)
    ;; The groups are already in first name, last name order.
    (let ((first-name (match-string 1 fromline))
          (last-name (match-string 2 fromline)))
      (list first-name last-name))))

The following function can be used to extract the name info from either format.

(defun upn-get-first-last-name (fromline)
  "Return (FIRST LAST) parsed from FROMLINE, the From: line of a mail header.
Handles normal names and names from MS Outlook.  The Outlook format
is tried first, then the normal one.  Return nil when neither
matches, so the calling function can take action."
  ;; Most senders here go through Exchange, so check that format first.
  (or (upn-get-ms-outlook-name fromline)
      (upn-get-normal-name fromline)))

The following function generates a word from the first name and the initial showing the last name from a name of either format, to be used as the supercite prefix.

(defun upn-get-FirstL-name (namestring)
  "Build an attribution string from NAMESTRING, a From: header name.
The result is the capitalized first name followed by the upcased
first letter of the last name.  When no first and last name can be
extracted, NAMESTRING itself is returned.  This is used as the
attribution if one has not been provided."
  (let ((names (upn-get-first-last-name namestring)))
    (if names
        (concat (capitalize (nth 0 names))
                (upcase (substring (nth 1 names) 0 1)))
      namestring)))

Emacs

Comments (0)

Permalink