Languages Around The World

Collation Examples

Simple Collation Sample Customization

The following programs demonstrate how to compare strings and create sort keys with a collator opened for a specific locale (en_US in these samples).

In C:

       #include <stdio.h>
       #include <memory.h>
       #include <string.h>
       #include "unicode/ustring.h"
       #include "unicode/utypes.h"
       #include "unicode/uloc.h"
       #include "unicode/ucol.h"
       #define MAXBUFFERSIZE 100
       #define BIGBUFFERSIZE 5000
       UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
       {
           UChar         dispName    [MAXBUFFERSIZE]; 
           int32_t       bufferLen   = 0;
           UChar         source            [MAXBUFFERSIZE];
           UChar         target            [MAXBUFFERSIZE];
           UCollationResult result   = UCOL_EQUAL;
           uint8_t             sourceKeyArray    [MAXBUFFERSIZE];
           uint8_t             targetKeyArray    [MAXBUFFERSIZE]; 
           int32_t       sourceKeyOut      = 0, 
                       targetKeyOut = 0;
           UCollator     *myCollator = 0;
           if (U_FAILURE(*status))
           {
               return FALSE;
           }
           u_uastrcpy(source, "This is a test.");
           u_uastrcpy(target, "THIS IS A TEST.");
           myCollator = ucol_open(locale, status);
           if (U_FAILURE(*status)){
               bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status);
               /*Report the error with display name... */
               fprintf(stderr,
               "Failed to create the collator for : \"%s\"\n", dispName);
               return FALSE;
           }
           result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
           /* result is 1, secondary differences only for ignorable space characters*/
           if (result != UCOL_LESS)
           {
               fprintf(stderr,
               "Comparing two strings with only secondary differences in C failed.\n");
               return FALSE;
           }
           /* To compare them with just primary differences */
           ucol_setStrength(myCollator, UCOL_PRIMARY);
           result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target));
           /* result is 0 */
           if (result != 0)
           {
               fprintf(stderr,
               "Comparing two strings with no differences in C failed.\n");
               return FALSE;
           }

           /* Now, do the same comparison with keys */            sourceKeyOut = ucol_getSortKey(myCollator, source, -1, sourceKeyArray, MAXBUFFERSIZE);            targetKeyOut = ucol_getSortKey(myCollator, target, -1, targetKeyArray, MAXBUFFERSIZE);            result = 0;            result = strcmp(sourceKeyArray, targetKeyArray);            if (result != 0)            {                fprintf(stderr,                "Comparing two strings with sort keys in C failed.\n");                return FALSE;            }            ucol_close(myCollator);            return TRUE;        }

In C++:

       #include <stdio.h>
       #include "unicode/unistr.h"
       #include "unicode/utypes.h"
       #include "unicode/locid.h"
       #include "unicode/coll.h"
       #include "unicode/tblcoll.h"
       #include "unicode/coleitr.h"
       #include "unicode/sortkey.h"
       UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
       {
           UnicodeString dispName; 
           UnicodeString source("This is a test.");
           UnicodeString target("THIS IS A TEST.");
           Collator::EComparisonResult result    = Collator::EQUAL;
           CollationKey sourceKey;
           CollationKey targetKey; 
           Collator      *myCollator = 0;
           if (U_FAILURE(status))
           {
               return FALSE;
           }
           myCollator = Collator::createInstance(locale, status);
           if (U_FAILURE(status)){
               locale.getDisplayName(dispName);
               /*Report the error with display name... */
               fprintf(stderr,
               "%s: Failed to create the collator for : \"%s\"\n", dispName);
               return FALSE;
           }
           result = myCollator->compare(source, target);
           /* result is 1, secondary differences only for ignorable space characters*/
           if (result != UCOL_LESS)
           {
               fprintf(stderr,
               "Comparing two strings with only secondary differences in C failed.\n");
               return FALSE;
           }
           /* To compare them with just primary differences */
           myCollator->setStrength(Collator::PRIMARY);
           result = myCollator->compare(source, target);
           /* result is 0 */
           if (result != 0)
           {
               fprintf(stderr,
               "Comparing two strings with no differences in C failed.\n");
               return FALSE;
           }
           /* Now, do the same comparison with keys */
           myCollator->getCollationKey(source, sourceKey, status);
           myCollator->getCollationKey(target, targetKey, status);
           result = Collator::EQUAL;

           result = sourceKey.compareTo(targetKey);            if (result != 0)            {                fprintf(stderr,                "%s: Comparing two strings with sort keys in C failed.\n");                return FALSE;            }            delete myCollator;            return TRUE;        }

Main Function:

       extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
       int main()
       {
          UErrorCode status = U_ZERO_ERROR;
          fprintf(stdout, "\n");
          if (collateWithLocaleInCPP(Locale("en", "US"), status) != TRUE)
          {
               fprintf(stderr,
               "Collate with locale in C++ failed.\n");
          } else 
          {
              fprintf(stdout, "Collate with Locale C++ example worked!!\n");
          }
          status = U_ZERO_ERROR;
          fprintf(stdout, "\n");
          if (collateWithLocaleInC("en_US", &status) != TRUE)
          {
               fprintf(stderr,
               "%s: Collate with locale in C failed.\n");
          } else 
          {
              fprintf(stdout, "Collate with Locale C example worked!!\n");
          }
          return 0;
       } 

In Java:

       
            import com.ibm.icu.text.Collator;
            import com.ibm.icu.text.CollationElementIterator;
            import com.ibm.icu.text.CollationKey;
            import java.util.Locale;

            public class CollateExample
            {
           
                public static void main(String arg[]) 
                {
                    CollateExample example = new CollateExample();
                    try {
                        if (!example.collateWithLocale(Locale.US)) {
                            System.err.println("Collate with locale example failed.");
                        } 
                        else {
                            System.out.println("Collate with Locale example worked!!");
                        }
                    } catch (Exception e) {
                        System.err.println("Collating with locale failed");
                        e.printStackTrace();
                    }
                }
       
                public boolean collateWithLocale(Locale locale) throws Exception
                {
                    String source = "This is a test.";
                    String target = "THIS IS A TEST.";
                    Collator myCollator = Collator.getInstance(locale);

                    int result = myCollator.compare(source, target);                     // result is 1, secondary differences only for ignorable space characters                     if (result >= 0) {                         System.err.println(                             "Comparing two strings with only secondary differences failed.");                         return false;                     }                     // To compare them with just primary differences                     myCollator.setStrength(Collator.PRIMARY);                     result = myCollator.compare(source, target);                     // result is 0                     if (result != 0) {                         System.err.println(                                        "Comparing two strings with no differences failed.");                         return false;                     }                     // Now, do the same comparison with keys                     CollationKey sourceKey = myCollator.getCollationKey(source);                     CollationKey targetKey = myCollator.getCollationKey(target);                     result = sourceKey.compareTo(targetKey);                     if (result != 0) {                         System.err.println("Comparing two strings with sort keys failed.");                         return false;                     }                     return true;                 }               }            

Language-sensitive searching

String searching is a well-researched area, and there are algorithms that can optimize the searching process. Perhaps the best is the Boyer-Moore method. For a full description of the concepts behind the sample programs, please see Laura Werner's text searching article (http://icu-project.org/docs/papers/efficient_text_searching_in_java.html).

The source code for language-sensitive text searching based on the ICU Collation Service can be found on the Internet at http://source.icu-project.org/repos/icu/icu/trunk/source/i18n/usearch.cpp.

Using large buffers to manage sort keys

A good solution for the problem of not knowing the sort key size in advance is to allocate a large buffer and store all the sort keys there, while keeping a list of indexes or pointers to that buffer.

The following sample code takes a pointer to an array of UChar pointers (the source strings) and an array that receives the index of each key. It allocates a buffer, fills it with the sort keys, and reports the maximum size of a sort key. Once you have done this for your strings, you just need to allocate fields of that maximum size and copy your sort keys from the buffer to the fields.

uint32_t 

fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, uint32_t sourceSize,                             uint8_t **buffer, uint32_t *maxSize, UErrorCode *status) {   if(status == NULL || U_FAILURE(*status)) {     return 0;   }   uint32_t bufferSize = 16384;   uint32_t increment = 16384;   uint32_t currentOffset = 0;   uint32_t keySize = 0;   uint32_t i = 0;   *maxSize = 0;   *buffer = (uint8_t *)malloc(bufferSize * sizeof(uint8_t));   if(buffer == NULL) {     *status = U_MEMORY_ALLOCATION_ERROR;     return 0;   }   for(i = 0; i < sourceSize; i++) {     keys[i] = currentOffset;     keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);     if(keySize > bufferSize-currentOffset) {       *buffer = (uint8_t *)realloc(*buffer, bufferSize+increment);       if(buffer == NULL) {         *status = U_MEMORY_ALLOCATION_ERROR;         return 0;       }       bufferSize += increment;       keySize = ucol_getSortKey(coll, source[i], -1, *buffer+currentOffset, bufferSize-currentOffset);     }     /* here you can hook code that does something interesting with the keySize -      * remembers the maximum or similar...      */     if(keySize > *maxSize) {       *maxSize = keySize;     }     currentOffset += keySize;   }   return currentOffset; }


Copyright (c) 2000 - 2008 IBM and Others - PDF Version - Feedback: http://icu-project.org/contacts.html

User Guide for ICU v4.0 Generated 2008-06-02.