
Collation Examples
Simple Collation Sample Customization
The following program demonstrates how to compare strings and create sort keys with the default locale.
In C:
#include <stdio.h> #include <memory.h> #include <string.h> #include "unicode/ustring.h" #include "unicode/utypes.h" #include "unicode/uloc.h" #include "unicode/ucol.h" #define MAXBUFFERSIZE 100 #define BIGBUFFERSIZE 5000 UBool collateWithLocaleInC(const char* locale, UErrorCode *status) { UChar dispName [MAXBUFFERSIZE]; int32_t bufferLen = 0; UChar source [MAXBUFFERSIZE]; UChar target [MAXBUFFERSIZE]; UCollationResult result = UCOL_EQUAL; uint8_t sourceKeyArray [MAXBUFFERSIZE]; uint8_t targetKeyArray [MAXBUFFERSIZE]; int32_t sourceKeyOut = 0, targetKeyOut = 0; UCollator *myCollator = 0; if (U_FAILURE(*status)) { return FALSE; } u_uastrcpy(source, "This is a test."); u_uastrcpy(target, "THIS IS A TEST."); myCollator = ucol_open(locale, status); if (U_FAILURE(*status)){ bufferLen = uloc_getDisplayName(locale, 0, dispName, MAXBUFFERSIZE, status); /*Report the error with display name... */ fprintf(stderr, "Failed to create the collator for : \"%s\"\n", dispName); return FALSE; } result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); /* result is 1, secondary differences only for ignorable space characters*/ if (result != UCOL_LESS) { fprintf(stderr, "Comparing two strings with only secondary differences in C failed.\n"); return FALSE; } /* To compare them with just primary differences */ ucol_setStrength(myCollator, UCOL_PRIMARY); result = ucol_strcoll(myCollator, source, u_strlen(source), target, u_strlen(target)); /* result is 0 */ if (result != 0) { fprintf(stderr, "Comparing two strings with no differences in C failed.\n"); return FALSE; } |
In C++:
#include <stdio.h> #include "unicode/unistr.h" #include "unicode/utypes.h" #include "unicode/locid.h" #include "unicode/coll.h" #include "unicode/tblcoll.h" #include "unicode/coleitr.h" #include "unicode/sortkey.h" UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status) { UnicodeString dispName; UnicodeString source("This is a test."); UnicodeString target("THIS IS A TEST."); Collator::EComparisonResult result = Collator::EQUAL; CollationKey sourceKey; CollationKey targetKey; Collator *myCollator = 0; if (U_FAILURE(status)) { return FALSE; } myCollator = Collator::createInstance(locale, status); if (U_FAILURE(status)){ locale.getDisplayName(dispName); /*Report the error with display name... */ fprintf(stderr, "%s: Failed to create the collator for : \"%s\"\n", dispName); return FALSE; } result = myCollator->compare(source, target); /* result is 1, secondary differences only for ignorable space characters*/ if (result != UCOL_LESS) { fprintf(stderr, "Comparing two strings with only secondary differences in C failed.\n"); return FALSE; } /* To compare them with just primary differences */ myCollator->setStrength(Collator::PRIMARY); result = myCollator->compare(source, target); /* result is 0 */ if (result != 0) { fprintf(stderr, "Comparing two strings with no differences in C failed.\n"); return FALSE; } /* Now, do the same comparison with keys */ myCollator->getCollationKey(source, sourceKey, status); myCollator->getCollationKey(target, targetKey, status); result = Collator::EQUAL; |
Main Function:
/* The C example is declared with C linkage so that it can be called from this C++ file. */
extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);

/*
 * Entry point of the collation sample.
 *
 * Runs the C++ example (collateWithLocaleInCPP) and then the C example
 * (collateWithLocaleInC) for the en_US locale. For each one it prints a
 * success message to stdout or a failure message to stderr.
 *
 * Always returns 0, even when one of the examples reports failure.
 */
int main()
{
    UErrorCode status = U_ZERO_ERROR;

    fprintf(stdout, "\n");
    if (collateWithLocaleInCPP(Locale("en", "US"), status) != TRUE) {
        fprintf(stderr, "Collate with locale in C++ failed.\n");
    } else {
        fprintf(stdout, "Collate with Locale C++ example worked!!\n");
    }

    /* Reset the error code so a failure in the C++ example does not make the C example bail out early. */
    status = U_ZERO_ERROR;

    fprintf(stdout, "\n");
    if (collateWithLocaleInC("en_US", &status) != TRUE) {
        /* No "%s" conversion here: the message has no argument to format. */
        fprintf(stderr, "Collate with locale in C failed.\n");
    } else {
        fprintf(stdout, "Collate with Locale C example worked!!\n");
    }
    return 0;
}
In Java:
import com.ibm.icu.text.Collator; import com.ibm.icu.text.CollationElementIterator; import com.ibm.icu.text.CollationKey; import java.util.Locale; public class CollateExample { public static void main(String arg[]) { CollateExample example = new CollateExample(); try { if (!example.collateWithLocale(Locale.US)) { System.err.println("Collate with locale example failed."); } else { System.out.println("Collate with Locale example worked!!"); } } catch (Exception e) { System.err.println("Collating with locale failed"); e.printStackTrace(); } } public boolean collateWithLocale(Locale locale) throws Exception { String source = "This is a test."; String target = "THIS IS A TEST."; Collator myCollator = Collator.getInstance(locale); |
Language-sensitive searching
String searching is a well-researched area, and there are algorithms that can optimize the searching process. Perhaps the best is the Boyer-Moore method. For a full description of the concepts behind the sample programs, please see Laura Werner's text searching article (http://icu-project.org/docs/papers/efficient_text_searching_in_java.html ).
The source code for language-sensitive text searching based on the ICU Collation Service can be found on the Internet at http://source.icu-project.org/repos/icu/icu/trunk/source/i18n/usearch.cpp .
Using large buffers to manage sort keys
A good solution for the problem of not knowing the sort key size in advance is to allocate a large buffer and store all the sort keys there, while keeping a list of indexes or pointers to that buffer.
The following sample code takes a pointer to an array of UChar pointers and an array of key indexes. It allocates and fills a buffer with sort keys and returns the maximum size of a sort key. Once you have done this for your strings, you just need to allocate fields of the maximum size and copy your sort keys from the buffer into the fields.
uint32_t |
Copyright (c) 2000 - 2008 IBM and Others - PDF Version - Feedback: http://icu-project.org/contacts.html
User Guide for ICU v4.0 Generated 2008-06-02.