ICU-1851 CODAN implementation, take 1

X-SVN-Rev: 12686
2003-07-25 05:31:54 +00:00 · 2003-07-25 05:31:54 +00:00 · 29de7f8abe
commit 29de7f8abe
parent e371874f36
5 changed files with 623 additions and 86 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -2657,7 +2657,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
          }
          // Now we have the character that needs to be decomposed
          // if the normalizing buffer was not used, we can just use our structure and be happy.
-          if(source->flags & UCOL_ITER_INNORMBUF == 0) {
+          if((source->flags & UCOL_ITER_INNORMBUF) == 0) {
            // decompose into writable buffer
            int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
            if(decompLen < 0) {
@ -3027,6 +3027,221 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
      }
      return CE;
      }
+    case DIGIT_TAG:      
+      {
+      /* 
+      	 We do a check to see if we want to collate digits as numbers; if so we generate
+         a custom collation key. Otherwise we pull out the value stored in the expansion table.
+      */
+      uint32_t size;
+      uint32_t i;    /* general counter */
+      collIterateState digitState;
+
+      if (coll->numericCollation == UCOL_ON){
+		UChar32 char32 = 0;		
+		
+		uint32_t digIndx = 0; 
+		uint32_t endIndex = 0;		
+		uint32_t trailingZeroIndex = 0;
+
+		uint32_t primWeight = 0;
+		
+		uint32_t digVal = 0;		
+		uint8_t	collateVal = 0;
+		
+		UBool nonZeroValReached = false;
+
+		uint8_t *numTempBuf;
+		uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
+		uint32_t numTempBufSize = UCOL_MAX_BUFFER;
+		
+		numTempBuf = stackNumTempBuf;
+		/*
+			 We parse the source string until we hit a char that's NOT a digit.
+      		Use this u_charDigitValue. This might be slow because we have to 
+      		handle surrogates...
+      	*/
+/*      	
+      	if (U16_IS_LEAD(ch)){
+          if (!collIter_eos(source)) {
+            backupState(source, &digitState);
+            UChar trail = getNextNormalizedChar(source);
+            if(U16_IS_TRAIL(trail)) {
+			  char32 = U16_GET_SUPPLEMENTARY(ch, trail);
+            } else {
+              loadState(source, &digitState, TRUE);
+              char32 = ch;
+            }
+          } else {
+		    char32 = ch;
+          }
+        } else {
+		  char32 = ch;
+        }
+		digVal = u_charDigitValue(char32);
+*/
+        digVal = u_charDigitValue(cp); // if we have arrived here, we have
+        // already processed possible supplementaries that triggered the digit tag -
+        // all supplementaries are marked in the UCA.
+		/* 
+			We  pad a zero in front of the first element anyways. This takes
+			care of the (probably) most common case where people are sorting things followed
+		 	by a single digit
+		*/
+		digIndx++;
+      	for(;;){
+      	// Make sure we have enough space: this pass may store to numTempBuf[(digIndx/2) + 2].
+      	if (digIndx >= ((numTempBufSize - 2) * 2))
+      	{
+      		numTempBufSize *= 2; // NOTE(review): the allocations below are still not checked for NULL
+      		if (numTempBuf == stackNumTempBuf){ // first growth: move off the stack buffer
+      			numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
+      			memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
+      		}else // realloc may move the block, so its return value has to be kept
+      			numTempBuf = (uint8_t *)realloc(numTempBuf, numTempBufSize);
+      	}
+      	
+			// Skipping over leading zeroes.      	
+      		if (digVal != 0 || nonZeroValReached){
+				if (digVal != 0 && !nonZeroValReached)
+					nonZeroValReached = true;
+				
+				/*
+					We parse the digit string into base 100 numbers (this fits into a byte).
+				 	We only add to the buffer in twos, thus if we are parsing an odd character,
+					that serves as the 'tens' digit, while if we are parsing an even one, that 
+				 	is the 'ones' digit. We dump the parsed base 100 value (collateVal) into 
+				 	a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
+				 	overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
+				 	than all the other bytes. 
+				 */
+
+				if (digIndx % 2 == 1){
+					collateVal += (uint8_t)digVal;	
+					 
+					 // This removes trailing zeroes.
+					if (collateVal == 0 && !trailingZeroIndex)
+						trailingZeroIndex = ((digIndx-1)/2) + 2;
+					else if (trailingZeroIndex)
+						trailingZeroIndex = 0;
+						
+					numTempBuf[((digIndx-1)/2) + 2] = collateVal*2 + 6;
+					collateVal = 0;
+				}
+				else{
+					// We drop the collation value into the buffer so if we need to do
+					// a "front patch" we don't have to check to see if we're hitting the
+					// last element.
+					collateVal = (uint8_t)(digVal * 10);
+					numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
+				}
+				digIndx++;
+      		}
+      		
+      		// Get next character.
+      		if (!collIter_eos(source)){
+				ch = getNextNormalizedChar(source);
+				if (U16_IS_LEAD(ch)){
+                  if (!collIter_eos(source)) {
+                    backupState(source, &digitState);
+                    UChar trail = getNextNormalizedChar(source);
+                    if(U16_IS_TRAIL(trail)) {
+					  char32 = U16_GET_SUPPLEMENTARY(ch, trail);
+                    } else {
+                      loadState(source, &digitState, TRUE);
+                      char32 = ch;
+                    }
+                  }
+                } else {
+				  char32 = ch;
+                }
+					
+				if ((digVal = u_charDigitValue(char32)) == -1){
+					// Resetting position to point to the next unprocessed char. We
+					// overshot it when doing our test/set for numbers.
+                  if (char32 > 0xFFFF) { // For surrogates.
+                    loadState(source, &digitState, TRUE);
+					//goBackOne(source);
+                  }
+  				  goBackOne(source);
+				  break;
+				}
+            } else {
+			  break;
+            }
+		}
+		
+		if (nonZeroValReached == false){
+			digIndx = 2;
+			numTempBuf[2] = 6;
+		}
+		
+		endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;				
+		if (digIndx % 2 != 0){
+			/* 
+				We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
+				we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 
+				Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
+				single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
+			*/
+
+			for(i = 2; i < endIndex; i++){
+				numTempBuf[i] = 	(((((numTempBuf[i] - 6)/2) % 10) * 10) + 
+									(((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
+			}
+			--digIndx;
+		}
+		
+		// Subtract one off of the last byte. 
+		numTempBuf[endIndex-1] -= 1;			
+				
+		/* 
+			We want to skip over the first two slots in the buffer. The first slot
+			is reserved for the header byte 0x1B. The second slot is for the 
+			sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
+		*/ 
+		numTempBuf[0] = 0x1B;
+		numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
+		
+		// Now transfer the collation key to our collIterate struct.
+		// The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
+		  size = ((endIndex+1) & ~1)/2;
+		  CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
+		  		(UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
+		  		UCOL_BYTE_COMMON; // Tertiary weight.
+		  i = 2; // Reset the index into the buffer.
+		  while(i < endIndex)
+		  {
+			primWeight = numTempBuf[i++] << 8;
+			if ( i < endIndex)
+				primWeight |= numTempBuf[i++];
+			*(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
+		  }
+		  
+		  if (numTempBuf != stackNumTempBuf)
+		  	free(numTempBuf);
+      } else {
+        // no numeric mode, we'll just switch to whatever we stashed and continue
+		  CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
+		  CE = *CEOffset++;
+          break;
+#if 0
+		  CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
+		  size = getExpansionCount(CE);
+		  CE = *CEOffset++;
+		  if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
+			for(i = 1; i<size; i++) {
+			  *(source->CEpos++) = *CEOffset++;
+			}
+		  } else { /* else, we do */
+			while(*CEOffset != 0) {
+			  *(source->CEpos++) = *CEOffset++;
+			}
+		  }
+#endif
+	  }
+      return CE;
+      }
    /* various implicits optimization */
    // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
    case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
@ -3492,6 +3707,217 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
        source->CEpos = source->CEs;
      }
      return *(source->toReturn);
+     case DIGIT_TAG:      
+      {
+      /* 
+      	 We do a check to see if we want to collate digits as numbers; if so we generate
+         a custom collation key. Otherwise we pull out the value stored in the expansion table.
+      */
+      //uint32_t size;
+      uint32_t i;    /* general counter */
+      collIterateState state;
+
+      if (coll->numericCollation == UCOL_ON){
+		UChar32 char32 = 0;		
+		
+		uint32_t digIndx = 0; 
+		uint32_t endIndex = 0;		
+		uint32_t leadingZeroIndex = 0;
+
+		uint32_t primWeight = 0;
+		
+		uint32_t digVal = 0;		
+		uint8_t	collateVal = 0;
+		
+		UBool nonZeroValReached = false;
+
+		uint8_t *numTempBuf;
+		uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
+		uint32_t numTempBufSize = UCOL_MAX_BUFFER;
+		
+		numTempBuf = stackNumTempBuf;
+		/*
+			 We parse the source string until we hit a char that's NOT a digit.
+      		Use this u_charDigitValue. This might be slow because we have to 
+      		handle surrogates...
+      	*/
+      	
+      	if (U16_IS_TRAIL (ch)){
+      		if (!collIter_bos(source)){
+              UChar lead = getPrevNormalizedChar(source);
+              if(U16_IS_LEAD(lead)) {
+				char32 = U16_GET_SUPPLEMENTARY(lead,ch);
+				goBackOne(source);
+              } else {
+                char32 = ch;
+              }
+            } else {
+				char32 = ch;
+            }
+        } else {
+			char32 = ch;
+        }
+		digVal = u_charDigitValue(char32);
+		
+      	for(;;){
+      	// Make sure we have enough space: the pending digit may end up in numTempBuf[(digIndx/2) + 2].
+      	if (digIndx >= ((numTempBufSize - 2) * 2))
+      	{
+      		numTempBufSize *= 2; // NOTE(review): the allocations below are still not checked for NULL
+      		if (numTempBuf == stackNumTempBuf){ // first growth: move off the stack buffer
+      			numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
+      			memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
+      		}else // realloc may move the block, so its return value has to be kept
+      			numTempBuf = (uint8_t *)realloc(numTempBuf, numTempBufSize);
+      	}
+      	
+			// Skipping over "trailing" zeroes but we still add to digIndx.
+      		if (digVal != 0 || nonZeroValReached){
+				if (digVal != 0 && !nonZeroValReached)
+					nonZeroValReached = true;
+				
+				/*
+					We parse the digit string into base 100 numbers (this fits into a byte).
+				 	We only add to the buffer in twos, thus if we are parsing an odd character,
+					that serves as the 'tens' digit, while if we are parsing an even one, that 
+				 	is the 'ones' digit. We dump the parsed base 100 value (collateVal) into 
+				 	a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
+				 	overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
+				 	than all the other bytes. 
+				 	
+				 	Since we're doing in this reverse we want to put the first digit encountered into the
+				 	ones place and the second digit encountered into the tens place.
+				 */
+				
+				if (digIndx % 2 == 1){
+					collateVal += (uint8_t)(digVal * 10);
+					
+					 // This removes leading zeroes.
+					if (collateVal == 0 && !leadingZeroIndex)
+						leadingZeroIndex = ((digIndx-1)/2) + 2;
+					else if (leadingZeroIndex)
+						leadingZeroIndex = 0;
+											
+					numTempBuf[((digIndx-1)/2) + 2] = collateVal*2 + 6;
+					collateVal = 0;
+				}
+				else{
+					collateVal = (uint8_t)digVal;	
+				}
+      		}
+      		digIndx++;
+      		
+      		if (!collIter_bos(source)){
+				ch = getPrevNormalizedChar(source);
+				//goBackOne(source);
+				if (U16_IS_TRAIL(ch)){
+                    backupState(source, &state);
+					if (!collIter_bos(source))
+					{
+						goBackOne(source);
+                        UChar lead = getPrevNormalizedChar(source);
+                        if(U16_IS_LEAD(lead)) {
+						  char32 = U16_GET_SUPPLEMENTARY(lead,ch);
+                        } else {
+                          loadState(source, &state, FALSE);
+                          char32 = ch;
+                        }
+					}
+				}
+				else
+					char32 = ch;
+					
+				if ((digVal = u_charDigitValue(char32)) == -1){
+                  if (char32 > 0xFFFF) {// For surrogates.
+                    loadState(source, &state, FALSE);
+                  }
+					// Don't need to "reverse" the goBackOne call,
+					// as this points to the next position to process..
+					//if (char32 > 0xFFFF) // For surrogates.
+						//getNextNormalizedChar(source);
+					break;
+				}
+                goBackOne(source);
+			}else
+				break;
+		}
+
+		if (nonZeroValReached == false){
+			digIndx = 2;
+			numTempBuf[2] = 6;
+		}
+
+		if (digIndx % 2 != 0){
+				numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
+				digIndx += 1;				
+		}
+		
+		endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;				
+		
+		// Subtract one off of the last byte. Really the first byte here, but it's reversed...
+		numTempBuf[2] -= 1;			
+				
+		/* 
+			We want to skip over the first two slots in the buffer. The first slot
+			is reserved for the header byte 0x1B. The second slot is for the 
+			sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
+		*/ 
+		numTempBuf[0] = 0x1B;
+		numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
+		
+		// Now transfer the collation key to our collIterate struct.
+		// The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
+		//size = ((endIndex+1) & ~1)/2;
+		  *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
+		  		(UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
+		  		UCOL_BYTE_COMMON; // Tertiary weight.
+		  i = endIndex - 1; // Reset the index into the buffer.
+		  while(i >= 2)
+		  {
+			primWeight = numTempBuf[i--] << 8;
+			if ( i >= 2)
+				primWeight |= numTempBuf[i--];
+			*(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
+		  }
+		  if (numTempBuf != stackNumTempBuf)
+		  	free(numTempBuf);
+		  	
+		  source->toReturn = source->CEpos -1;
+		  return *(source->toReturn);
+      }
+      else {
+		  CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
+          CE = *(CEOffset++);
+          break;
+#if 0
+		/* find the offset to expansion table */
+		  CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
+		  size     = getExpansionCount(CE);
+		  if (size != 0) {
+			/*
+			if there are less than 16 elements in expansion, we don't terminate
+			*/
+			uint32_t count;
+			for (count = 0; count < size; count++) {
+			  *(source->CEpos ++) = *CEOffset++;
+			}
+		  }
+		  else {
+			/* else, we do */
+			while (*CEOffset != 0) {
+			  *(source->CEpos ++) = *CEOffset ++;
+			}
+		  }
+		  source->toReturn = source->CEpos - 1;
+          // in case of one element expansion, we 
+          // want to immediately return CEpos
+          if(source->toReturn == source->CEs) {
+            source->CEpos = source->CEs;
+          }
+		  return *(source->toReturn);
+#endif
+	  }
+      }  
    case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
      {
        const uint32_t
@ -6485,6 +6911,20 @@ ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
    UColAttributeValue oldFrench = coll->frenchCollation;
    UColAttributeValue oldCaseFirst = coll->caseFirst;
    switch(attr) {
+    case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
+      if(value == UCOL_ON) {
+        coll->numericCollation = UCOL_ON;
+        coll->numericCollationisDefault = FALSE;
+      } else if (value == UCOL_OFF) {
+        coll->numericCollation = UCOL_OFF;
+        coll->numericCollationisDefault = FALSE;
+      } else if (value == UCOL_DEFAULT) {
+        coll->numericCollationisDefault = TRUE;
+        coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
+      } else {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+      }
+      break;
    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
      if(value == UCOL_ON) {
        coll->hiraganaQ = UCOL_ON;
@ -6602,6 +7042,8 @@ ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
      return UCOL_DEFAULT;
    }
    switch(attr) {
+    case UCOL_NUMERIC_COLLATION:
+      return coll->numericCollation;    
    case UCOL_HIRAGANA_QUATERNARY_MODE:
      return coll->hiraganaQ;
    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
@ -8399,3 +8841,4 @@ returnResult:

 }
 #endif /* #if !UCONFIG_NO_COLLATION */
+
--- a/icu4c/source/i18n/ucol_elm.cpp
+++ b/icu4c/source/i18n/ucol_elm.cpp
@ -470,7 +470,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
      start = mid;                                                           
    }                                                                        
  } 
-      
+
  if (*start == endexpansion) {                                                     
    result = start - pendexpansionce;  
  }                                                                          
@ -478,7 +478,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
    if (*limit == endexpansion) {                                                     
      result = limit - pendexpansionce;      
    }                                            
-      
+
  if (result > -1) {
    /* found the ce in expansion, we'll just modify the size if it is 
       smaller */
@ -494,7 +494,7 @@ int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
    int      shiftsize     = (pendexpansionce + pos) - start;
    uint32_t *shiftpos     = start + 1;
    uint8_t  *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
-    
+
    /* okay need to rearrange the array into sorted order */
    if (shiftsize == 0 || *(pendexpansionce + pos) < endexpansion) {
      *(pendexpansionce + pos + 1) = endexpansion;
@ -631,7 +631,7 @@ int uprv_uca_setMaxJamoExpansion(UChar                  ch,
  *(pendexpansionce + maxexpansion->position) = endexpansion;
  *(maxexpansion->isV + maxexpansion->position) = isV;
  maxexpansion->position ++;
-  
+
  return maxexpansion->position;
 }

@ -1012,9 +1012,12 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
  if(U_FAILURE(*status)) {
      return 0xFFFF;
  }
+
+  element->mapCE = 0; // clear mapCE so that we can catch expansions
+
  if(element->noOfCEs == 1) {
    if(element->isThai == FALSE) {
-      element->mapCE = element->CEs[0];
+		  element->mapCE = element->CEs[0];      
    } else { /* add thai - totally bad here */
      expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT) 
        | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 
@ -1048,10 +1051,10 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
        | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
        | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
    } else {
-      expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 
-        | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
-        & 0xFFFFF0);
-
+	  expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 
+		| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
+		& 0xFFFFF0);
+		
      for(i = 1; i<element->noOfCEs; i++) {
        uprv_uca_addExpansion(expansions, element->CEs[i], status);
      }
@ -1076,6 +1079,32 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
    }
  }

+  // We treat digits differently - they are "uber special" and should be
+  // processed differently if numeric collation is on. 
+  UChar32 uniChar = 0;
+  //printElement(element);
+  if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
+	  uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);	  
+  } else if (element->cSize == 1){
+	  uniChar = element->uchars[0];
+  }
+
+  // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
+  // one element to the expansion buffer. When we encounter a digit and we don't 
+  // do numeric collation, we will just pick the CE we have and break out of case
+  // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
+  // a special, further processing will occur. If it's a simple CE, we'll return due
+  // to how the loop is constructed.
+  if (uniChar != 0 && u_isdigit(uniChar)){
+	  expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element
+      if(element->mapCE) { // if there is an expansion, we'll pick it here
+        expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4);
+      } else {
+	    expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4);
+      }
+	  element->mapCE = expansion;
+  }
+
  // here we want to add the prefix structure.
  // I will try to process it as a reverse contraction, if possible.
  // prefix buffer is already reversed.
@ -1157,7 +1186,7 @@ void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping,
  const uint32_t TBASE  = 0x11A8;
  const uint32_t VCOUNT = 21;
  const uint32_t TCOUNT = 28;
-  
+
  uint32_t v = VBASE + VCOUNT - 1;
  uint32_t t = TBASE + TCOUNT - 1;
  uint32_t ce;
@ -1556,3 +1585,5 @@ uprv_uca_canonicalClosure(tempUCATable *t, UErrorCode *status)
 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_COLLATION */
+
+
--- a/icu4c/source/i18n/ucol_imp.h
+++ b/icu4c/source/i18n/ucol_imp.h
@ -635,6 +635,9 @@ typedef enum {
    LONG_PRIMARY_TAG = 12,   /* This is a three byte primary with starting secondaries and tertiaries */
                             /* It fits in a single 32 bit CE and is used instead of expansion to save */
                             /* space without affecting the performance (hopefully) */
+                             
+	DIGIT_TAG = 13,			/* COllate Digits As Numbers (CODAN) implementation */
+	
    CE_TAGS_COUNT
 } UColCETags;

@ -653,7 +656,8 @@ typedef struct {
      /*UColAttributeValue*/ int32_t caseLevel;         /* do we have an extra case level */
      /*UColAttributeValue*/ int32_t normalizationMode; /* attribute for normalization */
      /*UColAttributeValue*/ int32_t strength;          /* attribute for strength */
-      /*UColAttributeValue*/ int32_t hiraganaQ;         /* attribuge for special Hiragana */
+      /*UColAttributeValue*/ int32_t hiraganaQ;         /* attribute for special Hiragana */
+      /*UColAttributeValue*/ int32_t numericCollation;
      uint8_t reserved[64];                 /* for future use */
 } UColOptionSet;

@ -862,6 +866,7 @@ struct UCollator {
    UColAttributeValue normalizationMode; /* attribute for normalization */
    UColAttributeValue strength;          /* attribute for strength */
    UColAttributeValue hiraganaQ;         /* attribute for Hiragana */
+    UColAttributeValue numericCollation;
    UBool variableTopValueisDefault;
    UBool frenchCollationisDefault;
    UBool alternateHandlingisDefault; /* attribute for handling variable elements*/
@ -870,6 +875,7 @@ struct UCollator {
    UBool normalizationModeisDefault; /* attribute for normalization */
    UBool strengthisDefault;          /* attribute for strength */
    UBool hiraganaQisDefault;         /* attribute for Hiragana */
+    UBool numericCollationisDefault;
    UBool hasRealData;                /* some collators have only options, like French, no rules */
                                      /* to speed up things, we use the UCA image, but we don't want it */
                                      /* to run around */
@ -980,4 +986,3 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
 #endif /* #if !UCONFIG_NO_COLLATION */

 #endif
-
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@ -212,13 +212,16 @@ void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U
  case UCOL_STRENGTH:
    opts->strength = value;
    break;
+  case UCOL_NUMERIC_COLLATION:
+  	opts->numericCollation = value;
+  	break;
  case UCOL_ATTRIBUTE_COUNT:
  default:
    break;
  }
 }

-#define UTOK_OPTION_COUNT 19
+#define UTOK_OPTION_COUNT 20

 static UBool didInit = FALSE;
 /* we can be strict, or we can be lenient */
@ -265,6 +268,7 @@ U_STRING_DECL(option_15,    "first",          5);
 U_STRING_DECL(option_16,    "last",           4);
 U_STRING_DECL(option_17,    "optimize",       8);
 U_STRING_DECL(option_18,    "suppressContractions",         20);
+U_STRING_DECL(option_19,    "numericOrdering",				15);


 /*
@ -320,6 +324,30 @@ static const ucolTokSuboption firstLastSub[7] = {
  {suboption_17, 8, UCOL_PRIMARY},
 };

+enum OptionNumber { /* positions of the rule options in rulesOptions[]; both must stay in the same order */
+  OPTION_ALTERNATE_HANDLING = 0, /* "alternate" */
+    OPTION_FRENCH_COLLATION, /* "backwards" */
+    OPTION_CASE_LEVEL,
+    OPTION_CASE_FIRST,
+    OPTION_NORMALIZATION_MODE, /* "normalization" */
+    OPTION_HIRAGANA_QUATERNARY, /* "hiraganaQ" */
+    OPTION_STRENGTH, /* "strength" */
+    OPTION_NUMERIC_COLLATION, /* "numericOrdering" */
+    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION, /* alias of the last option stored through ucol_uprv_tok_setOptionInImage; does not advance the numbering */
+    OPTION_VARIABLE_TOP, /* "variable top" */
+    OPTION_REARRANGE, /* "rearrange" */
+    OPTION_BEFORE, /* "before" */
+    OPTION_TOP, /* "top" */
+    OPTION_FIRST, /* "first" */
+    OPTION_LAST, /* "last"; must directly follow OPTION_FIRST (indirectIndex is computed from i-OPTION_FIRST) */
+    OPTION_OPTIMIZE, /* "optimize" (copy parts of UCA to the tailoring) */
+    OPTION_SUPPRESS_CONTRACTIONS, /* "suppressContractions" */
+    OPTION_UNDEFINED, /* "undefined"; this and the options below hit the default case (U_UNSUPPORTED_ERROR) in ucol_uprv_tok_readAndSetOption */
+    OPTION_SCRIPT_ORDER, /* "scriptOrder" */
+    OPTION_CHARSET_NAME, /* "charsetname" */
+    OPTION_CHARSET /* "charset" */
+} ;
+
 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
 /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
 /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
@ -328,18 +356,19 @@ static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
- /*07*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
- /*08*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
- /*09*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
- /*10*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
- /*11*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
- /*12*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
- /*13*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
- /*14*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
- /*15*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
- /*16*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
- /*17*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
- /*18*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
+ /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/ 
+ /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
+ /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
+ /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
+ /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
+ /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
+ /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
+ /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
+ /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
+ /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
+ /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
+ /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
+ /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
 };

 static
@ -406,6 +435,7 @@ void ucol_uprv_tok_initData() {
    U_STRING_INIT(option_16, "last",           4);
    U_STRING_INIT(option_17, "optimize",       8);
    U_STRING_INIT(option_18, "suppressContractions",         20);
+	U_STRING_INIT(option_19, "numericOrdering",      15);
    didInit = TRUE;
  }
 }
@ -558,65 +588,85 @@ uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status)

  if(i < 0) {
    *status = U_ILLEGAL_ARGUMENT_ERROR;
-  } else if(i<7) {
-    if(optionArg) {
-      for(j = 0; j<rulesOptions[i].subSize; j++) {
-        if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
-          ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
-          result =  UCOL_TOK_SUCCESS;
-        }
-      }
-    } 
-    if(result == 0) {
-      *status = U_ILLEGAL_ARGUMENT_ERROR;
-    }
-  } else if(i == 7) { /* variable top */
-    result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
-  } else if(i == 8) {  /*rearange */
-    result = UCOL_TOK_SUCCESS;
-  } else if(i == 9) {  /*before*/
-    if(optionArg) {
-      for(j = 0; j<rulesOptions[i].subSize; j++) {
-        if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
-        result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
-        }
-      }
-    }
-    if(result == 0) {
-      *status = U_ILLEGAL_ARGUMENT_ERROR;
-    }
-    
-  } else if(i == 10) {  /*top */ /* we are going to have an array with structures of limit CEs */
-    /* index to this array will be src->parsedToken.indirectIndex*/
-    src->parsedToken.indirectIndex = 0;
-    result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
-  } else if(i == 11 || i ==12) { /* first, last */
-    for(j = 0; j<rulesOptions[i].subSize; j++) {
-      if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
-        src->parsedToken.indirectIndex = (uint16_t)(i-10+j*2);         
-        result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
-      }
-    }
-    if(result == 0) {
-      *status = U_ILLEGAL_ARGUMENT_ERROR;
-    }
-  } else if(i == 13 || i == 14) { // copy and remove are handled before normalization
-    // we need to move end here
-    int32_t noOpenBraces = 1;
-    src->current++; // skip opening brace
-    while(src->current < src->end && noOpenBraces != 0) {
-      if(*src->current == 0x005b) {
-        noOpenBraces++;
-      } else if(*src->current == 0x005D) { // closing brace
-        noOpenBraces--;
-      }
-      src->current++;
-    }
-    result = UCOL_TOK_SUCCESS;
  } else {
-    *status = U_UNSUPPORTED_ERROR;
+    int32_t noOpenBraces = 1;
+    switch(i) {
+    case OPTION_ALTERNATE_HANDLING:
+    case OPTION_FRENCH_COLLATION:
+    case OPTION_CASE_LEVEL:
+    case OPTION_CASE_FIRST:
+    case OPTION_NORMALIZATION_MODE:
+    case OPTION_HIRAGANA_QUATERNARY:
+    case OPTION_STRENGTH:
+    case OPTION_NUMERIC_COLLATION:
+      if(optionArg) {
+        for(j = 0; j<rulesOptions[i].subSize; j++) {
+          if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
+            ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
+            result =  UCOL_TOK_SUCCESS;
+          }
+        }
+      } 
+      if(result == 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+      }
+      break;
+    case OPTION_VARIABLE_TOP:
+      result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
+      break;
+    case OPTION_REARRANGE:
+      result = UCOL_TOK_SUCCESS;
+      break;
+    case OPTION_BEFORE:
+      if(optionArg) {
+        for(j = 0; j<rulesOptions[i].subSize; j++) {
+          if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
+          result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
+          }
+        }
+      }
+      if(result == 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+      }
+      break;
+    case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
+      /* index to this array will be src->parsedToken.indirectIndex*/
+      src->parsedToken.indirectIndex = 0;
+      result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
+      break;
+    case OPTION_FIRST:
+    case OPTION_LAST: /* first, last */
+      for(j = 0; j<rulesOptions[i].subSize; j++) {
+        if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
+          // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
+          // element of indirect boundaries is reserved for top.
+          src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
+          result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
+        }
+      }
+      if(result == 0) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+      }
+      break;
+    case OPTION_OPTIMIZE:
+    case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
+      // we need to move end here
+      src->current++; // skip opening brace
+      while(src->current < src->end && noOpenBraces != 0) {
+        if(*src->current == 0x005b) {
+          noOpenBraces++;
+        } else if(*src->current == 0x005D) { // closing brace
+          noOpenBraces--;
+        }
+        src->current++;
+      }
+      result = UCOL_TOK_SUCCESS;
+      break;
+    default:
+      *status = U_UNSUPPORTED_ERROR;
+      break;
+    }
  }
-  //src->current = u_strchr(src->current, 0x005d /*']'*/);
  src->current = u_memchr(src->current, 0x005d, src->end-src->current);
  return result;
 }
@ -1546,7 +1596,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
      // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
      //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
      optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
-      if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
+      if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
        USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
        if(U_SUCCESS(*status)) {
          if(src->copySet == NULL) {
@ -1558,7 +1608,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
        } else {
          return;
        }
-      } else if(optionNumber == 14) {
+      } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
        USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
        if(U_SUCCESS(*status)) {
          if(src->removeSet == NULL) {
@ -1698,3 +1748,4 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
 }

 #endif /* #if !UCONFIG_NO_COLLATION */
+
--- a/icu4c/source/i18n/unicode/ucol.h
+++ b/icu4c/source/i18n/unicode/ucol.h
@ -228,7 +228,13 @@ typedef enum {
      * non-ignorables on quaternary level
      * This is a sneaky way to produce JIS
      * sort order */     
-     UCOL_HIRAGANA_QUATERNARY_MODE, 
+     UCOL_HIRAGANA_QUATERNARY_MODE,
+     /** when turned on, this attribute 
+      * generates a collation key
+      * for the numeric value of substrings
+      * of digits. This is a way to get '100' 
+      * to sort AFTER '2'.*/          
+     UCOL_NUMERIC_COLLATION, 
     UCOL_ATTRIBUTE_COUNT
 } UColAttribute;

@ -793,3 +799,4 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status);
 #endif /* #if !UCONFIG_NO_COLLATION */

 #endif
+