ICU-861

Updated the backwards collation element iterator code. X-SVN-Rev: 3760
2001-02-23 23:36:42 +00:00 · 2001-02-23 23:36:42 +00:00 · ec4c07eeb0
commit ec4c07eeb0
parent 6d5b35e584
3 changed files with 123 additions and 147 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@@ -7,7 +7,6 @@
 * Date        Name      Comments
 * 02/16/2001  synwee    Added internal method getPrevSpecialCE 
 */
-
 #include "ucolimp.h"
 #include "ucoltok.h"

@@ -1068,12 +1067,13 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
          T += TBase;

          // return the first CE, but first put the rest into the expansion buffer
-
          if (!collationSource->JamoSpecial) { // FAST PATH
+            
            *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V);
            if (T != TBase) {
                *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
            }
+            
            return ucmp32_get(UCA->mapping, L); // return first one

          } else { // Jamo is Special
@@ -1103,6 +1103,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
          }
          /* This is a code point minus 0x10000, that's what algorithm requires */
          order = 0xE0010303 | (cp & 0xFFE00) << 8;
+
          *(collationSource->CEpos++) = 0x80200080 | (cp & 0x001FF) << 22;
        } else {
          return 0; /* completely ignorable */
@@ -1144,7 +1145,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
    We have to check if ch is possibly a first surrogate - then we need to 
    take the next code unit and make a bigger CE 
    */
-    UChar nextChar;
+    UChar prevChar;
    const int 
      SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
      LCount = 19, VCount = 21, TCount = 28,
@@ -1184,11 +1185,13 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
      */
      if (!collationSource->JamoSpecial) 
      { 
+        *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L); 
        *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
        if (T != TBase)
-          *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
-        /* return first one */
-        return ucmp32_get(UCA->mapping, L); 
+          *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
+        
+        collationSource->toReturn = collationSource->CEpos - 1;
+        return *(collationSource->toReturn);
      } else { 
        /* 
        Jamo is Special
@@ -1213,28 +1216,23 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,

    if (UTF_IS_SECOND_SURROGATE(ch)) 
    {
-      if ((collationSource->len - collationSource->pos != length) &&
-                  (UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos))) 
+      UChar *temp = collationSource->pos;
+      if (((collationSource->string < temp) ||
+          (collationSource->writableBuffer < temp)) &&
+          (UTF_IS_FIRST_SURROGATE(prevChar = *(collationSource->pos - 1)))) 
      {
-        uint32_t cp = ((ch << 10UL) + nextChar - ((0xd800 << 10UL) + 0xdc00));
-        if (collationSource->pos != collationSource->writableBuffer)
-          collationSource->pos --;
-        else
-        {
-          collationSource->pos = collationSource->string + 
-           (length - (collationSource->len - collationSource->writableBuffer));
-          collationSource->len = collationSource->string + length;
-          collationSource->isThai = TRUE;
-        }
+        uint32_t cp = ((prevChar << 10UL) + ch - ((0xd800 << 10UL) + 0xdc00));
+        collationSource->pos --;
        if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00))
          return 0;  /* illegal code value, use completely ignoreable! */
        
        /* 
        This is a code point minus 0x10000, that's what algorithm requires 
        */
-        order = 0xE0010303 | (cp & 0xFFE00) << 8;
-        *(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22;
-        collationSource->toReturn ++;
+        *(collationSource->CEpos ++) = 0xE0010303 | (cp & 0xFFE00) << 8;
+        order = 0x80200080 | (cp & 0x001FF) << 22;
+        collationSource->toReturn = collationSource->CEpos;
+        *(collationSource->CEpos ++) = order;
      } 
      else
        return 0; /* completely ignorable */
@@ -1246,9 +1244,11 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
        return 0; /* completely ignorable */
      
      /* Make up an artifical CE from code point as per UCA */
-      order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11;
-      *(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27;
-      collationSource->toReturn ++;
+      *(collationSource->CEpos ++) = 0xD08003C3 | (ch & 0xF000) << 12 | 
+                                     (ch & 0x0FE0) << 11;
+      collationSource->toReturn = collationSource->CEpos;
+      order = 0x04000080 | (ch & 0x001F) << 27;
+      *(collationSource->CEpos ++) = order;
    }
  }
  return order; /* return the CE */
@@ -1397,12 +1397,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
      if (source->isThai == TRUE) 
      { /* if we encountered Thai prevowel & the string is not yet touched */
        source->isThai = FALSE;
-        /*
-        sigh... to cater for getNextCE, we'll have to modify and store the 
-        whole string instead of a substring as in getSpecialCE
-        */
-        UCharOffset = source->pos;
-        strend =  source->len;
+        strend =  source->pos;
        size = strend - source->string;
        if (size > UCOL_WRITABLE_BUFFER_SIZE) 
        {
@@ -1417,22 +1412,21 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
        } 
        UChar *sourceCopy = source->string;
        UChar *targetCopy = source->writableBuffer;
-        while (sourceCopy < strend)
+        while (sourceCopy <= strend)
        {
 	        if (UCOL_ISTHAIPREVOWEL(*sourceCopy) &&      
            /* This is the combination that needs to be swapped */
 		        UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1))) 
          {
-		        *(targetCopy) = *(sourceCopy + count + 1);
-		        *(targetCopy+1) = *(sourceCopy + count);
-		        targetCopy+=2;
-		        sourceCopy+=2;
+		        *(targetCopy)     = *(sourceCopy + 1);
+		        *(targetCopy + 1) = *(sourceCopy);
+		        targetCopy += 2;
+		        sourceCopy += 2;
 	        } 
          else
-		        *(targetCopy++) = *(sourceCopy++);
+		        *(targetCopy ++) = *(sourceCopy ++);
        }
-        source->pos   = source->writableBuffer + 
-                                               (UCharOffset - source->string);
+        source->pos   = targetCopy;
        source->len   = targetCopy;
        source->CEpos = source->toReturn = source->CEs;
        CE = UCOL_IGNORABLE;
@@ -1470,32 +1464,22 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
        */
        UCharOffset += *UCharOffset; 

-        schar = *source->pos;
+        schar = *(source->pos - 1);
        while (schar > (tchar = *UCharOffset)) 
          UCharOffset ++;
        
        if (schar != tchar) 
-        { 
+        {
          /* 
          we didn't find the correct codepoint. We can use either the first or 
          the last CE 
          */
-          if (tchar != 0xFFFF)
-            UCharOffset = constart; 
+          /* testing if (tchar != 0xFFFF) */
+          UCharOffset = constart; 
        } 
        else
-        {
          /* Move up one character */
-          if (source->pos != source->writableBuffer)
-            source->pos --;
-          else
-          {
-            source->pos = source->string + 
-                          (length - (source->len - source->writableBuffer));
-            source->len = source->string + length;
-            source->isThai = TRUE;
-          }
-        }
+          source->pos --;
        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
        if (!isContraction(CE))
          break;  
@@ -1521,7 +1505,7 @@ uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
        while (*CEOffset != 0) 
          *(source->CEpos ++) = *CEOffset ++;
      source->toReturn = source->CEpos - 1;
-      return *(source->toReturn --);
+      return *(source->toReturn);
    case CHARSET_TAG:
      /* probably after 1.8 */
      return UCOL_NOT_FOUND;
--- a/icu4c/source/i18n/ucoleitr.cpp
+++ b/icu4c/source/i18n/ucoleitr.cpp
@@ -57,6 +57,7 @@ ucol_openElements(const UCollator  *coll,
    textLength = u_strlen(text);

  result->length_ = textLength;
+  result->reset_   = TRUE;
  init_collIterate(text, textLength, &result->iteratordata_, FALSE);

  return result;
@@ -77,6 +78,7 @@ U_CAPI void
 ucol_reset(UCollationElements *elems)
 {
  collIterate *ci = &(elems->iteratordata_);
+  elems->reset_   = TRUE;
  ci->pos         = ci->string;
  ci->len         = ci->string + elems->length_;
  ci->CEpos       = ci->toReturn = ci->CEs;
@@ -97,40 +99,42 @@ U_CAPI int32_t
 ucol_next(UCollationElements *elems,
          UErrorCode         *status)
 {
+  int32_t result;
  if (U_FAILURE(*status)) 
    return UCOL_NULLORDER;

-  int32_t result;
+  elems->reset_ = FALSE;
+  
  UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status);
  /*
  if ((elems->iteratordata_).CEpos > (elems->iteratordata_).toReturn) 
-  {                       
-    result = *((elems->iteratordata_).toReturn++);                                      
-    if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn)
-      (elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn = 
-      (elems->iteratordata_).CEs; 
-  } 
-  else 
-    if ((elems->iteratordata_).pos < (elems->iteratordata_).len) 
-    {                        
-      UChar ch = *(elems->iteratordata_).pos++;     
-      if (ch <= 0xFF)
-        (result) = (elems->collator_)->latinOneMapping[ch];                                          
-      else
-        (result) = ucmp32_get((elems->collator_)->mapping, ch);                                      
-                                                                                    
-      if((result) >= UCOL_NOT_FOUND) 
-      {
-        (result) = getSpecialCE((elems->collator_), (result), 
-                                &(elems->iteratordata_), (status));        
-        if ((result) == UCOL_NOT_FOUND)
-          (result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));                                                                            
-      }                                                                               
+    {                       
+      result = *((elems->iteratordata_).toReturn++);                                      
+      if ((elems->iteratordata_).CEpos == (elems->iteratordata_).toReturn)
+        (elems->iteratordata_).CEpos = (elems->iteratordata_).toReturn = 
+        (elems->iteratordata_).CEs; 
    } 
-    else
-      (result) = UCOL_NO_MORE_CES;                                                     
+    else 
+      if ((elems->iteratordata_).pos < (elems->iteratordata_).len) 
+      {                        
+        UChar ch = *(elems->iteratordata_).pos++;     
+        if (ch <= 0xFF)
+          (result) = (elems->collator_)->latinOneMapping[ch];                                          
+        else
+          (result) = ucmp32_get((elems->collator_)->mapping, ch);                                      
+                                                                                    
+        if((result) >= UCOL_NOT_FOUND) 
+        {
+          (result) = getSpecialCE((elems->collator_), (result), 
+                                  &(elems->iteratordata_), (status));        
+          if ((result) == UCOL_NOT_FOUND)
+            (result) = ucol_getNextUCA(ch, &(elems->iteratordata_), (status));                                                                            
+        }                                                                               
+      } 
+      else
+        (result) = UCOL_NO_MORE_CES;
  */
-    
+  
  if (result == UCOL_NO_MORE_CES)
    result = UCOL_NULLORDER;
  return result;
@@ -142,62 +146,61 @@ ucol_previous(UCollationElements *elems,
 {
  if(U_FAILURE(*status)) 
    return UCOL_NULLORDER;
+  else
+  {
+    int32_t result;

-  int32_t result;
-  UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_, 
-                 elems->length_, status);
+    if (elems->reset_ && 
+        (elems->iteratordata_.pos == elems->iteratordata_.string))
+      elems->iteratordata_.pos = elems->iteratordata_.len;

-  /* synwee : to be removed, only for testing 
-  const UCollator   *coll  = elems->collator_;
-        collIterate *data  = &(elems->iteratordata_);
-        int32_t     length = elems->length_;
+    elems->reset_ = FALSE;

-  if (data->CEpos > data->CEs) 
-  {              
-    data->toReturn --;
-    (result) = *(data->toReturn);                                           
-    if (data->CEs == data->toReturn)                                
-      data->CEpos = data->toReturn = data->CEs; 
-  }                                                                          
-  else 
-  {                    
-    /* pointers are always at the next position to be retrieved for getnextce 
-    for every first previous step after a next, value returned will the same 
-    as the last next value
-    */
-    /*if (data->len - data->pos == length)
-      (result) = UCOL_NO_MORE_CES;                                                                                                                    
+    UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_, 
+                   elems->length_, status);
+
+    /* synwee : to be removed, only for testing */
+    /*
+    const UCollator   *coll  = elems->collator_;
+          collIterate *data  = &(elems->iteratordata_);
+          int32_t     length = elems->length_;
+
+    if (data->CEpos > data->CEs) 
+    {              
+      data->toReturn --;
+      (result) = *(data->toReturn);                                           
+      if (data->CEs == data->toReturn)                                
+        data->CEpos = data->toReturn = data->CEs; 
+    }                                                                          
    else 
-    {                  
-      if (data->pos != data->writableBuffer)
-        data->pos --;                                 
+    {                    
+      if (data->pos == data->string || data->pos == data->writableBuffer)
+        (result) = UCOL_NO_MORE_CES;                                                                                                                    
      else 
-      {                                                                 
-        data->pos = data->string +                                             
-                            (length - (data->len - data->writableBuffer));     
-        data->len = data->string + length;                                     
-        data->isThai = TRUE;                                                  
-      }                
-
-      UChar ch = *(data->pos);
-      if (ch <= 0xFF)                                                
-        (result) = (coll)->latinOneMapping[ch];                                                                                       
-      else
-        (result) = ucmp32_get((coll)->mapping, ch);                           
+      {                  
+        data->pos --;                                 
+      
+        UChar ch = *(data->pos);
+        if (ch <= 0xFF)                                                
+          (result) = (coll)->latinOneMapping[ch];                                                                                       
+        else
+          (result) = ucmp32_get((coll)->mapping, ch);                           
                                                                       
-      if ((result) >= UCOL_NOT_FOUND) 
-      {
-        (result) = getSpecialPrevCE(coll, result, data, length, status);      
-        if ((result) == UCOL_NOT_FOUND)
-          (result) = ucol_getPrevUCA(ch, data, length, status);                                      
-      }                                                                      
-    }                                                                        
-  }   */
+        if ((result) >= UCOL_NOT_FOUND) 
+        {
+          (result) = getSpecialPrevCE(coll, result, data, length, status);      
+          if ((result) == UCOL_NOT_FOUND)
+            (result) = ucol_getPrevUCA(ch, data, length, status);                                      
+        }                                                                      
+      }                                                                        
+    } 
+    */
+    
+    if (result == UCOL_NO_MORE_CES)
+      result = UCOL_NULLORDER;

-  if (result == UCOL_NO_MORE_CES)
-    result = UCOL_NULLORDER;
-
-  return result;
+    return result;
+  }
 }

 U_CAPI int32_t
@@ -240,14 +243,7 @@ ucol_getOffset(const UCollationElements *elems)
  if (ci->isThai == TRUE)
    return ci->pos - ci->string;

-  /* 
-  if it is a thai string with reversed elements, since getNextCE does not 
-  store only a substring in writeablebuffer, we'll have to do some calculation
-  to get the offset out.
-  need discussion to see if it is a better idea to store the whole string 
-  instead.
-  */
-  return elems->length_ - (ci->len - ci->pos);
+  return ci->pos - ci->writableBuffer;
 }

 U_CAPI void
--- a/icu4c/source/i18n/ucolimp.h
+++ b/icu4c/source/i18n/ucolimp.h
@@ -87,6 +87,10 @@ struct UCollationElements
  * Source text length
  */
        int32_t            length_;
+  /**
+  * Indicates if this data has been reset.
+  */
+        UBool              reset_;
 };

 struct incrementalContext {
@@ -240,20 +244,12 @@ struct incrementalContext {
    }                                                                        \
  }                                                                          \
  else {                                                                     \
-    if ((data).len - (data).pos == length) {                                 \
+    if ((data).pos == (data).string || (data).pos == (data).writableBuffer) {\
      (order) = UCOL_NO_MORE_CES;                                            \
    }                                                                        \
    else {                                                                   \
      UChar ch;                                                              \
-      if ((data).pos != (data).writableBuffer) {                             \
-        (data).pos --;                                                       \
-      }                                                                      \
-      else {                                                                 \
-        (data).pos = (data).string +                                         \
-                            (length - ((data).len - (data).writableBuffer)); \
-        (data).len = (data).string + length;                                 \
-        (data).isThai = TRUE;                                                \
-      }                                                                      \
+      (data).pos --;                                                         \
      ch = *((data).pos);                                                    \
      if (ch <= 0xFF) {                                                      \
        (order) = (coll)->latinOneMapping[ch];                               \