scuffed-code/icu4c/source/i18n/ubrk.h

/*
*****************************************************************************************
*                                                                                       *
* COPYRIGHT:                                                                            *
*   (C) Copyright Taligent, Inc.,  1996                                                 *
*   (C) Copyright International Business Machines Corporation,  1998-1999               *
*   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
*   US Government Users Restricted Rights - Use, duplication, or disclosure             *
*   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
*                                                                                       *
*****************************************************************************************
*/

#ifndef UBRK_H
#define UBRK_H

#include "utypes.h"
/**
 * @name BreakIterator C API
 *
 * The BreakIterator C API defines methods for finding the location
 * of boundaries in text. A pointer to a UBreakIterator maintains a
 * current position and scans over text, returning the index of characters
 * where boundaries occur.
 * <P>
 * Line boundary analysis determines where a text string can be broken
 * when line-wrapping. The mechanism correctly handles punctuation and
 * hyphenated words.
 * <P>
 * Sentence boundary analysis allows selection with correct
 * interpretation of periods within numbers and abbreviations, and
 * trailing punctuation marks such as quotation marks and parentheses.
 * <P>
 * Word boundary analysis is used by search and replace functions, as
 * well as within text editing applications that allow the user to
 * select words with a double click. Word selection provides correct
 * interpretation of punctuation marks within and following
 * words. Characters that are not part of a word, such as symbols or
 * punctuation marks, have word-breaks on both sides.
 * <P>
 * Character boundary analysis allows users to interact with
 * characters as they expect to, for example, when moving the cursor
 * through a text string. Character boundary analysis provides correct
 * navigation through character strings, regardless of how the
 * character is stored.  For example, an accented character might be
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
 * <P>
 * This is the interface for all text boundaries.
 * <P>
 * Examples:
 * <P>
 * Helper function to output text
 * <pre>
 * .   void printTextRange(UChar* str, UTextOffset start, UTextOffset end ) {
 * .        UChar* result;
 * .        UChar* temp;
 * .        char* res;
 * .        temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(str)-start)+1));
 * .        result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1));
 * .        u_strcpy(temp, &str[start]);
 * .        u_strncpy(result, temp, end-start);
 * .        result[end-start]=0;
 * .        res=(char*)malloc(sizeof(char) * (u_strlen(result)+1));
 * .        u_austrcpy(res, result);
 * .        printf("%s\n", res);
 * .        free(res);
 * .        free(result);
 * .        free(temp);
 * .   }
 * </pre>
 * Print each element in order:
 * <pre>
 * .   void printEachForward( UBreakIterator* boundary, UChar* str) {
 * .      UTextOffset end;
 * .      UTextOffset start = ubrk_first(boundary);
 * .      for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end = ubrk_next(boundary)) {
 * .            printTextRange(str, start, end );
 * .        }
 * .   }
 * </pre>
 * Print each element in reverse order:
 * <pre>
 * .   void printEachBackward( UBreakIterator* boundary, UChar* str) {
 * .      UTextOffset start;
 * .      UTextOffset end = ubrk_last(boundary);
 * .      for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start, start =ubrk_previous(boundary)) {
 * .            printTextRange( str, start, end );
 * .        }
 * .   }
 * </pre>
 * Print first element
 * <pre>
 * .   void printFirst(UBreakIterator* boundary, UChar* str) {
 * .       UTextOffset end;
 * .       UTextOffset start = ubrk_first(boundary);
 * .       end = ubrk_next(boundary);
 * .       printTextRange( str, start, end );
 * .   }
 * </pre>
 * Print last element
 * <pre>
 * .   void printLast(UBreakIterator* boundary, UChar* str) {
 * .       UTextOffset start;
 * .       UTextOffset end = ubrk_last(boundary);
 * .       start = ubrk_previous(boundary);
 * .       printTextRange(str, start, end );
 * .   }
 * </pre>
 * Print the element at a specified position
 * <pre>
 * .   void printAt(UBreakIterator* boundary, UTextOffset pos , UChar* str) {
 * .       UTextOffset start;
 * .       UTextOffset end = ubrk_following(boundary, pos);
 * .       start = ubrk_previous(boundary);
 * .       printTextRange(str, start, end );
 * .   }
 * </pre>
 * Creating and using text boundaries
 * <pre>
 * .      void BreakIterator_Example( void ) {
 * .          UBreakIterator* boundary;
 * .          UErrorCode status = U_ZERO_ERROR;
 * .          UChar *stringToExamine;
 * .          stringToExamine=(UChar*)malloc(sizeof(UChar) * (strlen("Aaa bbb ccc. Ddd eee fff.")+1) );
 * .          u_uastrcpy(stringToExamine, "Aaa bbb ccc. Ddd eee fff.");
 * .          printf("Examining: Aaa bbb ccc. Ddd eee fff.\n");
 * .
 * .          //print each sentence in forward and reverse order
 * .          boundary = ubrk_open(UBRK_SENTENCE, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
 * .          printf("----- forward: -----------\n");
 * .          printEachForward(boundary, stringToExamine);
 * .          printf("----- backward: ----------\n");
 * .          printEachBackward(boundary, stringToExamine);
 * .          ubrk_close(boundary);
 * .
 * .          //print each word in order
 * .          boundary = ubrk_open(UBRK_WORD, "en_us", stringToExamine, u_strlen(stringToExamine), &status);
 * .          printf("----- forward: -----------\n");
 * .          printEachForward(boundary, stringToExamine);
 * .          printf("----- backward: ----------\n");
 * .          printEachBackward(boundary, stringToExamine);
 * .          //print first element
 * .          printf("----- first: -------------\n");
 * .          printFirst(boundary, stringToExamine);
 * .          //print last element
 * .          printf("----- last: --------------\n");
 * .          printLast(boundary, stringToExamine);
 * .          //print word at charpos 10
 * .          printf("----- at pos 10: ---------\n");
 * .          printAt(boundary, 10 , stringToExamine);
 * .
 * .          ubrk_close(boundary);
 * .      }
 * </pre>
 */

/**
 * A text-break iterator.
 *  For usage in C programs.
 * <P>
 * The type is declared as an untyped pointer (void*), so it carries no
 * type information of its own. The functions declared in this header accept
 * and return a pointer to a UBreakIterator (UBreakIterator*).
 */
typedef void* UBreakIterator;

/**
 * The kinds of text boundary a UBreakIterator can locate.
 * One of these constants is passed as the type argument of ubrk_open.
 */
typedef enum UBreakIteratorType {
  /** Boundaries between characters */
  UBRK_CHARACTER = 0,
  /** Boundaries between words */
  UBRK_WORD = 1,
  /** Positions at which a line may be broken */
  UBRK_LINE = 2,
  /** Boundaries between sentences */
  UBRK_SENTENCE = 3
} UBreakIteratorType;

/**
 * Value indicating all text boundaries have been returned.
 * The iteration functions declared in this header (for example ubrk_next
 * and ubrk_previous) document UBRK_DONE as the value they return when no
 * further boundary is available.
 * Defined as -1 converted to UTextOffset.
 */
#define UBRK_DONE ((UTextOffset) -1)

/**
 * Open a new UBreakIterator for locating text boundaries for a specified locale.
 * A UBreakIterator may be used for detecting character, line, word,
 * and sentence breaks in text.
 * When the iterator is no longer needed, close it with ubrk_close.
 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
 * UBRK_LINE, UBRK_SENTENCE
 * @param locale The locale specifying the text-breaking conventions
 * (the usage example in this header passes "en_us").
 * @param text The text to be iterated over.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified locale.
 * @see ubrk_openRules
 * @see ubrk_close
 */
U_CAPI UBreakIterator*
ubrk_open(UBreakIteratorType type,
      const char *locale,
      const UChar *text,
      int32_t textLength,
      UErrorCode *status);

/**
 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
 * The rule syntax is not yet documented here (TBD).
 * When the iterator is no longer needed, close it with ubrk_close.
 * @param rules A set of rules specifying the text breaking conventions.
 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
 * @param text The text to be iterated over.
 * @param textLength The number of characters in text, or -1 if null-terminated.
 * @param status A UErrorCode to receive any errors.
 * @return A UBreakIterator for the specified rules.
 * @see ubrk_open
 * @see ubrk_close
 */
U_CAPI UBreakIterator*
ubrk_openRules(const UChar *rules,
           int32_t rulesLength,
           const UChar *text,
           int32_t textLength,
           UErrorCode *status);

/**
 * Close a UBreakIterator.
 * Once closed, a UBreakIterator may no longer be used; do not pass it to
 * any other function declared in this header afterwards.
 * @param bi The break iterator to close.
 * @see ubrk_open
 * @see ubrk_openRules
 */
U_CAPI void
ubrk_close(UBreakIterator *bi);

/**
 * Determine the most recently-returned text boundary.
 * The iterator is taken through a pointer-to-const, unlike the other
 * iteration functions in this header.
 *
 * @param bi The break iterator to use.
 * @return The character index most recently returned by \Ref{ubrk_next}, \Ref{ubrk_previous},
 * \Ref{ubrk_first}, or \Ref{ubrk_last}.
 */
U_CAPI UTextOffset
ubrk_current(const UBreakIterator *bi);

/**
 * Determine the text boundary following the current text boundary.
 * The usage examples at the top of this header call this function repeatedly,
 * after ubrk_first, until it returns UBRK_DONE in order to visit every
 * boundary in forward order.
 *
 * @param bi The break iterator to use.
 * @return The character index of the next text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_previous
 */
U_CAPI UTextOffset
ubrk_next(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the current text boundary.
 * The usage examples at the top of this header call this function repeatedly,
 * after ubrk_last, until it returns UBRK_DONE in order to visit every
 * boundary in reverse order.
 *
 * @param bi The break iterator to use.
 * @return The character index of the preceding text boundary, or UBRK_DONE
 * if all text boundaries have been returned.
 * @see ubrk_next
 */
U_CAPI UTextOffset
ubrk_previous(UBreakIterator *bi);

/**
 * Determine the index of the first character in the text being scanned.
 * This is not always the same as index 0 of the text.
 * The usage examples at the top of this header use the returned index as
 * the start of the first range before iterating forward with ubrk_next.
 * @param bi The break iterator to use.
 * @return The character index of the first character in the text being scanned.
 * @see ubrk_last
 */
U_CAPI UTextOffset
ubrk_first(UBreakIterator *bi);

/**
 * Determine the index immediately <EM>beyond</EM> the last character in the text being
 * scanned.
 * This is not the same as the index of the last character.
 * The usage examples at the top of this header use the returned index as
 * the end of the last range before iterating backward with ubrk_previous.
 * @param bi The break iterator to use.
 * @return The character offset immediately <EM>beyond</EM> the last character in the
 * text being scanned.
 * @see ubrk_first
 */
U_CAPI UTextOffset
ubrk_last(UBreakIterator *bi);

/**
 * Determine the text boundary preceding the specified offset.
 * The value returned is always smaller than offset, or UBRK_DONE.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary preceding offset, or UBRK_DONE if there is none.
 * @see ubrk_following
 */
U_CAPI UTextOffset
ubrk_preceding(UBreakIterator *bi,
           UTextOffset offset);

/**
 * Determine the text boundary following the specified offset.
 * The value returned is always greater than offset, or UBRK_DONE.
 * The printAt example at the top of this header combines this function
 * with ubrk_previous to obtain the range around a given position.
 * @param bi The break iterator to use.
 * @param offset The offset to begin scanning.
 * @return The text boundary following offset, or UBRK_DONE if there is none.
 * @see ubrk_preceding
 */
U_CAPI UTextOffset
ubrk_following(UBreakIterator *bi,
           UTextOffset offset);

/**
 * Get a locale for which text breaking information is available.
 * A UBreakIterator in a locale returned by this function will perform the correct
 * text breaking for the locale.
 * @param index The index of the desired locale.
 * NOTE(review): presumably zero-based and less than ubrk_countAvailable() -- confirm
 * against the implementation.
 * @return A locale for which text breaking information is available, or 0
 * (a null pointer) if none.
 * @see ubrk_countAvailable
 */
U_CAPI const char*
ubrk_getAvailable(int32_t index);

/**
 * Determine how many locales have text breaking information available.
 * This function is most useful for determining the loop ending condition for
 * calls to \Ref{ubrk_getAvailable}.
 * @return The number of locales for which text breaking information is available.
 * @see ubrk_getAvailable
 */
U_CAPI int32_t
ubrk_countAvailable(void);

#endif