2013-02-11 04:51:14 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 2013, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
*
|
|
|
|
* identifier_info.h
|
|
|
|
*
|
|
|
|
* created on: 2013 Jan 7
|
|
|
|
* created by: Andy Heninger
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __IDENTIFIER_INFO_H__
|
|
|
|
#define __IDENTIFIER_INFO_H__
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "unicode/uspoof.h"
|
|
|
|
#include "uhash.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
|
|
|
class ScriptSet;
|
|
|
|
|
|
|
|
// TODO(andy): review consistency of reference vs pointer arguments to the functions.
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
|
|
|
|
* then setIdentifier. Available methods include:
|
|
|
|
* <ol>
|
|
|
|
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
|
|
|
|
* each of these.
|
|
|
|
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
|
|
|
|
* either Katakana or Hiragana.
|
|
|
|
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
|
|
|
|
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
|
|
|
|
* the identifier.
|
|
|
|
* <li>call getRestrictionLevel to see what the UTS36 restriction level is.
|
|
|
|
* </ol>
|
|
|
|
*
|
|
|
|
* This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
|
|
|
|
*/
|
|
|
|
class U_I18N_API IdentifierInfo : public UMemory {

  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     *
     * @param status Errorcode; presumably set if construction fails
     *               (confirm against the implementation).
     * @internal
     */
    IdentifierInfo(UErrorCode &status);

    /**
     * Destructor
     */
    virtual ~IdentifierInfo();

  private:
    /*
     * Disallow copying for now. Can be added if there's a need.
     *
     * Both the copy constructor and the copy assignment operator are declared
     * private. The object holds raw pointers and has its own destructor, so a
     * compiler-generated assignment would copy the pointers member-wise and
     * leave two objects referring to the same data. No implementation is
     * provided for the assignment operator.
     */
    IdentifierInfo(const IdentifierInfo &other);
    IdentifierInfo &operator=(const IdentifierInfo &other);

  public:

    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;


    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @param status Errorcode, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);


    /**
     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     * and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;


    /**
     * Get the scripts found in the identifiers.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;

    /**
     * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object,
     *         and remains valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of representative zero characters, one for each decimal
     *         number system found in the identifier.
     * @internal
     */
    const UnicodeSet *getNumerics() const;

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;

    /**
     * Get the number of scripts appearing in the identifier.
     *   Note: Common and Inherited scripts are omitted from the count.
     *   Note: Result may be high when the identifier contains characters
     *         with alternate scripts. The distinction between
     *         0, 1 and > 1 will remain valid, however.
     * @return the number of scripts.
     */
    int32_t getScriptCount() const;

#if !UCONFIG_NO_NORMALIZATION

    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     * Only declared when normalization support is compiled in.
     *
     * @param status Errorcode; presumably set if errors occur
     *               (confirm against the implementation).
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;

#endif /*!UCONFIG_NO_NORMALIZATION */

    /**
     * Produce a string form of this object.
     * NOTE(review): the exact contents of the string are defined by the
     * implementation, which is not visible from this header.
     *
     * @return the string form, returned by value.
     */
    UnicodeString toString() const;

    /**
     * Produce a readable string of alternates.
     *
     * @param dest       the string passed in by the caller; presumably receives the
     *                   display form and is the object returned (confirm against
     *                   the implementation).
     * @param alternates a UHashtable of UScriptSets.
     *                   Keys only, no meaningful values in the UHash.
     * @param status     Errorcode; presumably set if errors occur.
     * @return display form
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);

    /**
     * Static memory cleanup function.
     * @internal
     */
    static UBool cleanup();

  private:

    IdentifierInfo  & clear();
    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;

    // Per-object state.
    // NOTE(review): which getter each field backs is inferred from the names
    // only; confirm against the implementation file.
    UnicodeString     *fIdentifier;
    ScriptSet         *fRequiredScripts;
    UHashtable        *fScriptSetSet;
    ScriptSet         *fCommonAmongAlternates;
    UnicodeSet        *fNumerics;
    UnicodeSet        *fIdentifierProfile;

    // Class-wide data shared by all instances.
    // NOTE(review): presumably these are what cleanup() releases; confirm
    // against the implementation file.
    static UnicodeSet *ASCII;
    static ScriptSet  *JAPANESE;
    static ScriptSet  *CHINESE;
    static ScriptSet  *KOREAN;
    static ScriptSet  *CONFUSABLE_WITH_LATIN;

};
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif // __IDENTIFIER_INFO_H__
|
|
|
|
|