ICU-20088 User Guide migration: Add navigation structure and TOC for Jekyll, fix formatting issues, etc.
This commit is contained in:
parent
b2ada378be
commit
2e56c4264a
@ -101,6 +101,7 @@ icu4j/perf-tests/data/conversion/*
|
||||
# docs
|
||||
#
|
||||
docs/ide4c/vscode/*.json
|
||||
docs/Gemfile.lock
|
||||
#
|
||||
# tools
|
||||
#
|
||||
|
@ -5,6 +5,8 @@ The ICU project is under the stewardship of [The Unicode Consortium](https://www
|
||||
|
||||
- Source: https://github.com/unicode-org/icu
|
||||
- Bugs: https://unicode-org.atlassian.net/projects/ICU
|
||||
- API Docs: https://unicode-org.github.io/icu-docs/
|
||||
- User Guide: https://unicode-org.github.io/icu/
|
||||
|
||||
![ICU Logo](./tools/images/iculogo_64.png)
|
||||
|
||||
|
@ -1,7 +1,10 @@
|
||||
---
|
||||
permalink: /404.html
|
||||
layout: default
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
<style type="text/css" media="screen">
|
||||
.container {
|
||||
|
10
docs/Gemfile
10
docs/Gemfile
@ -1,3 +1,6 @@
|
||||
# © 2020 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
source "https://rubygems.org"
|
||||
# Hello! This is where you manage which Jekyll version is used to run.
|
||||
# When you want to use a different version, change it below, save the
|
||||
@ -8,10 +11,13 @@ source "https://rubygems.org"
|
||||
# This will help ensure the proper Jekyll version is running.
|
||||
# Happy Jekylling!
|
||||
|
||||
# gem "jekyll", "~> 3.8.7"
|
||||
|
||||
# This is the default theme for new Jekyll sites. You may change this to anything you like.
|
||||
gem "minima", "~> 2.5"
|
||||
# If you want to use GitHub Pages, remove the "gem "jekyll"" above and
|
||||
# uncomment the line below. To upgrade, run `bundle update github-pages`.
|
||||
# If you aren't using GitHub Pages, then uncomment the line above that starts with
|
||||
# 'gem "jekyll"' and then comment out the line below that starts with 'gem "github-pages"'.
|
||||
# To upgrade, run `bundle update github-pages`.
|
||||
gem "github-pages", "~> 206", group: :jekyll_plugins
|
||||
gem "just-the-docs"
|
||||
# If you have any plugins, put them here!
|
||||
|
103
docs/_config.yml
103
docs/_config.yml
@ -1,52 +1,95 @@
|
||||
remote_theme: pmarsceill/just-the-docs
|
||||
|
||||
# GitHub uses it's own markdown renderer.
|
||||
# We use this in order to try and have the docs pages match ones in the repo.
|
||||
markdown: CommonMarkGhPages
|
||||
|
||||
# Plugins:
|
||||
# jekyll-commonmark-ghpages -- This is needed in order to use CommonMarkGhPages
|
||||
# jemoji -- For rendering emoji like :point_right:
|
||||
plugins:
|
||||
- jekyll-commonmark-ghpages
|
||||
- jekyll-remote-theme
|
||||
- jemoji
|
||||
|
||||
commonmark:
|
||||
options: ["SMART", "FOOTNOTES"]
|
||||
extensions: ["strikethrough", "autolink", "table", "tagfilter"]
|
||||
# © 2020 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
title: ICU Documentation
|
||||
description: ICU Documentation
|
||||
email: icu-support@icu-project.org
|
||||
description: >- # this means to ignore newlines until "baseurl:"
|
||||
ICU is a mature, widely used set of C/C++ and Java libraries providing Unicode
|
||||
and Globalization support for software applications. The ICU User Guide provides
|
||||
documentation on how to use ICU.
|
||||
|
||||
baseurl: "/icu" # the subpath for the site
|
||||
url: "https://jefgen.github.io" # the base hostname & protocol for your site
|
||||
url: "https://unicode-org.github.io" # the base hostname & protocol for your site
|
||||
|
||||
twitter_username: unicode
|
||||
github_username: unicode-org
|
||||
|
||||
remote_theme: pmarsceill/just-the-docs
|
||||
|
||||
# GitHub uses its own markdown renderer called CommonMarkGhPages.
|
||||
#
|
||||
# To use this we'd need to set the following options:
|
||||
# markdown: CommonMarkGhPages
|
||||
# Plugins:
|
||||
# jekyll-commonmark-ghpages
|
||||
#
|
||||
# commonmark:
|
||||
# options: ["SMART", "FOOTNOTES"]
|
||||
# extensions: ["strikethrough", "autolink", "table", "tagfilter"]
|
||||
#
|
||||
# However, it doesn't work well with Jekyll and the Just-The-Docs theme,
|
||||
# and causes too many formatting problems. Additionally, it also doesn't
|
||||
# have any support for auto-generating a Table of Contents (TOC) section
|
||||
# for a page. Due to these reasons, we instead use the recommended
|
||||
# markdown renderer for Jekyll, called kramdown.
|
||||
#
|
||||
markdown: kramdown
|
||||
|
||||
# Notes on plugins:
|
||||
# - GitHub Pages ignores all plugins included in the Gemfile.
|
||||
# - Only the ones listed in the _config.yml file (this file) are used.
|
||||
# - There is a very limited set of plugins that GH Pages supports (for security
|
||||
# reasons). The listing is here: https://pages.github.com/versions
|
||||
# - The jekyll-sitemap plugin generates a "sitemap.xml" for search engines.
|
||||
# - We need the "jemoji" plugin for rendering emoji like ":point_right:".
|
||||
plugins:
|
||||
- jekyll-remote-theme
|
||||
- jekyll-sitemap
|
||||
- jemoji
|
||||
|
||||
search_enabled: true
|
||||
|
||||
# Color scheme currently only supports "dark" or nil (default)
|
||||
# Color scheme currently only supports "dark" or nil (light).
|
||||
color_scheme: nil
|
||||
|
||||
# Aux links for the upper right navigation
|
||||
aux_links:
|
||||
"API Docs":
|
||||
- "https://unicode-org.github.io/icu-docs"
|
||||
"ICU on GitHub":
|
||||
- "//github.com/jefgen/icu"
|
||||
- "https://github.com/unicode-org/icu"
|
||||
"ICU Home Page":
|
||||
- "http://site.icu-project.org/"
|
||||
"Unicode Home Page":
|
||||
- "https://www.unicode.org"
|
||||
|
||||
# Back to top link
|
||||
# Makes Aux links open in a new tab.
|
||||
aux_links_new_tab: true
|
||||
|
||||
# Add a "Back to top" link at the bottom of each page.
|
||||
back_to_top: true
|
||||
back_to_top_text: "Back to top"
|
||||
|
||||
# Enable heading anchors
|
||||
heading_anchors: true
|
||||
|
||||
# Footer content
|
||||
# appears at the bottom of every page's main content
|
||||
footer_content: "Copyright © 2016 and later Unicode, Inc. and others. All Rights Reserved. <a href=\"http://www.unicode.org/copyright.html\">Terms of use and License</a>."
|
||||
# Footer content: appears at the bottom of every page after the main content.
|
||||
footer_content: "© 2016 and later: Unicode, Inc. and others. License & terms of use: <a href=\"http://www.unicode.org/copyright.html\">http://www.unicode.org/copyright.html</a>"
|
||||
|
||||
# TODO: I think this would be better at the top of the page, but I'm not sure how to do that.
|
||||
# Footer "Edit this page on GitHub" link text
|
||||
# Footer content: Add an "Edit this page on GitHub" link.
|
||||
# Note: I think this would be better at the *top* of the page, but it isn't possible to do that by default with just-the-docs.
|
||||
gh_edit_link: true # show or hide edit this page link
|
||||
gh_edit_link_text: "Edit this page on GitHub."
|
||||
gh_edit_repository: "https://github.com/jefgen/icu" # the github URL for your repo
|
||||
gh_edit_branch: "master" # the branch that your docs is served from
|
||||
gh_edit_view_mode: "tree" # "tree" or "edit" if you want the user to jump into the editor immediately
|
||||
gh_edit_repository: "https://github.com/unicode-org/icu"
|
||||
# NOTE: Since we serve the docs out of the "docs" folder, we need to include that here even though
|
||||
# it is technically *not* part of the branch name.
|
||||
gh_edit_branch: "master/docs"
|
||||
# Can be "tree" or "edit". Using "edit" takes the user directly into the editor.
|
||||
gh_edit_view_mode: "tree"
|
||||
|
||||
# Exclude from processing.
|
||||
# Excluded items can be processed by explicitly listing the directories or
|
||||
# their entries' file path in the `include:` list.
|
||||
exclude:
|
||||
- Gemfile
|
||||
- Gemfile.lock
|
||||
|
8
docs/_sass/color_schemes/light.scss
Normal file
8
docs/_sass/color_schemes/light.scss
Normal file
@ -0,0 +1,8 @@
|
||||
// © 2020 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// Increase the side bar width as some of the page titles don't fit in the default size.
|
||||
$nav-width: 284px;
|
||||
|
||||
// Increase the main content area width as some tables assume a larger size.
|
||||
$content-width: 900px;
|
@ -1,7 +0,0 @@
|
||||
---
|
||||
layout: page
|
||||
title: About
|
||||
permalink: /about/
|
||||
---
|
||||
|
||||
This is a proof-of-concept of using the Jekyll-compatible Markdown file headers to assign site structure location to a pre-existing Markdown file. The site structure is reflected in the navigation links that Jekyll generates on all the pages.
|
@ -3,7 +3,6 @@ layout: default
|
||||
title: ICU Documentation
|
||||
nav_order: 1
|
||||
description: ICU Documentation
|
||||
permalink: /
|
||||
---
|
||||
|
||||
<!--
|
||||
@ -17,7 +16,7 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
## ICU User Guide
|
||||
|
||||
The [ICU User Guide](./userguideintro) provides information on i18n topics for which ICU has services, and
|
||||
The [ICU User Guide](./userguide) provides information on i18n topics for which ICU has services, and
|
||||
includes details that go beyond the C, C++, and Java API docs (and avoids some duplication between them).
|
||||
|
||||
This is the new home of the User Guide (since 2020 August).
|
||||
|
@ -1,17 +1,32 @@
|
||||
---
|
||||
layout: default
|
||||
title: Break Rules
|
||||
nav_order: 1
|
||||
parent: Boundary Analysis
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Break Rules
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Introduction
|
||||
|
||||
ICU locates boundary positions within text by means of rules, which are a form
|
||||
of regular expressions. The form of the rules is similar, but not identical,
|
||||
to the boundary rules from the Unicode specifications
|
||||
[[UAX-14](https://unicode.org/reports/tr14/),
|
||||
[UAX-29](https://unicode.org/reports/tr29/)], and there is a reasonably close
|
||||
[ [UAX-14](https://unicode.org/reports/tr14/),
|
||||
[UAX-29](https://unicode.org/reports/tr29/) ], and there is a reasonably close
|
||||
correspondence between the two.
|
||||
|
||||
Taken as a set, the ICU rules describe how to move forward to the next boundary,
|
||||
@ -29,12 +44,13 @@ customizations.
|
||||
Rules most commonly describe a range of text that should remain together,
|
||||
unbroken. For example, this rule
|
||||
|
||||
```
|
||||
[\p{Letter}]+;
|
||||
```
|
||||
|
||||
matches a run of one or more letters, and would cause them to remain unbroken.
|
||||
|
||||
The part within `[`brackets`]` follows normal ICU [UnicodeSet pattern
|
||||
syntax](../strings/unicodeset.md).
|
||||
The part within `[`brackets`]` follows normal ICU [UnicodeSet pattern syntax](../strings/unicodeset.md).
|
||||
|
||||
The qualifier, '`+`' in this case, can be one of
|
||||
|
||||
@ -56,10 +72,12 @@ of a constant expression.
|
||||
|
||||
They start with a '`$`', both in the definition and use.
|
||||
|
||||
```
|
||||
# Variable Definition
|
||||
$ASCIILetNum = [A-Za-z0-9];
|
||||
# Variable Use
|
||||
$ASCIILetNum+;
|
||||
```
|
||||
|
||||
#### Comments and Semicolons
|
||||
|
||||
@ -81,11 +99,13 @@ would be difficult to implement without it.
|
||||
|
||||
Starting with an example,
|
||||
|
||||
```
|
||||
!!chain;
|
||||
$word_char = [\p{Letter}];
|
||||
$word_joiner = [_-];
|
||||
$word_char+;
|
||||
$word_char $word_joiner $word_char;
|
||||
```
|
||||
|
||||
These rules will match "`abc`", "`hello_world`", "`hi-there`",
|
||||
"`a-bunch_of-joiners-here`".
|
||||
@ -101,9 +121,11 @@ In the example below, matching "`hello_world`",
|
||||
|
||||
* '`2`' shows matches of the second rule, `$word_char $word_joiner $word_char`
|
||||
|
||||
```
|
||||
hello_world
|
||||
11111 11111
|
||||
222
|
||||
11111 11111
|
||||
222
|
||||
```
|
||||
|
||||
There is an overlap of the matched regions, which causes the chaining mechanism
|
||||
to join them into a single overall match.
|
||||
@ -125,14 +147,18 @@ behavior.
|
||||
|
||||
For example, the following would match a simplified identifier:
|
||||
|
||||
```
|
||||
$Letter ($Letter | $Digit)*;
|
||||
```
|
||||
|
||||
#### String and Character Literals
|
||||
|
||||
Similarly to common regular expressions, literal characters that do not have
|
||||
other special meaning represent themselves. So the rule
|
||||
|
||||
```
|
||||
Hello;
|
||||
```
|
||||
|
||||
would match the literal input "`Hello`".
|
||||
|
||||
@ -142,7 +168,9 @@ character properties; literal characters in rules are very rare.
|
||||
To prevent random typos in rules from being treated as literals, use this
|
||||
option:
|
||||
|
||||
```
|
||||
!!quoted_literals_only;
|
||||
```
|
||||
|
||||
With the option, the naked `Hello` becomes a rule syntax error while a quoted
|
||||
`"hello"` still matches a literal hello.
|
||||
@ -156,7 +184,9 @@ A rule containing a slash (`/`) will force a boundary when it matches, even when
|
||||
other rules or chaining would otherwise lead to a longer match. Also called Hard
|
||||
Break Rules, these have the form
|
||||
|
||||
```
|
||||
pre-context / post-context;
|
||||
```
|
||||
|
||||
where the pre and post-context look like normal break rules. Both the pre and
|
||||
post context are required, and must not allow a zero-length match. There should
|
||||
@ -182,8 +212,8 @@ property is Combining Mark. This option is subject to change or removal, and
|
||||
should not be used in general. Within ICU, it is used only with the line break
|
||||
rules. We hope to replace it with something more general.~~
|
||||
|
||||
> :point_right: **Note**: `!!LBCMNoChain` is deprecated, and will be removed completely from a future
|
||||
version of ICU.
|
||||
> :point_right: **Note**: `!!LBCMNoChain` is deprecated, and will be removed
|
||||
> completely from a future version of ICU.
|
||||
|
||||
## Rule Status Values
|
||||
|
||||
@ -218,7 +248,9 @@ and are enclosed in `{`braces`}`.
|
||||
Hard break rules that also have a status value place the status at the end, for
|
||||
example
|
||||
|
||||
```
|
||||
pre-context / post-context {1234};
|
||||
```
|
||||
|
||||
### Word Dictionaries
|
||||
|
||||
@ -239,12 +271,13 @@ The dictionary implementation, on receiving a range of text, will map it to a
|
||||
specific dictionary based on script, and then delegate to that dictionary for
|
||||
subdividing the range into words.
|
||||
|
||||
See, for example, this snippet from the [line break
|
||||
rules](https://github.com/unicode-org/icu/blob/master/icu4c/source/data/brkitr/rules/line.txt):
|
||||
See, for example, this snippet from the [line break rules](https://github.com/unicode-org/icu/blob/master/icu4c/source/data/brkitr/rules/line.txt):
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
```
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
$dictionary = [$SA];
|
||||
```
|
||||
|
||||
## Rule Options
|
||||
|
||||
@ -367,15 +400,14 @@ Here is the syntax for the boundary rules. (The EBNF Syntax is given below.)
|
||||
|
||||
## Additional Sample Code
|
||||
|
||||
**C/C++**: See
|
||||
[icu/source/samples/break/](https://github.com/unicode-org/icu/tree/master/icu4c/source/samples/break/)
|
||||
in the ICU source distribution for code samples showing the use of ICU boundary
|
||||
analysis.
|
||||
**C/C++**
|
||||
See [icu/source/samples/break/](https://github.com/unicode-org/icu/tree/master/icu4c/source/samples/break/)
|
||||
in the ICU source distribution for code samples showing the use of ICU boundary analysis.
|
||||
|
||||
## Details about Dictionary-Based Break Iteration
|
||||
|
||||
> :point_right: **Note**: This section originally from August 2012.
|
||||
> It is probably out of date, for example `brkfiles.mk` does not exist anyore.
|
||||
> :point_right: **Note**: This section below is originally from August 2012.
|
||||
> It is probably out of date; for example, `brkfiles.mk` does not exist anymore.
|
||||
|
||||
Certain Unicode characters have a "dictionary" bit set in the break iteration
|
||||
rules, and text made up of these characters cannot be handled by the rules-based
|
||||
@ -428,10 +460,14 @@ add a similar set of lines for your script. Lastly, in
|
||||
`source/data/brkitr/root.txt`, add a line to the dictionaries `{}` section of the
|
||||
form:
|
||||
|
||||
```
|
||||
shortscriptname:process(dependency){"dictionaryname.dict"}
|
||||
```
|
||||
|
||||
For example, for Katakana:
|
||||
|
||||
```
|
||||
Kata:process(dependency){"cjdict.dict"}
|
||||
```
|
||||
|
||||
Make sure to add appropriate tests for the new implementation.
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Boundary Analysis
|
||||
nav_order: 10
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Boundary Analysis
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview of Text Boundary Analysis
|
||||
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: API Details
|
||||
nav_order: 6
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation API Details
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This section describes some of the usage conventions for the ICU Collation
|
||||
Service API.
|
||||
@ -59,7 +76,7 @@ This example demonstrates the instantiation of a collator.
|
||||
|
||||
**C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *coll = ucol_open("en_US", &status);
|
||||
if(U_SUCCESS(status)) {
|
||||
@ -70,7 +87,7 @@ if(U_SUCCESS(status)) {
|
||||
|
||||
**C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Collator *coll = Collator::createInstance(Locale("en", "US"), status);
|
||||
if(U_SUCCESS(status)) {
|
||||
@ -81,7 +98,7 @@ if(U_SUCCESS(status)) {
|
||||
|
||||
**Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
Collator col = null;
|
||||
try {
|
||||
col = Collator.getInstance(Locale.US);
|
||||
@ -94,16 +111,14 @@ try {
|
||||
### Instantiating Collators Using Custom Rules
|
||||
|
||||
If the ICU predefined collators are not appropriate for your intended usage, you
|
||||
can
|
||||
define your own set of rules and instantiate a collator that uses them. For more
|
||||
details, please see [the section on collation
|
||||
customization](customization/index.md).
|
||||
can define your own set of rules and instantiate a collator that uses them. For more
|
||||
details, please see [the section on collation customization](customization/index.md).
|
||||
|
||||
This example demonstrates the instantiation of a collator.
|
||||
|
||||
**C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
U_STRING_DECL(rules, "&9 < a, A < b, B < c, C; ch, cH, Ch, CH < d, D, e, E", 52);
|
||||
UCollator *coll;
|
||||
@ -118,7 +133,7 @@ if(U_SUCCESS(status)) {
|
||||
|
||||
**C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString rules(u"&9 < a, A < b, B < c, C; ch, cH, Ch, CH < d, D, e, E");
|
||||
Collator *coll = new RuleBasedCollator(rules, status);
|
||||
@ -130,7 +145,7 @@ if(U_SUCCESS(status)) {
|
||||
|
||||
**Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
RuleBasedCollator coll = null;
|
||||
String ruleset = "&9 < a, A < b, B < c, C; ch, cH, Ch, CH < d, D, e, E";
|
||||
try {
|
||||
@ -185,7 +200,7 @@ value, such as `ucol_greater`, `ucol_greaterOrEqual`, `ucol_equal` (in C)
|
||||
|
||||
**C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
UChar *s [] = { /* list of Unicode strings */ };
|
||||
uint32_t listSize = sizeof(s)/sizeof(s[0]);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@ -205,7 +220,7 @@ ucol_close(coll);
|
||||
|
||||
**C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UnicodeString s [] = { /* list of Unicode strings */ };
|
||||
uint32_t listSize = sizeof(s)/sizeof(s[0]);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
@ -225,7 +240,7 @@ delete coll;
|
||||
|
||||
**Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
String s [] = { /* list of Unicode strings */ };
|
||||
try {
|
||||
Collator coll = Collator.getInstance(Locale.US);
|
||||
@ -358,7 +373,7 @@ The best way to generate a series of sort keys is to do the following:
|
||||
Fill in the sort key values in the overflow buffer.
|
||||
|
||||
4. Allocate the sort key buffer with the size returned by `ucol_getSortKey()` and
|
||||
call memcpy to copy the sort key content from the temp buffer to the sort
|
||||
call `memcpy` to copy the sort key content from the temp buffer to the sort
|
||||
key buffer.
|
||||
|
||||
5. Loop back to step 1 until you are done.
|
||||
@ -367,7 +382,7 @@ The best way to generate a series of sort keys is to do the following:
|
||||
|
||||
### Example
|
||||
|
||||
```C
|
||||
```c
|
||||
void GetSortKeys(const UCollator* coll, const UChar*
|
||||
const *source, uint32_t arrayLength)
|
||||
{
|
||||
@ -413,7 +428,7 @@ first would probably be wasteful, since `strcoll` usually gives the result
|
||||
before whole strings are processed. This API is implemented only as a C function
|
||||
in ICU4C. There are no equivalent C++ or ICU4J functions.
|
||||
|
||||
```C
|
||||
```c
|
||||
...
|
||||
/* we are arriving with two char*: utf8Source and utf8Target, with their
|
||||
* lengths in utf8SourceLen and utf8TargetLen
|
||||
@ -435,7 +450,7 @@ uncompressed sort key. Between calls to the API you need to save a 64-bit state.
|
||||
Following is an example of simulating a string compare function using the partial
|
||||
sort key API. Your usage model is bound to look much different.
|
||||
|
||||
```C
|
||||
```c
|
||||
static UCollationResult compareUsingPartials(UCollator *coll,
|
||||
const UChar source[], int32_t sLen,
|
||||
const UChar target[], int32_t tLen,
|
||||
@ -478,7 +493,7 @@ of the usage model.
|
||||
|
||||
**C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
#define MAX_KEY_SIZE 100
|
||||
#define MAX_BUFFER_SIZE 10000
|
||||
#define MAX_LIST_LENGTH 5
|
||||
@ -514,7 +529,7 @@ if(U_SUCCESS(status)) {
|
||||
if (expectedLen > length) {
|
||||
if (temp2 == temp) {
|
||||
temp2 =(char*)malloc(expectedLen);
|
||||
} else
|
||||
} else {
|
||||
temp2 =(char*)realloc(temp2, expectedLen);
|
||||
}
|
||||
length =ucol_getSortKey(coll, s[i], -1, temp2, expectedLen);
|
||||
@ -531,7 +546,7 @@ ucol_close(coll);
|
||||
|
||||
**C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
#define MAX_LIST_LENGTH 5
|
||||
const UnicodeString s [] = {
|
||||
"Quick",
|
||||
@ -556,7 +571,7 @@ if(U_SUCCESS(status)) {
|
||||
|
||||
**Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
String s [] = {
|
||||
"Quick",
|
||||
"fox",
|
||||
@ -605,7 +620,7 @@ which can be found in the samples section.
|
||||
|
||||
**C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *coll = ucol_open("en_US", &status);
|
||||
UChar text[20];
|
||||
@ -625,7 +640,7 @@ ucol_close(coll);
|
||||
|
||||
**C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Collator *coll = Collator::createInstance(Locale::getUS(), status);
|
||||
UnicodeString text("text");
|
||||
@ -641,7 +656,7 @@ delete coll;
|
||||
|
||||
**Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
try {
|
||||
RuleBasedCollator coll = (RuleBasedCollator)Collator.getInstance(Locale.US);
|
||||
String text = "text";
|
||||
@ -675,7 +690,7 @@ error code. Similarly to the setter APIs for the Java version, no generic getter
|
||||
API is provided. Each attribute has its own getter API of the form
|
||||
`RuleBasedCollator.getATTRIBUTE_NAME()` in the Java version.
|
||||
|
||||
## References:
|
||||
## References
|
||||
|
||||
1. Ken Whistler, Markus Scherer: "Unicode Technical Standard #10, Unicode Collation
|
||||
Algorithm" (<http://www.unicode.org/unicode/reports/tr10/>)
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Architecture
|
||||
nav_order: 2
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation Service Architecture
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This section describes the design principles, architecture and coding
|
||||
conventions of the ICU Collation Service.
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Concepts
|
||||
nav_order: 1
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation Concepts
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The previous section demonstrated many of the requirements imposed on string
|
||||
comparison routines that try to correctly collate strings according to
|
||||
@ -568,10 +585,10 @@ padding spaces are removed, as in 1" and 2"
|
||||
|
||||
## Collator naming scheme
|
||||
|
||||
***Starting with ICU 54, the following naming scheme and its API functions are
|
||||
deprecated.*** Use ucol_open() with language tag collation keywords instead (see
|
||||
[Collation API Details](api.md)). For example,
|
||||
ucol_open("de-u-co-phonebk-ka-shifted", &errorCode) for German Phonebook order
|
||||
***Starting with ICU 54, the following naming scheme and its API functions are deprecated.***
|
||||
Use `ucol_open()` with language tag collation keywords instead
|
||||
(see [Collation API Details](api.md)). For example,
|
||||
`ucol_open("de-u-co-phonebk-ka-shifted", &errorCode)` for German Phonebook order
|
||||
with "ignore punctuation" mode.
|
||||
|
||||
When collating or matching text, a number of attributes can be used to affect
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Ignore Punctuation Options
|
||||
nav_order: 8
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# “Ignore Punctuation” Options
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
By default, spaces and punctuation characters add primary (base character)
|
||||
differences. Such characters sort less-than digits and letters. For example, the
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Customization
|
||||
nav_order: 3
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation Customization
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
ICU uses the [CLDR root collation
|
||||
order](http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation)
|
||||
@ -918,7 +935,7 @@ given locales. Here is an example of this, which fetches the rules for a
|
||||
particular locale (Danish), then overrides some part (sorting '%' after 'm').
|
||||
The syntax is Java, but C/C++ has similar features.
|
||||
|
||||
```Java
|
||||
```java
|
||||
ULocale myLocale = new ULocale("da");
|
||||
try {
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Collation Examples
|
||||
nav_order: 7
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Simple Collation Sample Customization
|
||||
|
||||
@ -12,7 +27,7 @@ default locale.
|
||||
|
||||
In **C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
@ -20,8 +35,10 @@ In **C:**
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
#define MAXBUFFERSIZE 100
|
||||
#define BIGBUFFERSIZE 5000
|
||||
|
||||
UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
|
||||
{
|
||||
UChar dispName [MAXBUFFERSIZE];
|
||||
@ -85,7 +102,7 @@ UBool collateWithLocaleInC(const char* locale, UErrorCode *status)
|
||||
|
||||
In **C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
#include <stdio.h>
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/utypes.h"
|
||||
@ -152,7 +169,7 @@ UBool collateWithLocaleInCPP(const Locale& locale, UErrorCode& status)
|
||||
|
||||
### Main Function
|
||||
|
||||
```C++
|
||||
```c++
|
||||
extern "C" UBool collateWithLocaleInC(const char* locale, UErrorCode *status);
|
||||
int main()
|
||||
{
|
||||
@ -182,7 +199,7 @@ int main()
|
||||
|
||||
In **Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.CollationElementIterator;
|
||||
import com.ibm.icu.text.CollationKey;
|
||||
@ -254,7 +271,7 @@ However, implementing collation-based search with the Boyer-Moore method
|
||||
while getting correct results is very tricky,
|
||||
and ICU no longer uses this method.
|
||||
|
||||
Please see the (String Search Service)[string-search.md] chapter.
|
||||
Please see the [String Search Service](./string-search) chapter.
|
||||
|
||||
## Using large buffers to manage sort keys
|
||||
|
||||
@ -268,11 +285,10 @@ return the maximum size for a sort key. Once you have done this to your string,
|
||||
you just need to allocate a field of maximum size and copy your sortkeys from
|
||||
the buffer to fields.
|
||||
|
||||
```C++
|
||||
uint32_t
|
||||
|
||||
fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys, uint32_t sourceSize,
|
||||
uint8_t **buffer, uint32_t *maxSize, UErrorCode *status)
|
||||
```c++
|
||||
uint32_t fillBufferWithKeys(UCollator *coll, UChar **source, uint32_t *keys,
|
||||
uint32_t sourceSize, uint8_t **buffer,
|
||||
uint32_t *maxSize, UErrorCode *status)
|
||||
{
|
||||
if(status == NULL || U_FAILURE(*status)) {
|
||||
return 0;
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Collation FAQ
|
||||
nav_order: 5
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Collation FAQ
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Q. Should I turn Full Normalization on all the time?
|
||||
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: Collation
|
||||
nav_order: 9
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
@ -131,7 +137,7 @@ The basic ICU Collation Service is provided by two main categories of APIs:
|
||||
ICU provides an AlphabeticIndex API for generating language-appropriate
|
||||
sorted-section labels like in dictionaries and phone books.
|
||||
|
||||
ICU also provides a higher-level [string search](icu-string-search-service.md)
|
||||
ICU also provides a higher-level [string search](string-search.md)
|
||||
API which can be used, for example, for case-insensitive or accent-insensitive
|
||||
search in an editor or in a web page. ICU string search is based on the
|
||||
low-level [collation element iteration](architecture.md).
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: String Search
|
||||
nav_order: 4
|
||||
parent: Collation
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# String Search Service
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -159,7 +174,7 @@ the user searches for the pattern "´" (\\u00b4) in the string "A´´B",
|
||||
|
||||
**In C:**
|
||||
|
||||
```C
|
||||
```c
|
||||
char *tgtstr = "The quick brown fox jumps over the lazy dog.";
|
||||
char *patstr = "fox";
|
||||
UChar target[64];
|
||||
@ -196,7 +211,7 @@ the user searches for the pattern "´" (\\u00b4) in the string "A´´B",
|
||||
|
||||
**In C++:**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString target("Jackdaws love my big sphinx of quartz.");
|
||||
UnicodeString pattern("sphinx");
|
||||
@ -222,7 +237,7 @@ the user searches for the pattern "´" (\\u00b4) in the string "A´´B",
|
||||
|
||||
**In Java:**
|
||||
|
||||
```Java
|
||||
```java
|
||||
StringCharacterIterator target = new StringCharacterIterator(
|
||||
"Pack my box with five dozen liquor jugs.");
|
||||
String pattern = "box";
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Compression
|
||||
permalink: /conversion/compression
|
||||
nav_order: 4
|
||||
parent: Conversion
|
||||
---
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Compression
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview of SCSU
|
||||
|
||||
|
@ -1,8 +1,7 @@
|
||||
---
|
||||
layout: default
|
||||
title: Converter
|
||||
permalink: /conversion/converter
|
||||
nav_order: 4
|
||||
nav_order: 1
|
||||
parent: Conversion
|
||||
---
|
||||
<!--
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Using Converters
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -108,8 +116,7 @@ character set names. Many of these names are aliases to converters within ICU.
|
||||
|
||||
In order to help identify which names are recognized by certain platforms, ICU
|
||||
provides several converter alias functions. The complete description of these
|
||||
functions can be found in the [ICU API
|
||||
Reference](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ucnv_8h.html) .
|
||||
functions can be found in the [ICU API Reference](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ucnv_8h.html).
|
||||
|
||||
| Function Names | Short Description |
|
||||
| -------------- | ----------------- |
|
||||
@ -128,8 +135,8 @@ Even though IANA specifies a list of aliases, it usually does not specify the
|
||||
mappings or the actual character set for the aliases. Sometimes vendors will map
|
||||
similar glyph variants to different Unicode code points or sometimes they will
|
||||
assign completely different glyphs for the same codepage code point. Because of
|
||||
these ambiguities, you can sometimes get U_AMBIGUOUS_ALIAS_WARNING for the
|
||||
returned UErrorCode when more than one converter uses the requested alias. This
|
||||
these ambiguities, you can sometimes get `U_AMBIGUOUS_ALIAS_WARNING` for the
|
||||
returned `UErrorCode` when more than one converter uses the requested alias. This
|
||||
is only a warning, and the results can still be used. This UErrorCode value is
|
||||
just a reminder that you may not get what you expected. The above functions can
|
||||
help you to determine which converter you actually wanted.
|
||||
@ -159,16 +166,20 @@ There are four ways to create a converter:
|
||||
"ibm-949_P110-2000" (Shift-JIS with '\\' <-> '¥' mapping) or
|
||||
"ibm-949_P11A-2000" (Shift-JIS with '\\' <-> '\\' mapping) for data-file
|
||||
based conversions.
|
||||
```C
|
||||
|
||||
```c
|
||||
UConverter *conv = ucnv_open("shift_jis", &myError);
|
||||
```
|
||||
|
||||
As a convenience, converter names can be passed in as Unicode (for example,
|
||||
if a user passed in the string from a Unicode-based user interface).
|
||||
However, the actual names are restricted to an invariant ASCII/EBCDIC
|
||||
subset.
|
||||
```C
|
||||
|
||||
```c
|
||||
UChar *name = ...; UConverter *conv = ucnv_openU(name, &myError);
|
||||
```
|
||||
|
||||
Converter names are case-insensitive. In addition, beginning with ICU 3.6,
|
||||
leading zeroes are ignored in sequences of digits (if further digits
|
||||
follow), and all non-alphanumeric characters are ignored. Thus the strings
|
||||
@ -176,6 +187,7 @@ There are four ways to create a converter:
|
||||
leading zeroes were not ignored, and only spaces, dashes and underscores
|
||||
were ignored.) The `ucnv_compareNames()` function provides such string
|
||||
comparisons.
|
||||
|
||||
Unlike the names of resources or other types of ICU data, converter names
|
||||
can **not** be qualified with a path that indicates the directory or common
|
||||
data file containing the corresponding converter data. The requested
|
||||
@ -183,7 +195,8 @@ There are four ways to create a converter:
|
||||
separate file located in the ICU data directory. However, you can always
|
||||
create a package of converters with pkgdata and open a converter from the
|
||||
package with `ucnv_openPackage()`
|
||||
```C
|
||||
|
||||
```c
|
||||
UConverter *conv = ucnv_openPackage("./myPackage.dat", "customConverter", &myError);
|
||||
```
|
||||
|
||||
@ -194,7 +207,8 @@ There are four ways to create a converter:
|
||||
API for opening a codepage by number must be given a vendor along with the
|
||||
number. Currently, only IBM (`UCNV_IBM`) is supported. For example, the US
|
||||
EBCDIC codepage (IBM #37) can be opened with the following code:
|
||||
```C
|
||||
|
||||
```c
|
||||
ucnv_openCCSID(37, UCNV_IBM, &myErr);
|
||||
```
|
||||
|
||||
@ -213,7 +227,8 @@ There are four ways to create a converter:
|
||||
is not sufficient information, once a converter is opened, it can be queried
|
||||
for its type, min and max char size, etc. This information is not available
|
||||
without actually opening the converter (a fairly lightweight process.)
|
||||
```C
|
||||
|
||||
```c
|
||||
/* Returns count of the number of available names */
|
||||
int count = ucnv_countAvailable();
|
||||
/* get the canonical name of the 36th available converter */
|
||||
@ -233,59 +248,60 @@ There are four ways to create a converter:
|
||||
|
||||
4. **By using the default converter**: The default converter can be opened by
|
||||
passing a NULL as the name of the converter.
|
||||
```C
|
||||
|
||||
```c
|
||||
ucnv_open(NULL, &myErr);
|
||||
```
|
||||
|
||||
> :point_right: **Note**: ICU chooses this converter based on the best information available to it.
|
||||
The purpose of this converter is to interface with the OS using a codepage (i.e. `char *`).
|
||||
Do not use it as a way of determining the best overall converter to use.
|
||||
Usually any Unicode encoding form is the best way to store and send text data,
|
||||
so that important data does not get lost in the conversion.\
|
||||
Also, if the OS supports Unicode-based API's (such as Win32),
|
||||
it is better to use only those Unicode API's.
|
||||
As an example, the new Windows 2000 locales (such as Hindi) do not
|
||||
define the default codepage to something that supports Hindi.
|
||||
The default converter is used in expressions such as: `UnicodeString text("abc");`
|
||||
to convert 'abc', and in the u_uastrcpy() C functions.\
|
||||
Code operating at the [OS level](../design.md) MAY choose to
|
||||
change the default converter with `ucnv_setDefaultName()`.
|
||||
However, be aware that this change has inconsistent results if it is done after
|
||||
ICU components are initialized.
|
||||
> The purpose of this converter is to interface with the OS using a codepage (i.e. `char *`).
|
||||
> Do not use it as a way of determining the best overall converter to use.
|
||||
> Usually any Unicode encoding form is the best way to store and send text data,
|
||||
> so that important data does not get lost in the conversion.
|
||||
> Also, if the OS supports Unicode-based API's (such as Win32),
|
||||
> it is better to use only those Unicode API's.
|
||||
> As an example, the new Windows 2000 locales (such as Hindi) do not
|
||||
> define the default codepage to something that supports Hindi.
|
||||
> The default converter is used in expressions such as: `UnicodeString text("abc");`
|
||||
> to convert 'abc', and in the `u_uastrcpy()` C functions.
|
||||
> Code operating at the [OS level](../design.md) MAY choose to
|
||||
> change the default converter with `ucnv_setDefaultName()`.
|
||||
> However, be aware that this change has inconsistent results if it is done after
|
||||
> ICU components are initialized.
|
||||
|
||||
### Closing a Converter
|
||||
|
||||
Closing a converter frees memory occupied by that instance of the converter.
|
||||
However it does not release the larger shared data tables the converter might
|
||||
use. OS-level code may call `ucnv_flushCache()` to explicitly free memory occupied
|
||||
by [unused tables](../design.md) .
|
||||
by [unused tables](../design.md).
|
||||
|
||||
```C
|
||||
```c
|
||||
ucnv_close(conv);
|
||||
```
|
||||
|
||||
### Converter Life Cycle
|
||||
|
||||
Note that a Converter is created with a certain type (for instance, ISO-8859-3)
|
||||
which does not change over the life of that [object](../design.md) . Converters
|
||||
which does not change over the life of that [object](../design.md). Converters
|
||||
should be allocated one per thread. They are cheap to create, as the shared data
|
||||
doesn't need to be reallocated.
|
||||
|
||||
This is the typical life cycle of a converter, as shown step-by-step:
|
||||
|
||||
1. First, open up the converter with a specified name (or alias name).
|
||||
```C
|
||||
```c
|
||||
UConverter *conv = ucnv_open("shift_jis", &status);
|
||||
```
|
||||
|
||||
2. Target here is the `char s[]` to write into, and targetSize is how big the
|
||||
target buffer is. Source is the UChars that are being converted.
|
||||
```C
|
||||
```c
|
||||
int32_t len = ucnv_fromUChars(conv, target, targetSize, source, u_strlen(source), &status);
|
||||
```
|
||||
|
||||
3. Clean up the converter.
|
||||
```C
|
||||
```c
|
||||
ucnv_close(conv);
|
||||
```
|
||||
|
||||
@ -301,8 +317,8 @@ the same converter for converting data from ISO-8859-3 back into Unicode.
|
||||
If it is necessary to convert a large quantity of data in smaller buffers, use
|
||||
the same converter to convert each buffer. This will make sure any state is
|
||||
preserved from one chunk to the next. Doing this conversion is known as
|
||||
streaming or buffering, and is mentioned Buffered Conversion section (§) later
|
||||
in this chapter.
|
||||
streaming or buffering, and is mentioned in the [Buffered or Streamed](#3-buffered-or-streamed)
|
||||
section (§) later in this chapter.
|
||||
|
||||
### Cloning a Converter
|
||||
|
||||
@ -315,7 +331,7 @@ produces incorrect results. Also note that the caller owns the cloned object and
|
||||
has to call `ucnv_close()` to dispose of the object. Calling `ucnv_reset()` before
|
||||
cloning will reset the converter to its original state.
|
||||
|
||||
```C
|
||||
```c
|
||||
UConverter* newCnv = ucnv_safeClone(oldCnv, 0, &bufferSize, &err)
|
||||
```
|
||||
|
||||
@ -333,10 +349,10 @@ UConverter* newCnv = ucnv_safeClone(oldCnv, 0, &bufferSize, &err)
|
||||
conversion.
|
||||
|
||||
3. In conversions to Unicode from Multi-byte encodings or conversions from
|
||||
Unicode involving surrogates, if a) only a partial byte sequence is
|
||||
retrieved from the source buffer, b) the "flush" parameter is set to "TRUE"
|
||||
and c) the end of source is reached, then the callback is called with
|
||||
U_TRUNCATED_CHAR_FOUND.
|
||||
Unicode involving surrogates, if (a) only a partial byte sequence is
|
||||
retrieved from the source buffer, (b) the "flush" parameter is set to "TRUE"
|
||||
and (c) the end of source is reached, then the callback is called with
|
||||
`U_TRUNCATED_CHAR_FOUND`.
|
||||
|
||||
### Reset
|
||||
|
||||
@ -410,20 +426,20 @@ In conversion to Unicode, errors are normally due to ill-formed byte sequences:
|
||||
Unused byte values, or lead bytes not followed by trail bytes according to the
|
||||
encoding scheme. Well-formed but unmappable sequences are unusual but possible.
|
||||
|
||||
The ICU default behavior is to emit an U+FFFD REPLACEMENT CHARACTER per
|
||||
The ICU default behavior is to emit an `U+FFFD REPLACEMENT CHARACTER` per
|
||||
offending sequence.
|
||||
|
||||
If the conversion table .ucm file contains a <subchar1> entry (such as in the
|
||||
If the conversion table .ucm file contains a `<subchar1>` entry (such as in the
|
||||
ibm-943 table), a U+001A C0 control ("SUB") is emitted for single-byte
|
||||
illegal/unmappable input rather than U+FFFD REPLACEMENT CHARACTER. For details
|
||||
illegal/unmappable input rather than `U+FFFD REPLACEMENT CHARACTER`. For details
|
||||
on this behavior look for "001A" in the [Conversion Data](data.md) chapter.
|
||||
|
||||
* This behavior originates from mainframes with dedicated
|
||||
single-byte-to-single-byte and double-to-double conversions.
|
||||
* Emitting U+001A for single-byte errors can be avoided by (a) removing the
|
||||
<subchar1> mapping or (b) using a similar conversion table that does not
|
||||
have this mapping (e.g., windows-932 instead of ibm-943) or (c) writing a
|
||||
custom callback function.
|
||||
* This behavior originates from mainframes with dedicated single-byte-to-single-byte
|
||||
and double-to-double conversions.
|
||||
* Emitting U+001A for single-byte errors can be avoided by (a) removing the
|
||||
`<subchar1>` mapping or (b) using a similar conversion table that does not
|
||||
have this mapping (e.g., windows-932 instead of ibm-943) or (c) writing a
|
||||
custom callback function.
|
||||
|
||||
### Error Codes
|
||||
|
||||
@ -501,7 +517,7 @@ new ones. The "callbacks" are either From Unicode (to codepage), or To Unicode
|
||||
4. UCNV_FROM_U_CALLBACK_STOP, UCNV_TO_U_CALLBACK_STOP: Stop at the error.
|
||||
Return the error to the caller. (When using the 'BUFFER' mode of conversion,
|
||||
the source and target pointers returned can be examined to determine where
|
||||
the error occurred. ucnv_getInvalidUChars() and ucnv_getInvalidChars()
|
||||
the error occurred. `ucnv_getInvalidUChars()` and `ucnv_getInvalidChars()`
|
||||
return the actual text which failed).
|
||||
|
||||
5. UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_TO_U_CALLBACK_ESCAPE: This callback is
|
||||
@ -509,16 +525,19 @@ new ones. The "callbacks" are either From Unicode (to codepage), or To Unicode
|
||||
strings such as '%U094D' with the Unicode value, and missing Unicode chars
|
||||
are replaced with text of the form '%X0A' where the codepage had the
|
||||
unconvertible byte hex 0A.
|
||||
|
||||
When a callback is set, a "context" pointer is also provided. How this
|
||||
pointer is created depends on the specific callback. There is usually a
|
||||
createContext() function for that specific callback, where the caller can
|
||||
`createContext()` function for that specific callback, where the caller can
|
||||
set certain options for the callback. Consult the documentation for the
|
||||
specific callback you are using. For ICU's canned callbacks, this pointer
|
||||
may be set to NULL. The functions for setting a different callback also
|
||||
return the old callback, and the old context pointer. These may be stored so
|
||||
that the old callback is re-installed when an operation is finished.
|
||||
|
||||
Additionally the following options can be passed as the context parameter to
|
||||
UCNV_FROM_U_CALLBACK_ESCAPE callback function to produce different outputs.
|
||||
|
||||
| UCNV_ESCAPE_ICU | %U12345 |
|
||||
| ------------------- | ------- |
|
||||
| UCNV_ESCAPE_JAVA | \\u1234 |
|
||||
@ -528,16 +547,23 @@ new ones. The "callbacks" are either From Unicode (to codepage), or To Unicode
|
||||
|
||||
Here are some examples of how to use callbacks.
|
||||
|
||||
```C
|
||||
UConverter *u;
|
||||
void *oldContext, *newContext;
|
||||
```c
|
||||
UConverter *u;
|
||||
void *oldContext, *newContext;
|
||||
UConverterFromUCallback oldAction, newAction;
|
||||
u = ucnv_open("shift_jis", &myError);
|
||||
|
||||
... /* do some conversion with u from unicode.. */
|
||||
ucnv_setFromUCallBack(u, MY_FROMU_CALLBACK, newContext, &oldAction, &oldContext, &myError);
|
||||
|
||||
ucnv_setFromUCallBack(
|
||||
u, MY_FROMU_CALLBACK, newContext, &oldAction, &oldContext, &myError);
|
||||
|
||||
... /* do some other conversion from unicode */
|
||||
|
||||
/* Now, set the callback back */
|
||||
ucnv_setFromUCallBack(u, oldAction, oldContext, &newAction, &newContext, &myError);
|
||||
ucnv_setFromUCallBack(
|
||||
u, oldAction, oldContext, &newAction, &newContext, &myError);
|
||||
|
||||
```
|
||||
|
||||
### Custom Callbacks
|
||||
@ -548,23 +574,23 @@ callbacks as a starting point, and address any further questions to the mailing
|
||||
list.
|
||||
|
||||
Basically, a callback, unlike other ICU functions which expect to be called with
|
||||
U_ZERO_ERROR as the input, is called in an exceptional error condition. The
|
||||
`U_ZERO_ERROR` as the input, is called in an exceptional error condition. The
|
||||
callback is a kind of 'last ditch effort' to rectify the error which occurred,
|
||||
before it is returned back to the caller. This is why the implementation of STOP
|
||||
is very simple:
|
||||
|
||||
```C
|
||||
```c
|
||||
void UCNV_FROM_U_CALLBACK_STOP(...) { }
|
||||
```
|
||||
|
||||
The error code such as U_INVALID_CHAR_FOUND is returned to the user. If the
|
||||
The error code such as `U_INVALID_CHAR_FOUND` is returned to the user. If the
|
||||
callback determines that no error should be returned to the user, then the
|
||||
callback must set the error code to U_ZERO_ERROR. Note that this is a departure
|
||||
callback must set the error code to `U_ZERO_ERROR`. Note that this is a departure
|
||||
from most ICU functions, which are supposed to check the error code and return
|
||||
immediately if it is set.
|
||||
|
||||
> :point_right: **Note**: See the functions `ucnv_cb_write...()` for
|
||||
functions which a callback may use to perform its task.
|
||||
> functions which a callback may use to perform its task.
|
||||
|
||||
#### Ignore Default_Ignorable_Code_Point
|
||||
|
||||
@ -581,12 +607,12 @@ character preceding a Variation Selector.
|
||||
Unicode has a character property to identify such characters, as well as
|
||||
currently-unassigned code points that are intended to be used for similar
|
||||
purposes: Default_Ignorable_Code_Point, or "DI" for short:
|
||||
<http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:DI:]>
|
||||
http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:DI:]
|
||||
|
||||
Most charsets do not have most or any of these characters.
|
||||
|
||||
**ICU 54 and above by default skip default-ignorable code points if they are
|
||||
unmappable**. (Ticket #[10551](http://bugs.icu-project.org/trac/ticket/10551))
|
||||
unmappable**. (Ticket #[10551](https://unicode-org.atlassian.net/browse/ICU-10551))
|
||||
|
||||
**Older versions of ICU** replaced unmappable default-ignorable code points like
|
||||
any other unmappable code points, by a question mark or whatever substitution
|
||||
@ -598,21 +624,21 @@ are removed from the charset output rather than replaced by a visible character.
|
||||
|
||||
This is a code snippet for use in a custom from-Unicode callback:
|
||||
|
||||
```C
|
||||
```c
|
||||
#include "unicode/uchar.h"
|
||||
...
|
||||
// ...
|
||||
(from-Unicode callback)
|
||||
switch(reason) {
|
||||
case UCNV_UNASSIGNED:
|
||||
if(u_hasBinaryProperty(codePoint, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
|
||||
// Ignore/drop default ignorable code points that cannot be converted,
|
||||
// rather than treating them like errors/writing a substitution character etc.
|
||||
// For example, U+200B Zero Width Space,
|
||||
// U+200E Left-To-Right Mark, U+FE0F Variation Selector 16.
|
||||
*pErrorCode = U_ZERO_ERROR;
|
||||
return;
|
||||
} else {
|
||||
...
|
||||
switch(reason) {
|
||||
case UCNV_UNASSIGNED:
|
||||
if(u_hasBinaryProperty(codePoint, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) {
|
||||
// Ignore/drop default ignorable code points that cannot be converted,
|
||||
// rather than treating them like errors/writing a substitution character etc.
|
||||
// For example, U+200B Zero Width Space,
|
||||
// U+200E Left-To-Right Mark, U+FE0F Variation Selector 16.
|
||||
*pErrorCode = U_ZERO_ERROR;
|
||||
return;
|
||||
} else {
|
||||
// ...
|
||||
```
|
||||
|
||||
## Modes of Conversion
|
||||
@ -638,11 +664,13 @@ not require the instantiation of a converter.
|
||||
|
||||
Data must be contained entirely within a single string or buffer.
|
||||
|
||||
```C
|
||||
```c
|
||||
conv = ucnv_open("shift_jis", &status);
|
||||
|
||||
/* Convert from Unicode to Shift JIS */
|
||||
len = ucnv_fromUChars(conv, target, targetLen, source, sourceLen, &status);
|
||||
ucnv_close(conv);
|
||||
|
||||
conv = ucnv_open("iso-8859-3", &status);
|
||||
/* Convert from ISO-8859-3 to Unicode */
|
||||
len = ucnv_toUChars(conv, target, targetSize, source, sourceLen, &status);
|
||||
@ -657,7 +685,7 @@ most efficient way to scan for a certain character, or other processing of a
|
||||
single character at a time, because converters are stateful. This works even for
|
||||
multibyte charsets, and for stateful ones such as iso-2022-jp.
|
||||
|
||||
```C
|
||||
```c
|
||||
conv = ucnv_open("Big-5", &status);
|
||||
UChar32 target;
|
||||
while(source < sourceLimit) {
|
||||
@ -685,21 +713,30 @@ The basic loop that is used with the ICU buffer conversion routines is the same
|
||||
in the to and from Unicode directions. In the following pseudocode, either
|
||||
'source' (for fromUnicode) or 'target' (for toUnicode) are UTF-16 UChars.
|
||||
|
||||
```C
|
||||
```c
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
|
||||
while (... /*input data available*/ ) {
|
||||
... /* read input data into buffer */
|
||||
|
||||
source = ... /* beginning of read data */;
|
||||
sourceLimit = source + readLength; // end + 1
|
||||
|
||||
UBool flush = (no further input data available) // (i.e. feof())
|
||||
|
||||
/* loop until all source has been processed */
|
||||
do {
|
||||
/* set up target pointers */
|
||||
target = ... /* beginning of output buffer */;
|
||||
targetLimit = target + sizeOfOutput;
|
||||
|
||||
err = U_ZERO_ERROR; /* so that the to/from does not fail */
|
||||
ucnv_to/fromUnicode(converter, &target, targetLimit, &source, sourceLimit, NULL, flush, &err);
|
||||
... /* write (target-beginningOfOutputBuffer) items starting at beginning of output buffer */
|
||||
|
||||
ucnv_to/fromUnicode(converter, &target, targetLimit,
|
||||
&source, sourceLimit, NULL, flush, &err);
|
||||
|
||||
... /* write (target-beginningOfOutputBuffer) items
|
||||
starting at beginning of output buffer */
|
||||
} while (err == U_BUFFER_OVERFLOW_ERROR);
|
||||
if(U_FAILURE(err)) {
|
||||
... /* process error */
|
||||
@ -715,7 +752,7 @@ if(U_FAILURE(error)) {
|
||||
The above code optimizes for processing entire chunks of input data. An
|
||||
efficient size for the output buffer can be calculated as follows (in bytes):
|
||||
|
||||
```C
|
||||
```c
|
||||
ucnv_getMinCharSize() * inputBufferSize * sizeof(UChar)
|
||||
ucnv_getMaxCharSize() * inputBufferSize
|
||||
```
|
||||
@ -729,7 +766,7 @@ cases where the size of the output buffer is fixed. For instance, in network
|
||||
applications it is sometimes desirable to fill every output packet completely
|
||||
(not including the last packet in the sequence). The above loop does not ensure
|
||||
that every output buffer is completely full. For example, if a 4 UChar input
|
||||
buffer was used, and a 3 byte output buffer with fromUnicode(), the loop would
|
||||
buffer was used, and a 3 byte output buffer with `fromUnicode()`, the loop would
|
||||
typically write 3 bytes, then 1, then 3, and so on. If, instead of efficient use
|
||||
of the input data, the goal is filling output buffers, a slightly different loop
|
||||
can be used.
|
||||
@ -737,7 +774,7 @@ can be used.
|
||||
In such a scenario, the inner write does not occur unless a buffer overflow
|
||||
occurs OR 'flush' is true. So, the 'write' and resetting of the target and
|
||||
targetLimit pointers would only happen
|
||||
`if(err == U_BUFFER_OVERFLOW_ERROR || flush == TRUE)`
|
||||
`if (err == U_BUFFER_OVERFLOW_ERROR || flush == TRUE)`
|
||||
|
||||
The flush parameter on each conversion call should be set to FALSE, until the
|
||||
conversion call is called for the last time for the buffer. This is because the
|
||||
@ -753,33 +790,40 @@ buffer required. (For a more general discussion, see the Preflighting section
|
||||
|
||||
This is accomplished by calling the `ucnv_fromUChars` and `ucnv_toUChars` functions.
|
||||
|
||||
```C
|
||||
```c
|
||||
UChar *uchar2;
|
||||
char input_char_buffer[] = "This is some text";
|
||||
targetsize = ucnv_toUChars(myConverter, NULL, targetcapacity, input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
|
||||
targetsize = ucnv_toUChars(myConverter, NULL, targetcapacity,
|
||||
input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
|
||||
if(err==U_BUFFER_OVERFLOW_ERROR) {
|
||||
err=U_ZERO_ERROR;
|
||||
uchar2=(UChar*)malloc((targetsize) * sizeof(UChar));
|
||||
targetsize = ucnv_toUChars(myConverter, uchar2, targetsize,
|
||||
input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
if(U_FAILURE(err)) {
|
||||
printf("ucnv_toUChars() FAILED %s\\n", myErrorName(err));
|
||||
} else {
|
||||
printf("ucnv_toUChars() o.k.\\n");
|
||||
printf("ucnv_toUChars() FAILED %s\n", myErrorName(err));
|
||||
}
|
||||
else {
|
||||
printf("ucnv_toUChars() o.k.\n");
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
> :point_right: **Note**: This is inefficient since the conversion is performed twice, once for finding
|
||||
the size of target and once for writing to the target.
|
||||
> :point_right: **Note**: *This is inefficient since the conversion is performed
|
||||
> **twice**, once for finding the size of target and once for writing to the target*.
|
||||
|
||||
### 5. Convenience
|
||||
|
||||
ICU provides some convenience functions for conversions:
|
||||
|
||||
```C
|
||||
ucnv_toUChars(myConverter, target_uchars, targetsize, input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
ucnv_fromUChars(cnv, cTarget, (cTargetLimit-cTarget), uSource, (uSourceLimit-uSource), &errorCode);
|
||||
```c
|
||||
ucnv_toUChars(myConverter, target_uchars, targetsize,
|
||||
input_char_buffer, sizeof(input_char_buffer), &err);
|
||||
ucnv_fromUChars(cnv, cTarget, (cTargetLimit-cTarget),
|
||||
uSource, (uSourceLimit-uSource), &errorCode);
|
||||
|
||||
char target[100];
|
||||
UnicodeString str("ABCDEF", "iso-8859-1");
|
||||
int32_t targetsize = str.extract(0, str.length(), target, sizeof(target), "SJIS");
|
||||
@ -788,6 +832,4 @@ target[targetsize] = 0; /* NULL termination */
|
||||
|
||||
## Conversion Examples
|
||||
|
||||
See the [ICU Conversion
|
||||
Examples](https://github.com/unicode-org/icu/blob/master/icu4c/source/samples/ucnv/convsamp.cpp)
|
||||
for more information.
|
||||
See the [ICU Conversion Examples](https://github.com/unicode-org/icu/blob/master/icu4c/source/samples/ucnv/convsamp.cpp) for more information.
|
||||
|
@ -1,8 +1,7 @@
|
||||
---
|
||||
layout: default
|
||||
title: Conversion Data
|
||||
permalink: /conversion/data
|
||||
nav_order: 4
|
||||
nav_order: 2
|
||||
parent: Conversion
|
||||
---
|
||||
<!--
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Conversion Data
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Introduction
|
||||
|
||||
@ -141,10 +149,10 @@ following parameters:
|
||||
5. Simple stateful encodings are also handled using only Shift-In and Shift-Out
|
||||
(SI/SO) codes and one single-byte and one double-byte state.
|
||||
|
||||
> :point_right: **Note**: *In the context of conversion tables, "unassigned" code points or codepage byte
|
||||
sequences are valid but do not have a **mapping**. This is different from
|
||||
"unassigned" code points in a character set like Unicode or Shift-JIS which are
|
||||
codes that do not have assigned **characters**.*
|
||||
> :point_right: **Note**: *In the context of conversion tables, "unassigned" code
|
||||
> points or codepage byte sequences are valid but do not have a **mapping**. This
|
||||
> is different from "unassigned" code points in a character set like Unicode or
|
||||
> Shift-JIS which are codes that do not have assigned **characters**.*
|
||||
|
||||
Prior to version 1.8, ICU used more specific, more limited, converter
|
||||
implementations for Single Byte Character Set (SBCS), Double Byte Character Set
|
||||
@ -166,23 +174,23 @@ between ICU versions even for the same .ucm files. The .ucm file format may be
|
||||
extended to include more features.
|
||||
|
||||
The following sections concentrate on the .ucm file format. The .cnv file format
|
||||
is described in the source code in the icu/source/common/ucnvmbcs.c directory
|
||||
is described in the source code in the `icu/source/common/ucnvmbcs.c` file
|
||||
and is updated using the MBCS converter implementation.
|
||||
|
||||
These conversion tables can have more than one name. ICU allows multiple names
|
||||
("aliases") for the same encoding. It matches a requested encoding name against
|
||||
a list of names in icu/source/data/mappings/convrtrs.txt and when it finds a
|
||||
a list of names in `icu/source/data/mappings/convrtrs.txt` and when it finds a
|
||||
match, ICU opens a converter with the name in the leftmost position in the
|
||||
matching line. The name matching is not case-sensitive and ICU ignores spaces,
|
||||
dashes, and underscores. At build time, the gencnval tool located in the
|
||||
icu/source/tools/gencnval directory, generates a binary form of the convrtrs.txt
|
||||
`icu/source/tools/gencnval` directory, generates a binary form of the convrtrs.txt
|
||||
file as a data file for runtime for the cnvalias.icu file ("Converter Aliases
|
||||
data file").
|
||||
|
||||
### .ucm File Format
|
||||
|
||||
.ucm files are line-oriented text files. Empty lines and comments starting with
|
||||
'#' are ignored.
|
||||
'`#`' are ignored.
|
||||
|
||||
A .ucm file contains two sections:
|
||||
|
||||
@ -193,21 +201,21 @@ A .ucm file contains two sections:
|
||||
For example:
|
||||
|
||||
```
|
||||
<code_set_name> "IBM-943"
|
||||
<char_name_mask> "AXXXX"
|
||||
<mb_cur_min> 1
|
||||
<mb_cur_max> 2
|
||||
<uconv_class> "MBCS"
|
||||
<subchar> \xFC\xFC
|
||||
<subchar1> \x7F
|
||||
<icu:state> 0-7f, 81-9f:1, a0-df, e0-fc:1
|
||||
<icu:state> 40-7e, 80-fc
|
||||
<code_set_name> "IBM-943"
|
||||
<char_name_mask> "AXXXX"
|
||||
<mb_cur_min> 1
|
||||
<mb_cur_max> 2
|
||||
<uconv_class> "MBCS"
|
||||
<subchar> \xFC\xFC
|
||||
<subchar1> \x7F
|
||||
<icu:state> 0-7f, 81-9f:1, a0-df, e0-fc:1
|
||||
<icu:state> 40-7e, 80-fc
|
||||
#
|
||||
CHARMAP
|
||||
#
|
||||
#
|
||||
#ISO 10646 IBM-943
|
||||
#_________ _________
|
||||
#ISO 10646 IBM-943
|
||||
#_________ _________
|
||||
<U0000> \x00 |0
|
||||
<U0001> \x01 |0
|
||||
<U0002> \x02 |0
|
||||
@ -217,6 +225,7 @@ CHARMAP
|
||||
<UFFE5> \x81\x8F |0
|
||||
<UFFFD> \xFC\xFC |2
|
||||
END CHARMAP
|
||||
|
||||
```
|
||||
|
||||
The header fields are:
|
||||
@ -283,21 +292,21 @@ following conditions outline when each are used:
|
||||
|
||||
In the CHARMAP section of a .ucm file, each line contains a Unicode code point
|
||||
(like <U(*1-6 hexadecimal digits for the code point*)> ), a codepage character
|
||||
byte sequence (each byte like \\x*hh* (2 hexadecimal digits} ), and an optional
|
||||
byte sequence (each byte like `\xhh` (2 hexadecimal digits) ), and an optional
|
||||
"precision" or "fallback" indicator.
|
||||
|
||||
The precision indicator either must be present in all mappings or in none of
|
||||
them. The indicator is a pipe symbol ‘|’ followed by a 0, 1, 2, 3, or 4 that has
|
||||
them. The indicator is a pipe symbol `|` followed by a 0, 1, 2, 3, or 4 that has
|
||||
the following meaning:
|
||||
|
||||
* |0 - A "normal", roundtrip mapping from a Unicode code point and back.
|
||||
* |1 - A "fallback" mapping only from Unicode to the codepage, but not back.
|
||||
* |2 – A subchar1 mapping. The code point is unmappable, and if a substitution
|
||||
* `|0` - A "normal", roundtrip mapping from a Unicode code point and back.
|
||||
* `|1` - A "fallback" mapping only from Unicode to the codepage, but not back.
|
||||
* `|2` - A subchar1 mapping. The code point is unmappable, and if a substitution
|
||||
is performed, then the subchar1 should be used rather than the subchar.
|
||||
Otherwise, such mappings are ignored.
|
||||
* |3 - A "reverse fallback" mapping only from the codepage to Unicode, but not
|
||||
* `|3` - A "reverse fallback" mapping only from the codepage to Unicode, but not
|
||||
back to the codepage.
|
||||
* |4 - A "good one-way" mapping only from Unicode to the codepage, but not
|
||||
* `|4` - A "good one-way" mapping only from Unicode to the codepage, but not
|
||||
back.
|
||||
|
||||
Fallback mappings from Unicode typically do not map codes for the same
|
||||
@ -306,7 +315,7 @@ exists in Unicode but not in the codepage. To replace it, ICU maps a codepage
|
||||
code to a similar-looking code for human-readable output. This mapping feature
|
||||
is not useful for text data transmission especially in markup languages where a
|
||||
Unicode code point can be escaped with its code point value. The ICU application
|
||||
programming interface (API) ucnv_setFallback() controls this fallback behavior.
|
||||
programming interface (API) `ucnv_setFallback()` controls this fallback behavior.
|
||||
|
||||
"Reverse fallbacks" are technically similar, but the same Unicode character can
|
||||
be encoded twice in the codepage. ICU always uses reverse fallbacks at runtime.
|
||||
@ -329,11 +338,11 @@ PUA and reverse fallbacks are assumed to be for "the same character", just an
|
||||
older code for it.
|
||||
|
||||
Something similar happens with from-Unicode Variation Selector sequences. It is
|
||||
possible to round-trip (|0) either the unadorned character or the sequence with
|
||||
a variation selector, and add a "good one-way" mapping (|4) from the other
|
||||
possible to round-trip (`|0`) either the unadorned character or the sequence with
|
||||
a variation selector, and add a "good one-way" mapping (`|4`) from the other
|
||||
version. That "good one-way" mapping does not lose much information, and it is
|
||||
used even if the "use fallback" API flag is false. Alternatively, both mappings
|
||||
could be fallbacks (|1) that should be controlled by the "use fallback"
|
||||
could be fallbacks (`|1`) that should be controlled by the "use fallback"
|
||||
attribute.
|
||||
|
||||
### State table syntax in .ucm files
|
||||
@ -346,9 +355,9 @@ not easily) be computed from the pure mapping data. Instead, the .ucm files for
|
||||
MBCS encodings have additional entries that are specific to the ICU makeconv
|
||||
tool. The state tables for SBCS, DBCS, and EBCDIC_STATEFUL are implied, but they
|
||||
can be overridden (see the examples below). These state tables are specified in
|
||||
the header section of the .ucm file that contains the <icu:state> element. Each
|
||||
the header section of the .ucm file that contains the `<icu:state>` element. Each
|
||||
line defines one aspect of the state machine. The state machine uses a table of
|
||||
as many rows as there are states (= as many as there are <icu:state> lines).
|
||||
as many rows as there are states (= as many as there are `<icu:state>` lines).
|
||||
Each row has 256 entries; one for each possible byte value.
|
||||
|
||||
The state table lines in the .ucm header conform to the following Extended
|
||||
@ -360,24 +369,24 @@ firstentry="initial" | "surrogates"
|
||||
(initial state (default for state 0), output is all surrogate pairs)
|
||||
```
|
||||
|
||||
Each state table row description (that follows the <icu:state>) begins with an
|
||||
Each state table row description (that follows the `<icu:state>`) begins with an
|
||||
optional initial or surrogates keyword and is followed by one or more column
|
||||
entries. For the purpose of codepage state tables, the states=rows in the table
|
||||
are numbered beginning at 0 for the first line in the .ucm file header. The
|
||||
numbers are assigned implicitly by the makeconv tool in order of the <icu:state>
|
||||
numbers are assigned implicitly by the makeconv tool in order of the `<icu:state>`
|
||||
lines.
|
||||
|
||||
A row may be empty (nothing following the <icu:state>) — that is equivalent to
|
||||
A row may be empty (nothing following the `<icu:state>`) - that is equivalent to
|
||||
"all illegal" or 0-ff.i and is useful for trail byte states for all-illegal byte
|
||||
sequences.
|
||||
|
||||
```
|
||||
entry=range ':' nextstate] ['.' [action]]
|
||||
range = number ['-' number]
|
||||
entry=range [':' nextstate] ['.' [action]]
|
||||
range = number ['-' number]
|
||||
nextstate = number (0..7f)
|
||||
action = 'u' | 's' | 'p' | 'i'
|
||||
(unassigned, state change only, surrogate pair, illegal)
|
||||
number = (1- or 2-digit hexadecimal number)
|
||||
action = 'u' | 's' | 'p' | 'i'
|
||||
(unassigned, state change only, surrogate pair, illegal)
|
||||
number = (1- or 2-digit hexadecimal number)
|
||||
```
|
||||
|
||||
Each column entry contains at least one hexadecimal byte value or value range
|
||||
@ -385,7 +394,7 @@ and is separated by a comma. The column entry specifies how to interpret an
|
||||
input byte in the row's state. If neither a next state nor an action is
|
||||
explicitly specified (only the byte range is given) then the byte value
|
||||
terminates the byte sequence, results in a valid mapping to a Unicode BMP
|
||||
character, and resets the state number to 0. The first line with <icu:state> is
|
||||
character, and resets the state number to 0. The first line with `<icu:state>` is
|
||||
called state 0.
|
||||
|
||||
The next state can be explicitly specified with a separating colon ( : )
|
||||
@ -464,7 +473,7 @@ and test3.ucm contains
|
||||
<U101234>+<U50005>+<U60006> \x07+\x00+\x01\x02\x0f+\x09 |0
|
||||
|
||||
For more examples see the ICU conversion data and the
|
||||
icu/source/test/testdata/test*.ucm test data files.
|
||||
`icu/source/test/testdata/test*.ucm` test data files.
|
||||
|
||||
ICU 2.8 supports up to 19 UChars on the Unicode side of a mapping and up to 31
|
||||
bytes on the codepage side.
|
||||
@ -642,15 +651,17 @@ structure above. All double-byte sequences return to state 1 and SI switches
|
||||
back to state 0. SI and SO are also allowed in their own states with no effect.
|
||||
|
||||
> :point_right: **Note**: *If a DBCS or EBCDIC_STATEFUL codepage maps supplementary (non-BMP) Unicode
|
||||
characters, then a modified state table needs to be specified in the .ucm file.
|
||||
The state table needs to use the surrogates designation for a table row or .p
|
||||
for some entries.<br/> The reuse of a final or intermediate state (shown for EUC-JP) is valid for as
|
||||
long as there is no circle in the state chain. The mappings will be unique
|
||||
because of the different path to the shared state (sharing a state saves some
|
||||
memory; each state table row occupies 1kB in the .cnv file). This table also
|
||||
shows the redefinition of byte value ranges within one state row (State number
|
||||
3)as shorthand. State 3 defines bytes a1-fe to go to state 1, but the following
|
||||
entries redefine and override certain bytes to go to state 4.*
|
||||
> characters, then a modified state table needs to be specified in the .ucm file.
|
||||
> The state table needs to use the surrogates designation for a table row or .p
|
||||
> for some entries.*
|
||||
>
|
||||
> *The reuse of a final or intermediate state (shown for EUC-JP) is valid for as
|
||||
> long as there is no cycle in the state chain. The mappings will be unique
|
||||
> because of the different path to the shared state (sharing a state saves some
|
||||
> memory; each state table row occupies 1kB in the .cnv file). This table also
|
||||
> shows the redefinition of byte value ranges within one state row (State number
|
||||
> 3) as shorthand. State 3 defines bytes a1-fe to go to state 1, but the following
|
||||
> entries redefine and override certain bytes to go to state 4.*
|
||||
|
||||
An initial state never needs a surrogates designation or .p because Unicode
|
||||
mapping results in initial states that are stored directly in the state table,
|
||||
|
@ -1,8 +1,7 @@
|
||||
---
|
||||
layout: default
|
||||
title: CharSet Detection
|
||||
permalink: /conversion/detection
|
||||
nav_order: 4
|
||||
title: Charset Detection
|
||||
nav_order: 3
|
||||
parent: Conversion
|
||||
---
|
||||
<!--
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Character Set Detection
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -41,15 +49,16 @@ error rate, particularly when working with short samples of text.
|
||||
|
||||
## CharsetMatch
|
||||
|
||||
The CharsetMatch class holds the result of comparing the input data to a
|
||||
The `CharsetMatch` class holds the result of comparing the input data to a
|
||||
particular encoding. You can use an instance of this class to get the name of
|
||||
the character set, the language, and how good the match is. You can also use
|
||||
this class to decode the input data.
|
||||
|
||||
To find out how good the match is, you use the getConfidence() method to get a
|
||||
To find out how good the match is, you use the `getConfidence()` method to get a
|
||||
*confidence value*. This is an integer from 0 to 100. The higher the value, the
|
||||
more confidence there is in the match. For example:
|
||||
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
int confidence;
|
||||
confidence = match.getConfidence();
|
||||
@ -58,12 +67,12 @@ if (confidence < 50 ) {
|
||||
} else {
|
||||
// handle a good match...
|
||||
}
|
||||
```
|
||||
|
||||
In C, you can use the
|
||||
`ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)`
|
||||
method to get a confidence value
|
||||
In C, you can use the `ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status)`
|
||||
method to get a confidence value.
|
||||
|
||||
```C
|
||||
```c
|
||||
const UCharsetMatch *ucm;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t confidence = ucsdet_getConfidence(ucm, &status);
|
||||
@ -75,9 +84,9 @@ if (confidence <50) {
|
||||
```
|
||||
|
||||
To get the name of the character set, which can be used as an encoding name in
|
||||
Java, you use the getName() method:
|
||||
Java, you use the `getName()` method:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
byte characterData[] = ...;
|
||||
String charsetName;
|
||||
@ -86,21 +95,21 @@ charsetName = match.getName();
|
||||
unicodeData = new String(characterData, charsetName);
|
||||
```
|
||||
|
||||
To get the name of the character set in C :
|
||||
To get the name of the character set in C:
|
||||
|
||||
```C
|
||||
```c
|
||||
const UCharsetMatch *ucm;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *name = ucsdet_getName(ucm, &status);
|
||||
```
|
||||
|
||||
To get the three letter ISO code for the detected language, you use the
|
||||
getLanguage() method. If the language could not be determined, getLanguage()
|
||||
will return null. Note that language detection does not work with all charsets,
|
||||
`getLanguage()` method. If the language could not be determined, `getLanguage()`
|
||||
will return `null`. Note that language detection does not work with all charsets,
|
||||
and includes only a very small set of possible languages. It should not be used if
|
||||
robust, reliable language detection is required.
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
String languageCode;
|
||||
languageCode = match.getLanguage();
|
||||
@ -113,33 +122,33 @@ The `ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status)` method
|
||||
can be used in C to get the language code. If the language could not be
|
||||
determined, the method will return an empty string.
|
||||
|
||||
```C
|
||||
```c
|
||||
const UCharsetMatch *ucm;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *language = ucsdet_getLanguage(ucm, &status);
|
||||
```
|
||||
|
||||
If you want to get a Java String containing the converted data you can use the
|
||||
getString() method:
|
||||
`getString()` method:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
String unicodeData;
|
||||
unicodeData = match.getString();
|
||||
```
|
||||
|
||||
If you want to limit the number of characters in the string, pass the maximum
|
||||
number of characters you want to the getString() method:
|
||||
number of characters you want to the `getString()` method:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
String unicodeData;
|
||||
unicodeData = match.getString(1024);
|
||||
```
|
||||
|
||||
To get a java.io.Reader to read the converted data, use the getReader() method:
|
||||
To get a `java.io.Reader` to read the converted data, use the `getReader()` method:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetMatch match = ...;
|
||||
Reader reader;
|
||||
StringBuffer sb = new StringBuffer();
|
||||
@ -154,16 +163,16 @@ reader.close();
|
||||
|
||||
## CharsetDetector
|
||||
|
||||
The CharsetDetector class does the actual detection. It matches the input data
|
||||
against all character sets, and computes a list of CharsetMatch objects to hold
|
||||
The `CharsetDetector` class does the actual detection. It matches the input data
|
||||
against all character sets, and computes a list of `CharsetMatch` objects to hold
|
||||
the results. The input data can be supplied as an array of bytes, or as a
|
||||
java.io.InputStream.
|
||||
`java.io.InputStream`.
|
||||
|
||||
To use a CharsetDetector object, first you construct it, and then you set the
|
||||
input data, using the setText() method. Because setting the input data is
|
||||
separate from the construction, it is easy to reuse a CharsetDetector object:
|
||||
To use a `CharsetDetector` object, first you construct it, and then you set the
|
||||
input data, using the `setText()` method. Because setting the input data is
|
||||
separate from the construction, it is easy to reuse a `CharsetDetector` object:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
byte[] byteData = ...;
|
||||
InputStream streamData = ...;
|
||||
@ -175,10 +184,10 @@ detector.setText(streamData);
|
||||
```
|
||||
|
||||
If you want to know which character set matches your input data with the highest
|
||||
confidence, you can use the detect() method, which will return a CharsetMatch
|
||||
confidence, you can use the `detect()` method, which will return a `CharsetMatch`
|
||||
object for the match with the highest confidence:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
CharsetMatch match;
|
||||
byte[] byteData = ...;
|
||||
@ -190,7 +199,7 @@ match = detector.detect();
|
||||
If you want to know which character set matches your input data in C, you can
|
||||
use the `ucsdet_detect(UCharsetDetector *csd , UErrorCode *status)` method.
|
||||
|
||||
```C
|
||||
```c
|
||||
UCharsetDetector *csd;
|
||||
const UCharsetMatch *ucm;
|
||||
static char buffer[BUFFER_SIZE] = {....};
|
||||
@ -201,11 +210,11 @@ ucm = ucsdet_detect(csd, &status);
|
||||
```
|
||||
|
||||
If you want to know all of the character sets that could match your input data
|
||||
with a non-zero confidence, you can use the detectAll() method, which will
|
||||
return an array of CharsetMatch objects sorted by confidence, from highest to
|
||||
with a non-zero confidence, you can use the `detectAll()` method, which will
|
||||
return an array of `CharsetMatch` objects sorted by confidence, from highest to
|
||||
lowest:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
CharsetMatch matches[];
|
||||
byte[] byteData = ...;
|
||||
@ -217,19 +226,17 @@ for (int m = 0; m < matches.length; m += 1) {
|
||||
}
|
||||
```
|
||||
|
||||
> :point_right: **Note**: The
|
||||
`ucsdet_detectALL(UCharsetDetector *csd , int32_t *matchesFound, UErrorCode *status)`
|
||||
method can be used in C in order to detect all of the
|
||||
character sets where matchesFound is a pointer to a variable that will be set to
|
||||
the number of charsets identified that are consistent with the input data.
|
||||
> :point_right: **Note**: The `ucsdet_detectAll(UCharsetDetector *csd , int32_t *matchesFound, UErrorCode *status)`
|
||||
> method can be used in C in order to detect all of the character sets where `matchesFound` is a pointer
|
||||
> to a variable that will be set to the number of charsets identified that are consistent with the input data.
|
||||
|
||||
The CharsetDetector class also implements a crude *input filter* that can strip
|
||||
The `CharsetDetector` class also implements a crude *input filter* that can strip
|
||||
out html and xml style tags. If you want to enable the input filter, which is
|
||||
disabled when you construct a CharsetDetector, you use the enableInputFilter()
|
||||
disabled when you construct a `CharsetDetector`, you use the `enableInputFilter()`
|
||||
method, which takes a boolean. Pass in true if you want to enable the input
|
||||
filter, and false if you want to disable it:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
CharsetMatch match;
|
||||
byte[] byteDataWithTags = ...;
|
||||
@ -242,7 +249,7 @@ match = detector.detect();
|
||||
To enable an input filter in C, you can use the
|
||||
`ucsdet_enableInputFilter(UCharsetDetector *csd, UBool filter)` function.
|
||||
|
||||
```C
|
||||
```c
|
||||
UCharsetDetector *csd;
|
||||
const UCharsetMatch *ucm;
|
||||
static char buffer[BUFFER_SIZE] = {....};
|
||||
@ -258,10 +265,10 @@ better to filter the data yourself before you pass it to CharsetDetector. For
|
||||
example, you might know that the data is from an html page that contains CSS
|
||||
styles, which will not be stripped by the input filter.
|
||||
|
||||
You can use the inputFilterEnabled() method to see if the input filter is
|
||||
You can use the `inputFilterEnabled()` method to see if the input filter is
|
||||
enabled:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
detector = new CharsetDetector();
|
||||
// do a bunch of stuff with detector
|
||||
@ -273,13 +280,13 @@ if (detector.inputFilterEnabled()) {
|
||||
}
|
||||
```
|
||||
|
||||
> :point_right: **Note**: The ICU4C API provide uscdet_isInputFilterEnabled(const UCharsetDetector\*
|
||||
csd) function to check whether the input filter is enabled.
|
||||
> :point_right: **Note**: The ICU4C API provides the `ucsdet_isInputFilterEnabled(const UCharsetDetector* csd)` function
|
||||
> to check whether the input filter is enabled.
|
||||
|
||||
The CharsetDetector class also has two convenience methods that let you detect
|
||||
and convert the input data in one step: the getReader() and getString() methods:
|
||||
The `CharsetDetector` class also has two convenience methods that let you detect
|
||||
and convert the input data in one step: the `getReader()` and `getString()` methods:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
byte[] byteData = ...;
|
||||
InputStream streamData = ...;
|
||||
@ -290,13 +297,13 @@ unicodeData = detector.getString(byteData, null);
|
||||
unicodeReader = detector.getReader(streamData, null);
|
||||
```
|
||||
|
||||
> :point_right: **Note**: The second argument to the getReader() and getString() methods is a
|
||||
String called declaredEncoding, which is not currently used. There is also a
|
||||
setDeclaredEncoding() method, which is also not currently used.
|
||||
> :point_right: **Note**: The second argument to the `getReader()` and `getString()` methods
|
||||
> is a String called `declaredEncoding`, which is not currently used. There is also a
|
||||
> `setDeclaredEncoding()` method, which is also not currently used.
|
||||
|
||||
The following code is equivalent to using the convenience methods:
|
||||
|
||||
```Java
|
||||
```java
|
||||
CharsetDetector detector;
|
||||
CharsetMatch match;
|
||||
byte[] byteData = ...;
|
||||
@ -315,7 +322,7 @@ unicodeReader = match.getReader();CharsetDetector
|
||||
## Detected Encodings
|
||||
|
||||
The following table shows all the encodings that can be detected. You can get
|
||||
this list (without the languages) by calling the getAllDetectableCharsets()
|
||||
this list (without the languages) by calling the `getAllDetectableCharsets()`
|
||||
method:
|
||||
|
||||
| **Character Set** | **Languages** |
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Conversion
|
||||
permalink: /conversion
|
||||
nav_order: 4
|
||||
has_children: true
|
||||
---
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Conversion
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Conversion Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Calendar Examples
|
||||
nav_order: 2
|
||||
parent: Date/Time
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Calendar Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Calendar for Default Time Zone
|
||||
|
||||
@ -11,7 +26,8 @@ These C++, C , and Java examples get a Calendar based on the default time zone
|
||||
and add days to a date.
|
||||
|
||||
**C++**
|
||||
```C++
|
||||
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
GregorianCalendar* gc = new GregorianCalendar(status);
|
||||
if (U_FAILURE(status)) {
|
||||
@ -48,7 +64,8 @@ delete gc;
|
||||
```
|
||||
|
||||
**C**
|
||||
```C
|
||||
|
||||
```c
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t i;
|
||||
UCalendar* cal = ucal_open(NULL, -1, NULL, UCAL_GREGORIAN, &status);
|
||||
@ -88,7 +105,8 @@ ucal_close(cal);
|
||||
```
|
||||
|
||||
**Java**
|
||||
```Java
|
||||
|
||||
```java
|
||||
Calendar cal = new GregorianCalendar();
|
||||
if (cal == null) {
|
||||
System.out.println("Couldn't create GregorianCalendar");
|
||||
@ -113,12 +131,14 @@ for (int i = 0; i < 30; i++) {
|
||||
}
|
||||
```
|
||||
|
||||
## Converting dates between calendars
|
||||
|
||||
These C++, C, and Java examples demonstrate converting dates from one calendar
|
||||
(Gregorian) to another calendar (Japanese).
|
||||
|
||||
**C++**
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UDate time;
|
||||
Calendar *cal1, *cal2;
|
||||
@ -172,7 +192,8 @@ delete cal2;
|
||||
```
|
||||
|
||||
**C**
|
||||
```C
|
||||
|
||||
```c
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UDate time;
|
||||
UCalendar *cal1, *cal2;
|
||||
@ -230,7 +251,7 @@ ucal_close(cal2);
|
||||
|
||||
**Java**
|
||||
|
||||
```Java
|
||||
```java
|
||||
Calendar cal1, cal2;
|
||||
// Create a new Gregorian Calendar.
|
||||
cal1 = new GregorianCalendar();
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Calendar Services
|
||||
nav_order: 1
|
||||
parent: Date/Time
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Calendar Classes
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -148,7 +163,7 @@ field by field.
|
||||
|
||||
This is demonstrated in the following example:
|
||||
|
||||
```C++
|
||||
```java
|
||||
Calendar cal = Calendar.getInstance();
|
||||
cal.set(2000, Calendar.MARCH, 15);
|
||||
Date date = new Date(2000-1900, Calendar.JULY, 4);
|
||||
@ -241,7 +256,7 @@ directly access the GregorianCalendar-specific methods not present in Calendar.
|
||||
The correct way to handle this is to perform a dynamic cast, after testing the
|
||||
type of the object using getDynamicClassID(). For example:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
void setCutover(Calendar *cal, UDate myCutover) {
|
||||
if (cal->getDynamicClassID() == GregorianCalendar::getStaticClassID()) {
|
||||
GregorianCalendar *gc = (GregorianCalendar*)cal;
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Date/Time
|
||||
nav_order: 6
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Date/Time Services
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview of ICU System Time Zones
|
||||
|
||||
|
@ -1,15 +1,30 @@
|
||||
---
|
||||
layout: default
|
||||
title: Date and Time Zone Examples
|
||||
nav_order: 4
|
||||
parent: Date/Time
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Date and Time Zone Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## C++ TimeZone example code
|
||||
|
||||
This example code illustrates some time zone operations.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UErrorCode success = U_ZERO_ERROR;
|
||||
UnicodeString dateReturned, curTZNameEn, curTZNameFr;
|
||||
UDate curDate;
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: TimeZone Classes
|
||||
nav_order: 3
|
||||
parent: Date/Time
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU TimeZone Classes
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -164,7 +179,7 @@ zone resources in that file.
|
||||
3. Locate the .dat file to be updated, and do the update. The commands below
|
||||
are for a .dat file named icudt55l.dat.
|
||||
|
||||
```Shell
|
||||
```shell
|
||||
icupkg -a zoneinfo64.res icudt55l.dat
|
||||
icupkg -a windowsZones.res icudt55l.dat
|
||||
icupkg -a timezoneTypes.res icudt55l.dat
|
||||
@ -220,9 +235,9 @@ the ICU data path.
|
||||
5. Copy the freshly built ICU data shared library to the desired destination.
|
||||
|
||||
> :point_right: **Note**: The standard ICU download package contains pre-built
|
||||
ICU data. To rebuild ICU data from .txt files, you will need to replace the
|
||||
contents of `icu4c/source/data` with the contents of ICU4C data.zip. See
|
||||
[ICU Data Build Tool](../../icu_data/buildtool.md) for more details.
|
||||
> ICU data. To rebuild ICU data from .txt files, you will need to replace the
|
||||
> contents of `icu4c/source/data` with the contents of ICU4C data.zip. See
|
||||
> [ICU Data Build Tool](../../icu_data/buildtool.md) for more details.
|
||||
|
||||
There are too many possible platform variations to be more specific about how to
|
||||
rebuild ICU4C in these instructions. See the ReadMe file included with the ICU
|
||||
@ -237,6 +252,6 @@ are [here](https://htmlpreview.github.io/?https://github.com/unicode-org/icu-dat
|
||||
|
||||
The updater will work with ICU version 3.4.2 and newer.
|
||||
|
||||
Sample Code
|
||||
## Sample Code
|
||||
|
||||
See the [Date and Time Zone Examples](examples.md) subpage.
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Universal Time Scale
|
||||
nav_order: 5
|
||||
parent: Date/Time
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Universal Time Scale
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU Design
|
||||
permalink: /design
|
||||
nav_order: 5
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,17 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU Architectural Design
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
# Overview
|
||||
|
||||
This chapter discusses the ICU design structure, the ICU versioning support, and
|
||||
the introduction of namespace in C++.
|
||||
@ -471,7 +481,7 @@ The data stored in resource bundles is tagged with version numbers. A resource
|
||||
bundle can contain a tagged string named "Version" that declares the version
|
||||
number in dotted-integer format. For example,
|
||||
|
||||
```Text
|
||||
```text
|
||||
en {
|
||||
Version { "1.0.3.5" }
|
||||
...
|
||||
@ -487,7 +497,7 @@ version number 0.
|
||||
|
||||
Elements within a resource bundle may also contain version numbers. For example:
|
||||
|
||||
```Text
|
||||
```text
|
||||
be {
|
||||
CollationElements {
|
||||
Version { "1.0.0.0" }
|
||||
@ -718,7 +728,7 @@ For example, here is how an API might be tagged in various versions:
|
||||
|
||||
* **In ICU 0.2**: The API is newly introduced as a draft in this release.
|
||||
|
||||
```Text
|
||||
```text
|
||||
@draft ICU 0.2
|
||||
f(x)
|
||||
```
|
||||
@ -726,7 +736,7 @@ For example, here is how an API might be tagged in various versions:
|
||||
* **In ICU 0.4**: The draft version number is updated, because the signature
|
||||
changed.
|
||||
|
||||
```Text
|
||||
```text
|
||||
@draft ICU 0.4
|
||||
f(x, y)
|
||||
```
|
||||
@ -734,7 +744,7 @@ For example, here is how an API might be tagged in various versions:
|
||||
* **In ICU 0.6**: The API is promoted from draft to stable, but the version
|
||||
number does not change, as the signature is the same.
|
||||
|
||||
```Text
|
||||
```text
|
||||
@stable ICU 0.4
|
||||
f(x, y)
|
||||
```
|
||||
@ -745,21 +755,21 @@ For example, here is how an API might be tagged in various versions:
|
||||
calling code continues to work unchanged (so we retain @stable if that's what
|
||||
it was.)
|
||||
|
||||
```Text
|
||||
```text
|
||||
@stable ICU 1.0
|
||||
f(xbase, y)
|
||||
```
|
||||
|
||||
* **In ICU 1.2**: The API is demoted to deprecated (or obsolete) status.
|
||||
|
||||
```Text
|
||||
```text
|
||||
@deprecated ICU 1.2 Use g(x,y,z) instead.
|
||||
f(xbase, y)
|
||||
```
|
||||
|
||||
or, when this API is planned to be removed in ICU 1.4:
|
||||
|
||||
```Text
|
||||
```text
|
||||
@obsolete ICU 1.4. Use g(x,y,z) instead.
|
||||
f(xbase, y)
|
||||
```
|
||||
@ -816,7 +826,7 @@ Function renaming is enabled by default, and must be disabled at ICU build time
|
||||
to enable release to release binary compatibility. To disable renaming, use the
|
||||
configure option
|
||||
|
||||
```Shell
|
||||
```shell
|
||||
configure --disable-renaming [other configure options]
|
||||
```
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Coding Guidelines
|
||||
nav_order: 1
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Coding Guidelines
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -24,17 +39,17 @@ for the following reasons:
|
||||
* It is useful in the same form for C also
|
||||
* Some C++ compilers do not support exceptions
|
||||
|
||||
> :point_right: **Note**: This error code mechanism, in fact, works similar to
|
||||
exceptions. If users call several ICU functions in a sequence, as soon as one
|
||||
sets a failure code, the functions in the following example will not work. This
|
||||
procedure prevents the API function from processing data that is not valid in
|
||||
the sequence of function calls and relieves the caller from checking the error
|
||||
code after each call. It is somewhat similar to how an exception terminates a
|
||||
function block or try block early.*
|
||||
> :point_right: **Note**: *This error code mechanism, in fact, works similarly to
|
||||
> exceptions. If users call several ICU functions in a sequence, as soon as one
|
||||
> sets a failure code, the functions in the following example will not work. This
|
||||
> procedure prevents the API function from processing data that is not valid in
|
||||
> the sequence of function calls and relieves the caller from checking the error
|
||||
> code after each call. It is somewhat similar to how an exception terminates a
|
||||
> function block or try block early.*
|
||||
|
||||
The following code shows the inside of an ICU function implementation:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CAPI const UBiDiLevel * U_EXPORT2
|
||||
ubidi_getLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) {
|
||||
int32_t start, length;
|
||||
@ -66,27 +81,27 @@ method on that object, not even one with a `UErrorCode` parameter.
|
||||
|
||||
### Sample Function with Error Checking
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uplrules_select(const UPluralRules *uplrules, // Do not check
|
||||
// "this"/uplrules vs. NULL.
|
||||
double number,
|
||||
UChar *keyword, int32_t capacity,
|
||||
UErrorCode *status) // Do not check status!=NULL.
|
||||
double number,
|
||||
UChar *keyword, int32_t capacity,
|
||||
UErrorCode *status) // Do not check status!=NULL.
|
||||
{
|
||||
if (U_FAILURE(*status)) { // Do check for U_FAILURE()
|
||||
if (U_FAILURE(*status)) { // Do check for U_FAILURE()
|
||||
// before setting *status
|
||||
return 0; // or calling UErrorCode-less
|
||||
return 0; // or calling UErrorCode-less
|
||||
// select(number).
|
||||
}
|
||||
if (keyword == NULL ? capacity != 0 : capacity < 0) {
|
||||
}
|
||||
if (keyword == NULL ? capacity != 0 : capacity < 0) {
|
||||
// Standard destination buffer
|
||||
// checks.
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UnicodeString result = ((PluralRules*)uplrules)->select(number);
|
||||
return result.extract(keyword, capacity, *status);
|
||||
*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
UnicodeString result = ((PluralRules*)uplrules)->select(number);
|
||||
return result.extract(keyword, capacity, *status);
|
||||
}
|
||||
```
|
||||
|
||||
@ -219,19 +234,19 @@ which is new in Doxygen 1.7.5) can cite a fragment of existing sample or test co
|
||||
|
||||
Example in `ucnv.h`:
|
||||
|
||||
```C++
|
||||
/**
|
||||
* \snippet samples/ucnv/convsamp.cpp ucnv_open
|
||||
*/
|
||||
ucnv_open( ... ) ...
|
||||
```c++
|
||||
/**
|
||||
* \snippet samples/ucnv/convsamp.cpp ucnv_open
|
||||
*/
|
||||
ucnv_open( ... ) ...
|
||||
```
|
||||
|
||||
This cites code in icu4c/source/samples/ucnv/convsamp.cpp as follows:
|
||||
|
||||
```C++
|
||||
//! [ucnv_open]
|
||||
conv = ucnv_open("koi8-r", &status);
|
||||
//! [ucnv_open]
|
||||
```c++
|
||||
//! [ucnv_open]
|
||||
conv = ucnv_open("koi8-r", &status);
|
||||
//! [ucnv_open]
|
||||
```
|
||||
|
||||
Notice the tag "`ucnv_open`" which must be the same in all three places (in
|
||||
@ -253,21 +268,21 @@ Note: The `@system` tag is *in addition to* the
|
||||
|
||||
Copy/paste the appropriate #ifndef..#endif pair from the following:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
#endif // U_HIDE_DEPRECATED_API
|
||||
#endif // U_HIDE_DEPRECATED_API
|
||||
|
||||
#ifndef U_HIDE_OBSOLETE_API
|
||||
#endif // U_HIDE_OBSOLETE_API
|
||||
#endif // U_HIDE_OBSOLETE_API
|
||||
|
||||
#ifndef U_HIDE_SYSTEM_API
|
||||
#endif // U_HIDE_SYSTEM_API
|
||||
#endif // U_HIDE_SYSTEM_API
|
||||
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
#endif // U_HIDE_INTERNAL_API
|
||||
#endif // U_HIDE_INTERNAL_API
|
||||
```
|
||||
|
||||
We `#ifndef` `@draft`/`@deprecated`/... APIs as much as possible, including C
|
||||
@ -293,12 +308,12 @@ We do not #ifndef APIs where that would be problematic:
|
||||
|
||||
More handy copy-paste text:
|
||||
|
||||
```C++
|
||||
// Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
|
||||
// or else the compiler will create a public default constructor.
|
||||
```c++
|
||||
// Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
|
||||
// or else the compiler will create a public default constructor.
|
||||
|
||||
// Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
|
||||
// or else the compiler will create public ones.
|
||||
// Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
|
||||
// or else the compiler will create public ones.
|
||||
```
|
||||
|
||||
### C and C++ Type and Format Convention Guidelines
|
||||
@ -490,19 +505,21 @@ satisfy all the compilers' requirements.
|
||||
For example, use the following
|
||||
convention:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_formatMessage(...);
|
||||
```
|
||||
|
||||
> :point_right: **Note**: The `U_CAPI`/`U_DRAFT`/... and `U_EXPORT2` qualifiers
|
||||
are required for both the declaration and the definiton of *exported C and
|
||||
static C++ functions*. Use `U_CAPI` (or `U_DRAFT` etc.) before and `U_EXPORT2`
|
||||
after the return type of *exported C and static C++ functions*. Internal
|
||||
functions that are visible outside a compilation unit need a `U_CFUNC` before
|
||||
the return type. *Non-static C++ class member functions* do *not* get
|
||||
`U_CAPI`/`U_EXPORT2` because they are exported and declared together with their
|
||||
class exports.
|
||||
> are required for both the declaration and the definition of *exported C and
|
||||
> static C++ functions*. Use `U_CAPI` (or `U_DRAFT` etc.) before and `U_EXPORT2`
|
||||
> after the return type of *exported C and static C++ functions*.
|
||||
>
|
||||
> Internal functions that are visible outside a compilation unit need a `U_CFUNC`
|
||||
> before the return type.
|
||||
>
|
||||
> *Non-static C++ class member functions* do *not* get `U_CAPI`/`U_EXPORT2`
|
||||
> because they are exported and declared together with their class exports.
|
||||
|
||||
#### Use Anonymous Namespaces or Static For File Scope
|
||||
|
||||
@ -517,7 +534,7 @@ z/OS and Windows COM wrappers around ICU need `__cdecl` for callback functions.
|
||||
The reason is that C++ can have a different function calling convention from C.
|
||||
These callback functions also usually need to be private. So the following code
|
||||
|
||||
```C++
|
||||
```c++
|
||||
UBool
|
||||
isAcceptable(void * /* context */,
|
||||
const char * /* type */, const char * /* name */,
|
||||
@ -530,7 +547,7 @@ isAcceptable(void * /* context */,
|
||||
should be changed to look like the following by adding `U_CDECL_BEGIN`, `static`,
|
||||
`U_CALLCONV` and `U_CDECL_END`.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
isAcceptable(void * /* context */,
|
||||
@ -663,7 +680,7 @@ string as an array. This reduces the time to load the library and all its
|
||||
pointers. This should be done so that the same library data can be shared across
|
||||
processes automatically. Here is an example:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
#define MY_MACRO_DEFINED_STR "macro string"
|
||||
const char *myCString = "myCString";
|
||||
int16_t myNumbers[] = {1, 2, 3};
|
||||
@ -671,7 +688,7 @@ int16_t myNumbers[] = {1, 2, 3};
|
||||
|
||||
This should be changed to the following:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
static const char MY_MACRO_DEFINED_STR[] = "macro string";
|
||||
static const char myCString[] = "myCString";
|
||||
static const int16_t myNumbers[] = {1, 2, 3};
|
||||
@ -682,14 +699,14 @@ static const int16_t myNumbers[] = {1, 2, 3};
|
||||
The most common reason to have static initialization is to declare a
|
||||
`static const UnicodeString`, for example (see `utypes.h` about invariant characters):
|
||||
|
||||
```C++
|
||||
```c++
|
||||
static const UnicodeString myStr("myStr", "");
|
||||
```
|
||||
|
||||
The most portable and most efficient way to declare ASCII text as a Unicode
|
||||
string is to do the following instead:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
static const UChar myStr[] = { 0x6D, 0x79, 0x53, 0x74, 0x72, 0}; /* "myStr" */
|
||||
```
|
||||
|
||||
@ -821,7 +838,7 @@ function, i.e. the first non-pure virtual function that is not inline at the
|
||||
point of class definition. If there is no key function, it is emitted everywhere
|
||||
used."
|
||||
|
||||
(This was first done in ICU 49; see [ticket #8454](http://bugs.icu-project.org/trac/ticket/8454.)
|
||||
(This was first done in ICU 49; see [ticket #8454](https://unicode-org.atlassian.net/browse/ICU-8454).)
|
||||
|
||||
#### Namespaces
|
||||
|
||||
@ -836,7 +853,7 @@ Starting with ICU 49, we require C++ namespace support.
|
||||
Class declarations, even forward declarations, must be scoped to the ICU
|
||||
namespace. For example:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Locale;
|
||||
@ -873,17 +890,17 @@ Pretty much everyone agrees that inline implementations are ok if they fit on
|
||||
the same line as the function signature, even if that means bending the
|
||||
single-statement-per-line rule slightly:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
T *orphan() { T *p=ptr; ptr=NULL; return p; }
|
||||
```
|
||||
|
||||
Most people also agree that very short multi-line implementations are ok inline
|
||||
in the class declaration. Something like the following is probably the maximum:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
Value *getValue(int index) {
|
||||
if(index>=0 && index<fLimit) {
|
||||
return fArray[index];
|
||||
if(index>=0 && index<fLimit) {
|
||||
return fArray[index];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
@ -994,8 +1011,8 @@ adopt-on-success):
|
||||
it is 0 because the constructor was not called. (Typically, a `UErrorCode`
|
||||
must be set to `U_MEMORY_ALLOCATION_ERROR`.)
|
||||
|
||||
**Pitfall**: If you allocate/construct via "`ClassName *p = new
|
||||
ClassName(adoptee);`" and the memory allocation failed (p==NULL), then the
|
||||
**Pitfall**: If you allocate/construct via "`ClassName *p = new ClassName(adoptee);`"
|
||||
and the memory allocation failed (p==NULL), then the
|
||||
constructor has not been called, the adoptee has not been adopted, and you
|
||||
are still responsible for deleting it!
|
||||
|
||||
@ -1011,7 +1028,7 @@ adopt-on-success):
|
||||
Example: (This is a best-practice example. It does not reflect current `Calendar`
|
||||
code.)
|
||||
|
||||
```C++
|
||||
```c++
|
||||
Calendar*
|
||||
Calendar::createInstance(TimeZone* zone, UErrorCode& errorCode) {
|
||||
LocalPointer<TimeZone> adoptedZone(zone);
|
||||
@ -1071,7 +1088,7 @@ factory method must be deleted by the user/owner.
|
||||
#### Memory Allocation Failures
|
||||
|
||||
All memory allocations and object creations should be checked for success. In
|
||||
the event of a failure (a NULL returned), a `U_MEMORY_ALLOCATION_ERROR` status
|
||||
the event of a failure (a `NULL` returned), a `U_MEMORY_ALLOCATION_ERROR` status
|
||||
should be returned by the ICU function in question. If the allocation failure
|
||||
leaves the ICU service in an invalid state, such that subsequent ICU operations
|
||||
could also fail, the situation should be flagged so that the subsequent
|
||||
@ -1126,16 +1143,16 @@ pointers to owned memory must always be either NULL or point to owned objects.
|
||||
|
||||
Internally:
|
||||
|
||||
[cmemory.h](http://bugs.icu-project.org/trac/browser/icu/trunk/source/common/cmemory.h)
|
||||
[cmemory.h](https://github.com/unicode-org/icu/blob/master/icu4c/source/common/cmemory.h)
|
||||
defines the `LocalMemory` class for chunks of memory of primitive types which
|
||||
will be `uprv_free()`'ed.
|
||||
|
||||
[cmemory.h](http://bugs.icu-project.org/trac/browser/icu/trunk/source/common/cmemory.h)
|
||||
[cmemory.h](https://github.com/unicode-org/icu/blob/master/icu4c/source/common/cmemory.h)
|
||||
also defines `MaybeStackArray` and `MaybeStackHeaderAndArray` which automate
|
||||
management of arrays.
|
||||
|
||||
Use `CharString`
|
||||
([charstr.h](http://bugs.icu-project.org/trac/browser/icu/trunk/source/common/charstr.h))
|
||||
([charstr.h](https://github.com/unicode-org/icu/blob/master/icu4c/source/common/charstr.h))
|
||||
for `char *` strings that you build and modify.
|
||||
|
||||
#### Global Inline Functions
|
||||
@ -1182,7 +1199,7 @@ This section describes the C-specific guidelines or conventions to use.
|
||||
All C APIs need to be **both declared and defined** using the `U_CAPI` and
|
||||
`U_EXPORT2` qualifiers.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_formatMessage(...);
|
||||
```
|
||||
@ -1215,7 +1232,7 @@ sometimes 4-) letter module identifier. Very general names like
|
||||
Functions that roughly compare to constructors and destructors are called
|
||||
`umod_open()` and `umod_close()`. See the following example:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
U_CAPI UBiDi * U_EXPORT2
|
||||
ubidi_open();
|
||||
|
||||
@ -1242,7 +1259,7 @@ For example, in `ubidi.h` we define the `UBiDi` "service object" type and also
|
||||
have the following "smart pointer" definition which will call `ubidi_close()` on
|
||||
destruction:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
// Use config switches like this only after including unicode/utypes.h
|
||||
// or another ICU header.
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
@ -1304,7 +1321,7 @@ there must be a corresponding function (like a `ucnv_close()`) that deallocates
|
||||
that memory.
|
||||
|
||||
All memory allocations in ICU should be checked for success. In the event of a
|
||||
failure (a NULL returned from `uprv_malloc()`), a `U_MEMORY_ALLOCATION_ERROR` status
|
||||
failure (a `NULL` returned from `uprv_malloc()`), a `U_MEMORY_ALLOCATION_ERROR` status
|
||||
should be returned by the ICU function in question. If the allocation failure
|
||||
leaves the ICU service in an invalid state, such that subsequent ICU operations
|
||||
could also fail, the situation should be flagged so that the subsequent
|
||||
@ -1614,7 +1631,7 @@ regarding C test services, please see the `icu4c/source/tools/ctestfw` directory
|
||||
|
||||
The following shows the possible format of test functions:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
void some_test()
|
||||
{
|
||||
}
|
||||
@ -1622,7 +1639,7 @@ void some_test()
|
||||
|
||||
Output from the test is accomplished with three printf-like functions:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
void log_err ( const char *fmt, ... );
|
||||
void log_info ( const char *fmt, ... );
|
||||
void log_verbose ( const char *fmt, ... );
|
||||
@ -1638,7 +1655,7 @@ void log_verbose ( const char *fmt, ... );
|
||||
To use the tests, link them into a hierarchical structure. The root of the
|
||||
structure will be allocated by default.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
TestNode *root = NULL; /* empty */
|
||||
addTest( &root, &some_test, "/test");
|
||||
```
|
||||
@ -1655,14 +1672,14 @@ The calls to `addTest` must be placed in a function or a hierarchy of functions
|
||||
A subtree may be extracted from another tree of tests for the programmatic
|
||||
running of subtests.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
TestNode* sub;
|
||||
sub = getTest(root, "/mytests");
|
||||
```
|
||||
|
||||
And a tree of tests may be run simply by:
|
||||
|
||||
```C++
|
||||
```c++
|
||||
runTests( root ); /* or 'sub' */
|
||||
```
|
||||
|
||||
@ -1694,7 +1711,7 @@ To run the test suite from the command line, change the directories to
|
||||
|
||||
Type `cintltst -h` to view its command line parameters.
|
||||
|
||||
```Text
|
||||
```text
|
||||
### Syntax:
|
||||
### Usage: [ -l ] [ -v ] [ -verbose] [-a] [ -all] [-n]
|
||||
[-no_err_msg] [ -h] [ /path/to/test ]
|
||||
@ -1802,7 +1819,7 @@ release build, the executable will reside in the
|
||||
|
||||
Type just `intltest -h` to see the usage:
|
||||
|
||||
```Text
|
||||
```text
|
||||
### Syntax:
|
||||
### IntlTest [-option1 -option2 ...] [testname1 testname2 ...]
|
||||
### where options are: verbose (v), all (a), noerrormsg (n),
|
||||
@ -1836,7 +1853,7 @@ directly to the underlying operating system.
|
||||
be accomplished with the following line in a file
|
||||
**icu/source/icudefs.local** :
|
||||
|
||||
```Shell
|
||||
```shell
|
||||
CPPFLAGS+=-DU_DEBUG_FAKETIME
|
||||
```
|
||||
|
||||
@ -1853,7 +1870,7 @@ directly to the underlying operating system.
|
||||
`/tsformat/ccaltst/TestCalendar` in verbose mode which will print out the
|
||||
current time:
|
||||
|
||||
```Shell
|
||||
```shell
|
||||
$ make check ICUINFO_OPTS=-m U_FAKETIME_START=28800000 CINTLTST_OPTS=-v
|
||||
/tsformat/ccaltst/TestCalendar
|
||||
U_DEBUG_FAKETIME was set at compile time, so the ICU clock will start at a
|
||||
@ -1938,7 +1955,7 @@ It is possible to use struct types, but one must make sure that each field is
|
||||
naturally aligned, without possible implicit field padding by the compiler —
|
||||
assuming a reasonable compiler.
|
||||
|
||||
```C++
|
||||
```c++
|
||||
// bad because i will be preceded by compiler-dependent padding
|
||||
// for proper alignment
|
||||
struct BadExample {
|
||||
@ -2007,7 +2024,7 @@ format).
|
||||
The following addition to autoexp.dat will cause **`UnicodeString`**s to be
|
||||
visible as strings in the debugger without expanding sub-items:
|
||||
|
||||
```Text
|
||||
```text
|
||||
;; Copyright (C) 2010 IBM Corporation and Others. All Rights Reserved.
|
||||
;; ICU Additions
|
||||
;; Add to {VISUAL STUDIO} \Common7\Packages\Debugger\autoexp.dat
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Contributions
|
||||
nav_order: 4
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Contributions to the ICU library
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Why Contribute?
|
||||
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: Misc
|
||||
nav_order: 15
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
@ -1,9 +1,28 @@
|
||||
---
|
||||
layout: default
|
||||
title: Custom ICU4C Synchronization
|
||||
nav_order: 3
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Custom ICU4C Synchronization
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
> :warning: ***Support for including an alternate implementation of atomic and mutex
|
||||
> operations has been withdrawn and removed from ICU4C.***
|
||||
> See issue [ICU-20185](https://unicode-org.atlassian.net/browse/ICU-20185).
|
||||
|
||||
### Build Time User Provided Synchronization
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Synchronization
|
||||
nav_order: 2
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Synchronization Issues
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: User Guide Editing
|
||||
nav_order: 5
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Editing the ICU User Guide
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: Date and Time Formatting Examples
|
||||
nav_order: 1
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Dates and Times
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Date and Time Formatting Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Format
|
||||
|
||||
|
@ -1,9 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: Formatting Dates and Times
|
||||
nav_order: 2
|
||||
parent: Formatting
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Formatting Dates and Times
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Formatting Dates and Times Overview
|
||||
|
||||
|
@ -1,10 +1,27 @@
|
||||
---
|
||||
layout: default
|
||||
title: FormattedValue
|
||||
nav_order: 4
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2019 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
FormattedValue
|
||||
==============
|
||||
# FormattedValue
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
FormattedValue is an abstraction for localized strings with attributes
|
||||
returned by a number of ICU formatters. APIs for FormattedValue are available
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Formatting
|
||||
nav_order: 7
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Formatting and Parsing
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: Message Formatting Examples
|
||||
nav_order: 1
|
||||
parent: Formatting Messages
|
||||
grand_parent: Formatting
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Message Formatting Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## MessageFormat Class
|
||||
|
||||
|
@ -1,11 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: Formatting Messages
|
||||
nav_order: 3
|
||||
parent: Formatting
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
{% raw %}
|
||||
|
||||
# Formatting Messages
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -62,25 +76,31 @@ arguments (hopefully at most one) inside.
|
||||
|
||||
For example:
|
||||
|
||||
"{gender_of_host, **select**, "
|
||||
"**female** {"
|
||||
"{num_guests, **plural**, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to **her** party.}"
|
||||
"=2 {{host} invites {guest} and one other person to her party.}"
|
||||
"other {{host} invites {guest} and # other people to her party.}}}"
|
||||
"**male** {"
|
||||
"{num_guests, **plural**, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to **his** party.}"
|
||||
"=2 {{host} invites {guest} and one other person to his party.}"
|
||||
"other {{host} invites {guest} and # other people to his party.}}}"
|
||||
"**other** {"
|
||||
"{num_guests, **plural**, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to **their** party.}"
|
||||
"=2 {{host} invites {guest} and one other person to their party.}"
|
||||
"other {{host} invites {guest} and # other people to their party.}}}}"
|
||||
{% raw %}
|
||||
|
||||
```text
|
||||
"{gender_of_host, select, "
|
||||
"female {"
|
||||
"{num_guests, plural, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to her party.}"
|
||||
"=2 {{host} invites {guest} and one other person to her party.}"
|
||||
"other {{host} invites {guest} and # other people to her party.}}}"
|
||||
"male {"
|
||||
"{num_guests, plural, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to his party.}"
|
||||
"=2 {{host} invites {guest} and one other person to his party.}"
|
||||
"other {{host} invites {guest} and # other people to his party.}}}"
|
||||
"other {"
|
||||
"{num_guests, plural, offset:1 "
|
||||
"=0 {{host} does not give a party.}"
|
||||
"=1 {{host} invites {guest} to their party.}"
|
||||
"=2 {{host} invites {guest} and one other person to their party.}"
|
||||
"other {{host} invites {guest} and # other people to their party.}}}}"
|
||||
```
|
||||
|
||||
{% endraw %}
|
||||
|
||||
**Note:** In a plural argument like in the example above, if the English message
|
||||
has both `=0` and `=1` (up to `=offset`+1) then it does not need a "`one`"
|
||||
@ -95,9 +115,8 @@ language](http://cldr.unicode.org/index/cldr-spec/plural-rules).*
|
||||
|
||||
If syntax characters occur in the text portions, then they need to be quoted by
|
||||
enclosing the syntax in pairs of ASCII apostrophes. A pair of ASCII apostrophes
|
||||
always represents one ASCII apostrophe, similar to %% in printf representing one
|
||||
%, although this rule still applies inside quoted text. ("This '{isn''t}'
|
||||
obvious" → "This {isn't} obvious")
|
||||
always represents one ASCII apostrophe, similar to %% in printf representing one %,
|
||||
although this rule still applies inside quoted text. ("`This '{isn''t}' obvious`" → "`This {isn't} obvious`")
|
||||
|
||||
* Before ICU 4.8, ASCII apostrophes always started quoted text and had
|
||||
inconsistent behavior in nested sub-messages, which was a source of problems
|
||||
@ -107,8 +126,8 @@ obvious" → "This {isn't} obvious")
|
||||
needed"), and works the same in nested messages as on the top level of the
|
||||
pattern. The new behavior is otherwise compatible; for details see the
|
||||
MessageFormat and MessagePattern (new in ICU 4.8) API docs.
|
||||
* Recommendation: Use the real apostrophe (single quote) character ’ (U+2019)
|
||||
for human-readable text, and use the ASCII apostrophe ' (U+0027) only in
|
||||
* Recommendation: Use the real apostrophe (single quote) character `’` (U+2019)
|
||||
for human-readable text, and use the ASCII apostrophe `'` (U+0027) only in
|
||||
program syntax, like quoting in MessageFormat. See the annotations for
|
||||
U+0027 Apostrophe in The Unicode Standard.
|
||||
|
||||
@ -133,14 +152,14 @@ distinguish them from patterns. These are locale-independent ways to specify the
|
||||
format, and this is the recommended mechanism if the predefined styles are not
|
||||
appropriate.
|
||||
|
||||
Date skeletons:
|
||||
##### Date skeletons:
|
||||
|
||||
- **ICU4J:**
|
||||
<https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/text/SimpleDateFormat.html>
|
||||
|
||||
- **ICU4C:** <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classSimpleDateFormat.html>
|
||||
|
||||
Number formatter skeletons:
|
||||
##### Number formatter skeletons:
|
||||
|
||||
- **ICU4J:**
|
||||
<https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/number/NumberFormatter.html>
|
||||
@ -171,7 +190,7 @@ that you don't benefit from that CLDR data and the results will likely be
|
||||
inconsistent with the rest of the patterns that ICU uses.
|
||||
|
||||
It is also a bad internationalization practice, because most companies only
|
||||
translate into “generic” versions of the languages (French, or Spanish, or
|
||||
translate into "generic" versions of the languages (French, or Spanish, or
|
||||
Arabic). So the translated patterns get used in tens of countries. On the other
|
||||
hand, skeletons are localized according to the MessageFormat locale, which
|
||||
should include regional variants (e.g., “fr-CA”).
|
||||
@ -181,11 +200,11 @@ should include regional variants (e.g., “fr-CA”).
|
||||
The MessageFormat class allows setting custom Format objects to format
|
||||
arguments, overriding the arguments' pattern specification. This is discouraged:
|
||||
For custom formatting of some values it should normally suffice to format them
|
||||
externally and to provide the formatted strings to the MessageFormat.format()
|
||||
externally and to provide the formatted strings to the `MessageFormat.format()`
|
||||
methods.
|
||||
|
||||
Only the top-level arguments are accessible and settable via `setFormat()`,
|
||||
getFormat() etc. Arguments inside nested sub-messages, inside
|
||||
`getFormat()` etc. Arguments inside nested sub-messages, inside
|
||||
choice/plural/select arguments, are "invisible" via these API methods.
|
||||
|
||||
Some of these methods (the ones corresponding to the original JDK MessageFormat
|
||||
@ -217,5 +236,3 @@ was a disturbance in the Force on planet 7."
|
||||
|
||||
There are several more usage examples for the MessageFormat and ChoiceFormat
|
||||
classes in [C, C++ and Java](examples.md).
|
||||
|
||||
{% endraw %}
|
||||
|
@ -1,3 +1,10 @@
|
||||
---
|
||||
layout: default
|
||||
title: Formatting Numbers
|
||||
nav_order: 1
|
||||
parent: Formatting
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
@ -1,11 +1,31 @@
|
||||
---
|
||||
layout: default
|
||||
title: Legacy NumberFormat
|
||||
nav_order: 1
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Legacy NumberFormat
|
||||
{: .no_toc }
|
||||
|
||||
Since ICU 60, the recommended way to format numbers is NumberFormatter; see [index.md](index.md). This page is here for reference for the older NumberFormat heirarchy in ICU4C and ICU4J.
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Note
|
||||
|
||||
> :warning: Since ICU 60, the recommended way to format numbers is NumberFormatter; see [index.md](index.md).
|
||||
>
|
||||
> This page is here for reference for the older NumberFormat hierarchy in ICU4C and ICU4J.
|
||||
|
||||
## NumberFormat
|
||||
|
||||
@ -39,7 +59,8 @@ locale "th" uses the western digits 0-9. To create a number format that uses the
|
||||
native Thai digits instead, first create a locale with "@numbers=thai" defined.
|
||||
See [the description on Locales](../../locale/index.md) for details.
|
||||
|
||||
> :point_right: **Note**: If you are formatting multiple numbers, save processing time by constructing the formatter once and then using it several times.
|
||||
> :point_right: **Note**: If you are formatting multiple numbers, save processing time
|
||||
> by constructing the formatter once and then using it several times.
|
||||
|
||||
#### Instantiating a NumberFormat
|
||||
|
||||
@ -113,7 +134,7 @@ and display name, but also the correct number of fraction digits and the correct
|
||||
the API references for more details.
|
||||
|
||||
There is ICU4C sample code at
|
||||
[icu4c/source/samples/numfmt/main.cpp]](https://github.com/unicode-org/icu/blob/master/icu4c/source/samples/numfmt/main.cpp)
|
||||
[icu4c/source/samples/numfmt/main.cpp](https://github.com/unicode-org/icu/blob/master/icu4c/source/samples/numfmt/main.cpp)
|
||||
which illustrates the use of NumberFormat.setCurrency().
|
||||
|
||||
#### Displaying Numbers
|
||||
|
@ -1,9 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: RuleBasedNumberFormat Examples
|
||||
nav_order: 6
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# RuleBasedNumberFormat Examples
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Annotated RuleBasedNumberFormat Example
|
||||
|
||||
|
@ -1,9 +1,27 @@
|
||||
---
|
||||
layout: default
|
||||
title: RuleBasedNumberFormat
|
||||
nav_order: 5
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# RuleBasedNumberFormat
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
# Overview
|
||||
|
||||
[RuleBasedNumberFormat](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classRuleBasedNumberFormat.html)
|
||||
can format and parse numbers in spelled-out format, e.g. "one hundred and
|
||||
|
@ -1,9 +1,25 @@
|
||||
---
|
||||
layout: default
|
||||
title: Rounding Modes
|
||||
nav_order: 2
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Rounding Modes
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
The following rounding modes are used with ICU's Decimal Formatter. Note that
|
||||
ICU's use of the terms "Down" and "Up" here is somewhat at odds with other
|
||||
|
@ -1,10 +1,27 @@
|
||||
---
|
||||
layout: default
|
||||
title: Number Skeletons
|
||||
nav_order: 3
|
||||
grand_parent: Formatting
|
||||
parent: Formatting Numbers
|
||||
---
|
||||
<!--
|
||||
© 2019 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
Number Skeletons
|
||||
================
|
||||
# Number Skeletons
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Number skeletons are a locale-agnostic way to configure a NumberFormatter in
|
||||
ICU. Number skeletons work in MessageFormat.
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Glossary
|
||||
permalink: /glossary
|
||||
nav_order: 9000
|
||||
---
|
||||
<!--
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: How To Use ICU
|
||||
permalink: /howtouseicu
|
||||
nav_order: 2
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,17 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# How To Use ICU
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
# Overview
|
||||
|
||||
ICU builds and installs as relatively standard libraries. For details about
|
||||
building, installing and porting see the [ICU4C
|
||||
@ -88,8 +98,8 @@ files, `.so` files, etc. See the next section, "C++ With Your Own Build System".
|
||||
|
||||
## Notes on `icu-config`
|
||||
|
||||
> :point_right: **Note**: **icu-config is deprecated, and no longer recommended for production
|
||||
use. Please use pkg-config files or other options.**
|
||||
> :point_right: **Note**: **icu-config is deprecated, and no longer recommended for
|
||||
> production use. Please use pkg-config files or other options.**
|
||||
|
||||
As of ICU 63.1, [icu-config has been deprecated
|
||||
(ICU-10464)](https://unicode-org.atlassian.net/browse/ICU-10464).
|
||||
@ -190,7 +200,7 @@ need to modify this file directly to allow `static` and `dll` modes to function.
|
||||
|
||||
For building and running trivial (one-compilation-unit) programs with an
|
||||
installed ICU4C, the shell script
|
||||
[icurun](http://bugs.icu-project.org/trac/browser/trunk/tools/scripts/icurun)
|
||||
[icurun](https://github.com/unicode-org/icu/blob/master/tools/scripts/icurun)
|
||||
may be used. For detailed help, see the top of that script.
|
||||
As an example, if ICU is installed to the prefix **/opt/local** and the current
|
||||
directory contains two sample programs "test1.cpp" and "test2.c", they may be
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: Internationalization
|
||||
permalink: /i18n
|
||||
nav_order: 1
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Software Internationalization
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview of Software Internationalization
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU4J Locale Service Provider
|
||||
permalink: /icu4j-locale-service-provider
|
||||
nav_order: 7
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU4J Locale Service Provider
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,10 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU Data Build Tool
|
||||
nav_order: 1
|
||||
parent: ICU Data
|
||||
---
|
||||
<!--
|
||||
© 2019 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
ICU Data Build Tool
|
||||
===================
|
||||
# ICU Data Build Tool
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
ICU 64 provides a tool for configuring your ICU locale data file with finer
|
||||
granularity. This page explains how to use this tool to customize and reduce
|
||||
|
@ -1,10 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Resource and Data Tracing
|
||||
nav_order: 2
|
||||
parent: ICU Data
|
||||
---
|
||||
<!--
|
||||
© 2019 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
Resource and Data Tracing
|
||||
=========================
|
||||
# Resource and Data Tracing
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
When building an [ICU data filter specification](buildtool.md), it is useful to
|
||||
see what resources are being used by your application so that you can select
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU Data
|
||||
nav_order: 13
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU Data
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -23,14 +38,12 @@ the rest of ICU. No specific action or setup is required of either the
|
||||
application program or the execution environment.
|
||||
|
||||
Update: as of ICU 64, the standard data library is over 20 MB in size. We have
|
||||
introduced a new tool, the [ICU Data Build
|
||||
Tool](https://github.com/unicode-org/icu/blob/master/docs/userguide/icu_data/buildtool.md),
|
||||
introduced a new tool, the [ICU Data Build Tool](./buildtool.md),
|
||||
to give you more control over what goes into your ICU locale data file.
|
||||
|
||||
> :point_right: **Note**: ICU for C by default comes with pre-built data. The
|
||||
source data files are included as an "icu\*data.zip" file starting in ICU4C 49.
|
||||
Previously, they were not included unless ICU is downloaded from the [source
|
||||
repository](http://site.icu-project.org/repository).
|
||||
> :point_right: **Note**: ICU for C by default comes with pre-built data.
|
||||
> The source data files are included as an "icu\*data.zip" file starting in ICU4C 49.
|
||||
> Previously, they were not included unless ICU is downloaded from the [source repository](http://site.icu-project.org/repository).
|
||||
|
||||
## ICU and CLDR Data
|
||||
|
||||
@ -71,12 +84,13 @@ The ICU data directory is determined as follows:
|
||||
highest data loading performance.
|
||||
|
||||
> :point_right: **Note**: `u_setDataDirectory()` is not thread-safe. Call it
|
||||
before calling ICU APIs from multiple threads. If you use both
|
||||
`u_setDataDirectory()` and `u_init()`, then use `u_setDataDirectory()` first.*
|
||||
*Earlier versions of ICU supported two additional schemes: setting a data
|
||||
directory relative to the location of the ICU shared libraries, and on Windows,
|
||||
taking a location from the registry. These have both been removed to make the
|
||||
behavior more predictable and easier to understand.*
|
||||
> *before* calling ICU APIs from multiple threads. If you use both
|
||||
> `u_setDataDirectory()` and `u_init()`, then use `u_setDataDirectory()` first.
|
||||
>
|
||||
> *Earlier versions of ICU supported two additional schemes: setting a data
|
||||
> directory relative to the location of the ICU shared libraries, and on Windows,
|
||||
> taking a location from the registry. These have both been removed to make the
|
||||
> behavior more predictable and easier to understand.*
|
||||
|
||||
The ICU data directory does not need to be set in order to reference the
|
||||
standard built-in ICU data. Applications that just use standard ICU capabilities
|
||||
@ -87,7 +101,7 @@ data do not need to specify an ICU data directory.
|
||||
|
||||
The ICU data directory string can contain multiple directories as well as .dat
|
||||
path/filenames. They must be separated by the path separator that is used on the
|
||||
platform, for example a semicolon (;) on Windows. Data files will be searched in
|
||||
platform, for example a semicolon (`;`) on Windows. Data files will be searched in
|
||||
all directories and .dat package files in the order of the directory string. For
|
||||
details, see the example below.
|
||||
|
||||
@ -1024,11 +1038,11 @@ required resource is present.
|
||||
|
||||
#### Using additional resource files with ICU4J
|
||||
|
||||
> :point_right: **Note**: Resource file formats can change across releases of
|
||||
ICU4J!
|
||||
*The format of ICU4J resources is not part of the API. Clients who develop their
|
||||
own resources for use with ICU4J should be prepared to regenerate them when they
|
||||
move to new releases of ICU4J.*
|
||||
> :point_right: **Note**: Resource file formats can change across releases of ICU4J!
|
||||
>
|
||||
> *The format of ICU4J resources is not part of the API. Clients who develop their
|
||||
> own resources for use with ICU4J should be prepared to regenerate them when they
|
||||
> move to new releases of ICU4J.*
|
||||
|
||||
We are still developing ICU4J's resource mechanism. Currently it is not possible
|
||||
to mix icu's new binary .res resources with traditional java-style .class or
|
||||
@ -1053,19 +1067,15 @@ corresponding resource files already in that directory.
|
||||
|
||||
1. [ICU4C](http://icu-project.org/download/)
|
||||
|
||||
2. Compilers and tools required for [building
|
||||
ICU4C](https://htmlpreview.github.io/?https://github.com/unicode-org/icu/blob/master/icu4c/readme.html#HowToBuild)
|
||||
.
|
||||
2. Compilers and tools required for [building ICU4C](https://htmlpreview.github.io/?https://github.com/unicode-org/icu/blob/master/icu4c/readme.html#HowToBuild).
|
||||
|
||||
3. J2SE SDK version 5 or above
|
||||
|
||||
#### Procedure
|
||||
|
||||
1. Download and build ICU4C on a Windows or Linux machine. For instructions on
|
||||
downloading and building ICU4C, please click
|
||||
[here](https://htmlpreview.github.io/?https://github.com/unicode-org/icu/blob/master/icu4c/readme.html#HowToBuild)
|
||||
.
|
||||
1. Download and build ICU4C on a Windows or Linux machine. For instructions on downloading and building ICU4C, please click
|
||||
[here](https://htmlpreview.github.io/?https://github.com/unicode-org/icu/blob/master/icu4c/readme.html#HowToBuild).
|
||||
|
||||
2. Follow the remaining instructions in
|
||||
[*$icu4c_root*/source/data/icu4j-readme.txt](https://github.com/unicode-org/icu/blob/master/icu4c/source/data/icu4j-readme.txt)
|
||||
. *$icu4c_root* is the root directory of ICU4C source package.
|
||||
[*$icu4c_root*/source/data/icu4j-readme.txt](https://github.com/unicode-org/icu/blob/master/icu4c/source/data/icu4j-readme.txt).
|
||||
*$icu4c_root* is the root directory of ICU4C source package.
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU4J FAQ
|
||||
nav_order: 7
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU4J FAQ
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
# Overview
|
||||
|
||||
This page contains frequently asked questions about the content provided with
|
||||
the International Components for Unicode for Java as well as basics on
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU FAQ
|
||||
nav_order: 6
|
||||
parent: Misc
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU FAQs
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Introduction to ICU
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Introduction
|
||||
permalink: /userguideintro
|
||||
title: ICU
|
||||
nav_order: 2
|
||||
has_children: true
|
||||
---
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: IO
|
||||
nav_order: 11
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
@ -5,4 +11,5 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
# ICU IO
|
||||
|
||||
The ICU I/O (Unicode stdio) Library is an optional library that provides a stdio-like API with Unicode support.
|
||||
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: ustdio
|
||||
nav_order: 1
|
||||
parent: IO
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: ustream
|
||||
nav_order: 2
|
||||
parent: IO
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
@ -1,33 +1,49 @@
|
||||
---
|
||||
layout: default
|
||||
title: Layout Engine
|
||||
nav_order: 12
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Layout Engine
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Line Layout Deprecation
|
||||
|
||||
> :warning: ***The ICU Line LayoutEngine has been removed in ICU 58.*** It had not had active
|
||||
> development for some time, had many open abugs,
|
||||
> :warning: ***The ICU Line LayoutEngine has been removed in ICU 58.***
|
||||
> It had not had active development for some time, had many open bugs,
|
||||
> and had been deprecated in ICU 54.
|
||||
|
||||
>
|
||||
> Users of ICU Layout are **strongly** encouraged to consider the HarfBuzz project
|
||||
> as a replacement for the ICU Layout Engine. An ICU team member responsible for
|
||||
> the Layout Engine is contributing fixes and features to HarfBuzz, and a drop-in
|
||||
> wrapper is available to allow use of HarfBuzz as a direct replacement for the
|
||||
> ICU layout engine.
|
||||
|
||||
>
|
||||
> HarfBuzz has its own active mailing lists, please use those for discussion of
|
||||
HarfBuzz and its use as a replacement for the ICU layout engine.
|
||||
See:
|
||||
[http://www.freedesktop.org/wiki/Software/HarfBuzz](http://www.freedesktop.org/wiki/Software/HarfBuzz)
|
||||
> HarfBuzz and its use as a replacement for the ICU layout engine.
|
||||
> See: [http://www.freedesktop.org/wiki/Software/HarfBuzz](http://www.freedesktop.org/wiki/Software/HarfBuzz)
|
||||
|
||||
|
||||
> :point_right: **Users of the "layoutex" ParagraphLayout library**: Please see information
|
||||
about how to build "layoutex" on the [Paragraph Layout](paragraph.md) page.
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
:warning: **See deletion/deprecation notice, above.**
|
||||
> :warning: **See the deletion/deprecation notice, above.**
|
||||
|
||||
The Latin script, which is the most commonly used script among software
|
||||
developers, is also the least complex script to display especially when it is
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Paragraph Layout
|
||||
nav_order: 1
|
||||
parent: Layout Engine
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Paragraph Layout
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
# Overview
|
||||
|
||||
This page is about the Paragraph Layout library that is available in ICU4C/C++.
|
||||
|
||||
@ -25,24 +42,24 @@ see: [Layout Engine](index.md).
|
||||
|
||||
### Building the Paragraph Layout library with HarfBuzz
|
||||
|
||||
While the ICU LayoutEngine is deprecated as of ICU 54, the ICU *Paragraph
|
||||
*Layout library is not. The Paragraph Layout library must now be built using the HarfBuzz engine instead of the ICU LayoutEngine.
|
||||
While the ICU LayoutEngine is deprecated as of ICU 54, the ICU *Paragraph* Layout library is not.
|
||||
The Paragraph Layout library must now be built using the HarfBuzz engine instead of the ICU LayoutEngine.
|
||||
|
||||
#### UNIX Makefile instructions / Cygwin / Msys / etc. (ICU 54+)
|
||||
|
||||
The following steps must be completed in order:
|
||||
|
||||
1. Build and install a complete ICU with the **`--disable-layout`
|
||||
`--disable-layoutex`** switches passed to configure
|
||||
2. Build and install HarfBuzz - http://harfbuzz.org (HarfBuzz's use of ICU may
|
||||
1. Build and install a complete ICU with the **`--disable-layout` `--disable-layoutex`**
|
||||
switches passed to configure
|
||||
3. Build and install HarfBuzz - http://harfbuzz.org (HarfBuzz's use of ICU may
|
||||
be enabled or disabled at your choice)
|
||||
3. Build and install the [icu-le-hb](http://harfbuzz.org) library.
|
||||
4. Now, rerun "configure" on the exact **same** ICU workspace used above:
|
||||
4. Build and install the [icu-le-hb](http://harfbuzz.org) library.
|
||||
5. Now, rerun "configure" on the exact **same** ICU workspace used above:
|
||||
* with "icu-le-hb" AND the above-mentioned installed ICU available via
|
||||
pkg-config ( `pkg-config --modversion icu-le-hb` should return a version,
|
||||
such as "0.0.0" )
|
||||
* with the --disable-layout **`--enable-layoutex`** switches passed to configure
|
||||
5. next, run `make install` JUST in the **`source/layoutex`** directory, to install
|
||||
6. next, run `make install` JUST in the **`source/layoutex`** directory, to install
|
||||
libiculx and `icu-lx.pc`
|
||||
|
||||
The above steps will produce a libiculx library that depends on HarfBuzz.
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: Locale Examples
|
||||
nav_order: 1
|
||||
parent: Locales and Resources
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Locales and Resources
|
||||
nav_order: 5
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Locale
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Localizing with ICU
|
||||
nav_order: 3
|
||||
parent: Locales and Resources
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Localizing with ICU
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,16 +1,31 @@
|
||||
---
|
||||
layout: default
|
||||
title: Resources
|
||||
nav_order: 2
|
||||
parent: Locales and Resources
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Resource Management
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
> :point_right: **Note**: This page describes the use of ICU4C Resource
|
||||
> Management techniques and APIs. For an overview of the message localization
|
||||
> process using ICU, see the related page [Localizing with ICU](localizing.md).
|
||||
|
||||
## Overview
|
||||
|
||||
A software product that needs to be localized wins or loses depending on how
|
||||
easy it is to change the data or "resources" which affect users. From the simplest
|
||||
point of view, that data is the information presented to the user (such as a
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Packaging ICU4C
|
||||
nav_order: 3
|
||||
parent: ICU Data
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Packaging ICU4C
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Plug-ins
|
||||
nav_order: 4
|
||||
parent: ICU Data
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Plug-ins
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This page documents the ICU4C DLL Plug-in capability.
|
||||
This feature is a Technology Preview which first appeared in ICU4C version
|
||||
@ -43,18 +60,17 @@ Here is a simple, trivial plugin:
|
||||
|
||||
```c
|
||||
U_CAPI
|
||||
UPlugTokenReturn U_EXPORT2
|
||||
myPlugin(UPlugData *data, UPlugReason reason, UErrorCode *status) {
|
||||
if(reason==UPLUG_REASON_QUERY) {
|
||||
uplug_setPlugName(data, "Simple Plugin"); /* optional */
|
||||
uplug_setPlugLevel(data, UPLUG_LEVEL_HIGH); /* Mandatory */
|
||||
} else if(reason==UPLUG_REASON_LOAD) {
|
||||
/* ... load ... */
|
||||
/* Set up some ICU things here. */
|
||||
} else if(reason==UPLUG_REASON_UNLOAD) {
|
||||
/* ... unload ... */
|
||||
}
|
||||
return UPLUG_TOKEN; /* Mandatory. */
|
||||
UPlugTokenReturn U_EXPORT2 myPlugin (UPlugData *data, UPlugReason reason, UErrorCode *status) {
|
||||
if(reason==UPLUG_REASON_QUERY) {
|
||||
uplug_setPlugName(data, "Simple Plugin"); /* optional */
|
||||
uplug_setPlugLevel(data, UPLUG_LEVEL_HIGH); /* Mandatory */
|
||||
} else if(reason==UPLUG_REASON_LOAD) {
|
||||
/* ... load ... */
|
||||
/* Set up some ICU things here. */
|
||||
} else if(reason==UPLUG_REASON_UNLOAD) {
|
||||
/* ... unload ... */
|
||||
}
|
||||
return UPLUG_TOKEN; /* Mandatory. */
|
||||
}
|
||||
```
|
||||
|
||||
@ -67,11 +83,11 @@ The API contract is:
|
||||
indicate that it is a valid plugin.
|
||||
|
||||
2. when the 'reason' parameter is set to UPLUG_REASON_QUERY, the
|
||||
plugin MUST call uplug_setPlugLevel() to indicate whether it is a high
|
||||
plugin MUST call `uplug_setPlugLevel()` to indicate whether it is a high
|
||||
level or low level plugin.
|
||||
|
||||
3. when the 'reason' parameter is UPLUG_REASON_QUERY, the plugin
|
||||
SHOULD call uplug_setPlugName to indicate a human readable plugin name.
|
||||
SHOULD call `uplug_setPlugName()` to indicate a human-readable plugin name.
|
||||
|
||||
## Configuration
|
||||
|
||||
@ -106,6 +122,7 @@ An example configuration file is, in its entirety:
|
||||
# this is icuplugins44.txt
|
||||
testplug.dll myPlugin hello=world
|
||||
```
|
||||
|
||||
The DLL testplug.dll is opened, and searched for the entrypoint
|
||||
"myPlugin", which must meet the API contract above.
|
||||
The string "hello=world" is passed to the plugin verbatim.
|
||||
@ -131,36 +148,38 @@ encountered which prevented them from loading. Thus, the end user can
|
||||
validate their plugin configuration file to determine if plugins are
|
||||
missing, unloadable, or loaded in the wrong order.
|
||||
For example the following run shows that the plugin named
|
||||
"myPluginFailQuery" did not call uplug_setPlugLevel() and thus failed to
|
||||
"myPluginFailQuery" did not call `uplug_setPlugLevel()` and thus failed to
|
||||
load.
|
||||
|
||||
$ icuinfo -v -L
|
||||
Compiled against ICU 4.3.4, currently running ICU 4.3.4
|
||||
ICUDATA is icudt43l
|
||||
plugin file is: /lib/plugins/icuplugins43.txt
|
||||
Plugins:
|
||||
# Level Name
|
||||
Library:Symbol
|
||||
config| (configuration string)
|
||||
>>> Error | Explanation
|
||||
-----------------------------------
|
||||
|
||||
#1 HIGH Just a Test High-Level Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPlugin
|
||||
config| x=4
|
||||
|
||||
#2 HIGH High Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginHigh
|
||||
config| x=4
|
||||
|
||||
#3 INVALID this plugin did not call uplug_setPlugName()
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginFailQuery
|
||||
config| uery
|
||||
\\\ status| U_PLUGIN_DIDNT_SET_LEVEL
|
||||
/// Error: This plugin did not call uplug_setPlugLevel during QUERY.
|
||||
|
||||
#4 LOW Low Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginLow
|
||||
config| x=4
|
||||
Default locale is en_US
|
||||
Default converter is UTF-8.
|
||||
```
|
||||
$ icuinfo -v -L
|
||||
Compiled against ICU 4.3.4, currently running ICU 4.3.4
|
||||
ICUDATA is icudt43l
|
||||
plugin file is: /lib/plugins/icuplugins43.txt
|
||||
Plugins:
|
||||
# Level Name
|
||||
Library:Symbol
|
||||
config| (configuration string)
|
||||
>>> Error | Explanation
|
||||
-----------------------------------
|
||||
|
||||
#1 HIGH Just a Test High-Level Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPlugin
|
||||
config| x=4
|
||||
|
||||
#2 HIGH High Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginHigh
|
||||
config| x=4
|
||||
|
||||
#3 INVALID this plugin did not call uplug_setPlugName()
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginFailQuery
|
||||
config| uery
|
||||
\\\ status| U_PLUGIN_DIDNT_SET_LEVEL
|
||||
/// Error: This plugin did not call uplug_setPlugLevel during QUERY.
|
||||
|
||||
#4 LOW Low Plugin
|
||||
plugin| /lib/plugins/libplugin.dylib:myPluginLow
|
||||
config| x=4
|
||||
Default locale is en_US
|
||||
Default converter is UTF-8.
|
||||
```
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: C/POSIX Migration
|
||||
permalink: /posix
|
||||
nav_order: 6
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# C/POSIX Migration
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Migration from Standard C and POSIX APIs
|
||||
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: ICU Services
|
||||
permalink: /services
|
||||
nav_order: 4
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# ICU Services
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview of the ICU Services
|
||||
|
||||
|
@ -1,83 +0,0 @@
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
* [Boundary Analysis](boundaryanalysis/index.md)
|
||||
* [Break Rules](boundaryanalysis/break-rules.md)
|
||||
* [Collation](collation/index.md)
|
||||
* [Collation API Details](collation/api.md)
|
||||
* [ICU Collation Service Architecture](collation/architecture.md)
|
||||
* [Collation Concepts](collation/concepts.md)
|
||||
* [Collation Customization](collation/customization/index.md)
|
||||
* [“Ignore Punctuation” Options](collation/customization/ignorepunct.md)
|
||||
* [Collation Examples](collation/examples.md)
|
||||
* [Collation FAQ](collation/faq.md)
|
||||
* [ICU String Search Service](collation/icu-string-search-service.md)
|
||||
* [Conversion](conversion/index.md)
|
||||
* [Compression](conversion/compression.md)
|
||||
* [Using Converters](conversion/converters.md)
|
||||
* [Conversion Data](conversion/data.md)
|
||||
* [Character Set Detection](conversion/detection.md)
|
||||
* [Date/Time Services](datetime/index.md)
|
||||
* [Calendar Classes](datetime/calendar/index.md)
|
||||
* [Calendar Examples](datetime/calendar/examples.md)
|
||||
* [ICU TimeZone Classes](datetime/timezone/index.md)
|
||||
* [Date and Time Zone Examples](datetime/timezone/examples.md)
|
||||
* [Universal Time Scale](datetime/universaltimescale.md)
|
||||
* [ICU Architectural Design](design.md)
|
||||
* [Development](dev/index.md)
|
||||
* [Coding Guidelines](dev/codingguidelines.md)
|
||||
* [Contributions to the ICU library](dev/contributions.md)
|
||||
* [Synchronization Issues](dev/sync/index.md)
|
||||
* [Custom ICU4C Synchronization](dev/sync/custom.md)
|
||||
* [Editing the ICU User Guide](editing.md)
|
||||
* [Formatting and Parsing](index.md)
|
||||
* [Formatting Dates and Times](formatparse/datetime/index.md)
|
||||
* [Date and Time Formatting Examples](formatparse/datetime/examples.md)
|
||||
* [Formatting Messages](formatparse/messages/index.md)
|
||||
* [Message Formatting Examples](formatparse/messages/examples.md)
|
||||
* [Formatting Numbers](formatparse/numbers/index.md)
|
||||
* [RuleBasedNumberFormat Examples](formatparse/numbers/rbnf-examples.md)
|
||||
* [Rounding Modes](formatparse/numbers/rounding-modes.md)
|
||||
* [Glossary](glossary.md)
|
||||
* [How To Use ICU](howtouseicu.md)
|
||||
* [Software Internationalization](i18n.md)
|
||||
* [ICU4J Locale Service Provider](icu4j-locale-service-provider.md)
|
||||
* [ICU Data](icudata.md)
|
||||
* [ICU FAQs](icufaq/index.md)
|
||||
* [ICU4J FAQ](icufaq/icu4j-faq.md)
|
||||
* [Introduction to ICU](intro.md)
|
||||
* [ICU IO](io/index.md)
|
||||
* [C: ustdio](io/ustdio.md)
|
||||
* [C++: ustream](io/ustream.md)
|
||||
* [Layout Engine](layoutengine/index.md)
|
||||
* [Paragraph Layout](layoutengine/paragraph.md)
|
||||
* [Locale](locale/index.md)
|
||||
* [Locale Examples](locale/examples.md)
|
||||
* [Localizing with ICU](locale/localizing.md)
|
||||
* [Resource Management](locale/resources.md)
|
||||
* [Packaging ICU4C](packaging/index.md)
|
||||
* [Packaging ICU4J](packaging-icu4j.md)
|
||||
* [Plug-ins](packaging/plug-ins.md)
|
||||
* [C/POSIX Migration](posix.md)
|
||||
* [ICU Services](services.md)
|
||||
* [Strings](strings/index.md)
|
||||
* [CharacterIterator Class](strings/characteriterator.md)
|
||||
* [Properties](strings/properties.md)
|
||||
* [Regular Expressions](strings/regexp.md)
|
||||
* [StringPrep](strings/stringprep.md)
|
||||
* [UnicodeSet](strings/unicodeset.md)
|
||||
* [UText](strings/utext.md)
|
||||
* [UTF-8](strings/utf-8.md)
|
||||
* [Transforms](transforms/index.md)
|
||||
* [BiDi Algorithm](transforms/bidi.md)
|
||||
* [Case Mappings](transforms/casemappings.md)
|
||||
* [General Transforms](transforms/general/index.md)
|
||||
* [Transform Rule Tutorial](transforms/general/rules.md)
|
||||
* [Normalization](transforms/normalization/index.md)
|
||||
* [Normalization Examples (Obsolete)](transforms/normalization/examples.md)
|
||||
* [Unicode Basics](unicode.md)
|
||||
* [Use From...](usefrom/index.md)
|
||||
* [How To Use ICU4C From COBOL](usefrom/cobol.md)
|
||||
* [Java Native Interface (JNI)](usefrom/jni.md)
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: CharacterIterator
|
||||
permalink: /characteriterator
|
||||
nav_order: 3
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Chars and Strings
|
||||
permalink: /strings
|
||||
nav_order: 3
|
||||
has_children: true
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Properties
|
||||
permalink: /properties
|
||||
nav_order: 2
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: Regular Expressions
|
||||
permalink: /regexp
|
||||
nav_order: 6
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: StringPrep
|
||||
permalink: /stringprep
|
||||
nav_order: 7
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: UnicodeSet
|
||||
permalink: /unicodeset
|
||||
nav_order: 5
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: UText
|
||||
permalink: /utext
|
||||
nav_order: 4
|
||||
parent: Chars and Strings
|
||||
---
|
||||
|
@ -1,7 +1,6 @@
|
||||
---
|
||||
layout: default
|
||||
title: UTF-8
|
||||
permalink: /utf-8
|
||||
nav_order: 1
|
||||
parent: Chars and Strings
|
||||
---
|
||||
@ -75,7 +74,7 @@ macros in `unicode/utf16.h`. The macros handle many cases inline, but call
|
||||
internal functions for complicated parts of the UTF-8 encoding form. For
|
||||
example, the following code snippet counts white space characters in a string:
|
||||
|
||||
```C
|
||||
```c
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/utf8.h"
|
||||
@ -135,9 +134,7 @@ string.
|
||||
|
||||
* *Note: In ICU 4.4 and before, BreakIterator only works with UTF-8 (or any
|
||||
other charset with non-1:1 index conversion to UTF-16) if no dictionary is
|
||||
supported. This excludes Thai word break. See [ticket
|
||||
#5532](http://bugs.icu-project.org/trac/ticket/5532). No fix is currently
|
||||
scheduled.*
|
||||
supported. This excludes Thai word break. See [ticket #5532](https://unicode-org.atlassian.net/browse/ICU-5532).*
|
||||
* *As a workaround for Thai word breaking, you can convert the string to
|
||||
UTF-16 and convert indexes to UTF-8 string indexes via
|
||||
`u_strToUTF8(dest=NULL, destCapacity=0, *destLength gets UTF-8 index).`*
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: BiDi Algorithm
|
||||
nav_order: 2
|
||||
parent: Transforms
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# BiDi Algorithm
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Case Mappings
|
||||
nav_order: 1
|
||||
parent: Transforms
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Case Mappings
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: General Transforms
|
||||
nav_order: 4
|
||||
parent: Transforms
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# General Transforms
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,26 @@
|
||||
---
|
||||
layout: default
|
||||
title: Transform Rule Tutorial
|
||||
nav_order: 5
|
||||
parent: Transforms
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Transform Rule Tutorial
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This tutorial describes the process of building a custom transform based on a
|
||||
set of rules. The tutorial does not describe, in detail, the features of
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Transforms
|
||||
nav_order: 8
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Transforms
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: Normalization
|
||||
nav_order: 3
|
||||
parent: Transforms
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Normalization
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
@ -58,7 +73,7 @@ Here is a summary of the differences:
|
||||
composition](http://www.unicode.org/notes/tn5/#FCC) which is almost the same
|
||||
as NFC/NFKC except that the normalized form also passes the FCD test. This
|
||||
is also supported for any standard or custom data file.
|
||||
* Quick check: There is a new spanQuickCheckYes() function for an optimized
|
||||
* Quick check: There is a new `spanQuickCheckYes()` function for an optimized
|
||||
combination of quick check and normalization.
|
||||
* Filtered: The new FilteredNormalizer2 class combines a Normalizer2 instance
|
||||
with a UnicodeSet to limit normalization to certain characters. For example,
|
||||
@ -88,12 +103,12 @@ The new API does not replace a few pieces of the old API:
|
||||
## Data File Syntax
|
||||
|
||||
The gennorm2 tool accepts one or more .txt files and generates a .nrm binary
|
||||
data file for Normalizer2.getInstance(). For gennorm2 command line options,
|
||||
invoke gennorm2 --help.
|
||||
data file for `Normalizer2.getInstance()`. For gennorm2 command line options,
|
||||
invoke `gennorm2 --help`.
|
||||
|
||||
gennorm2 starts with no data. If you want to include standard Unicode
|
||||
Normalization data, use the files in
|
||||
[{ICU4C}/source/data/unidata/norm2/](http://bugs.icu-project.org/trac/browser/trunk/icu4c/source/data/unidata/norm2)
|
||||
[{ICU4C}/source/data/unidata/norm2/](https://github.com/unicode-org/icu/tree/master/icu4c/source/data/unidata/norm2)
|
||||
. You can modify one of them, or provide it together with one or more additional
|
||||
files that add or remove mappings.
|
||||
|
||||
@ -124,19 +139,23 @@ mappings that are forbidden by the Unicode Normalization algorithms are reported
|
||||
as errors. For example, if a character has a two-way mapping, then neither of
|
||||
its mapping characters can have a one-way mapping.
|
||||
|
||||
* Unicode 6.1 # Optional Unicode version (since ICU 49; default: uchar.h U_UNICODE_VERSION)
|
||||
00E1=0061 0301 # Two-way mapping
|
||||
00AA>0061 # One-way mapping
|
||||
0300..0314:230 # ccc for a code point range
|
||||
0315:232 # ccc for a single code point
|
||||
0132..0133>0069 006A # Range, each code point mapping to "ij"
|
||||
E0000..E0FFF> # Range, each code point mapping to the empty string
|
||||
```
|
||||
* Unicode 6.1 # Optional Unicode version (since ICU 49; default: uchar.h U_UNICODE_VERSION)
|
||||
00E1=0061 0301 # Two-way mapping
|
||||
00AA>0061 # One-way mapping
|
||||
0300..0314:230 # ccc for a code point range
|
||||
0315:232 # ccc for a single code point
|
||||
0132..0133>0069 006A # Range, each code point mapping to "ij"
|
||||
E0000..E0FFF> # Range, each code point mapping to the empty string
|
||||
```
|
||||
|
||||
It is possible to override mappings from previous source files, including
|
||||
removing a mapping:
|
||||
|
||||
```
|
||||
00AA-
|
||||
E0000..E0FFF-
|
||||
```
|
||||
|
||||
## Data Generation Tool
|
||||
|
||||
@ -145,7 +164,9 @@ processed, and a binary data file is written for use by the ICU library (same
|
||||
file for C++ and Java). The binary data file format changes occasionally in
|
||||
order to support additional functionality.
|
||||
|
||||
```shell
|
||||
bin/gennorm2 -v -o $ICU4C_DATA_IN/nfkc_cf.nrm -s $ICU4C_UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt
|
||||
```
|
||||
|
||||
For the complete set of options, invoke `gennorm2 --help`.
|
||||
|
||||
@ -153,13 +174,17 @@ Instead of the binary data file, the processed data can be written into a C
|
||||
file. This is closely tied to the needs of the ICU library. The format may
|
||||
change from one ICU version to the next.
|
||||
|
||||
```shell
|
||||
bin/gennorm2 -v -o $ICU_SRC/icu4c/source/common/norm2_nfc_data.h -s $ICU4C_UNIDATA/norm2 nfc.txt --csource
|
||||
```
|
||||
|
||||
With the --combined option, gennorm2 writes the combined data of the input
|
||||
files. The following example writes the combined NFKC_Casefold data. (New in ICU
|
||||
60.)
|
||||
|
||||
```shell
|
||||
bin/gennorm2 -o /tmp/nfkc_cf.txt -s $ICU4C_UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt --combined
|
||||
```
|
||||
|
||||
With the "minus" operator, gennorm2 writes the diffs of the combined data from
|
||||
two sets of input files. (New in ICU 60.)
|
||||
@ -169,47 +194,53 @@ extracted from the UCD file DerivedNormalizationProps.txt. It is not minimal.
|
||||
The following command line generates the minimal differences of NFKC_Casefold
|
||||
compared with NFKC.
|
||||
|
||||
```shell
|
||||
bin/gennorm2 -o /tmp/nfkc_cf-minus-nfkc.txt -s $ICU4C_UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt minus nfc.txt nfkc.txt
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
class NormSample {
|
||||
public:
|
||||
// ICU service objects should be cached and reused, as usual.
|
||||
NormSample(UErrorCode &errorCode)
|
||||
: nfkc(*Normalizer2::getNFKCInstance(errorCode),
|
||||
fcd(*Normalizer2::getInstance(NULL, "nfc", UNORM2_FCD, errorCode) {}
|
||||
// Normalize a string.
|
||||
UnicodeString toNFKC(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
return nfkc.normalize(s, errorCode);
|
||||
}
|
||||
// Ensure FCD before processing (like in sort key generation).
|
||||
// In practice, almost all strings pass the FCD test, so it might make sense to
|
||||
// test for it and only normalize when necessary, rather than always normalizing.
|
||||
void processText(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
UnicodeString fcdString;
|
||||
const UnicodeString *ps; // points to either s or fcdString
|
||||
int32_t spanQCYes=fcd.spanQuickCheckYes(s, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return; // report error
|
||||
}
|
||||
if(spanQCYes==s.length()) {
|
||||
ps=&s; // s is already in FCD
|
||||
} else {
|
||||
// unnormalized suffix as a read-only alias (does not copy characters)
|
||||
UnicodeString unnormalized=s.tempSubString(spanQCYes);
|
||||
// set the fcdString to the FCD prefix as a read-only alias
|
||||
fcdString.setTo(FALSE, s.getBuffer(), spanQCYes);
|
||||
// automatic copy-on-write, and append the FCD'ed suffix
|
||||
fcd.normalizeSecondAndAppend(fcdString, unnormalized, errorCode);
|
||||
ps=&fcdString;
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return; // report error
|
||||
}
|
||||
}
|
||||
// ... now process the string *ps which is in FCD ...
|
||||
}
|
||||
private:
|
||||
const Normalizer2 &nfkc;
|
||||
const Normalizer2 &fcd;
|
||||
};
|
||||
```cpp
|
||||
class NormSample {
|
||||
public:
|
||||
// ICU service objects should be cached and reused, as usual.
|
||||
NormSample(UErrorCode &errorCode)
|
||||
: nfkc(*Normalizer2::getNFKCInstance(errorCode)),
|
||||
fcd(*Normalizer2::getInstance(NULL, "nfc", UNORM2_FCD, errorCode)) {}
|
||||
|
||||
// Normalize a string.
|
||||
UnicodeString toNFKC(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
return nfkc.normalize(s, errorCode);
|
||||
}
|
||||
|
||||
// Ensure FCD before processing (like in sort key generation).
|
||||
// In practice, almost all strings pass the FCD test, so it might make sense to
|
||||
// test for it and only normalize when necessary, rather than always normalizing.
|
||||
void processText(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
UnicodeString fcdString;
|
||||
const UnicodeString *ps; // points to either s or fcdString
|
||||
int32_t spanQCYes=fcd.spanQuickCheckYes(s, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return; // report error
|
||||
}
|
||||
if(spanQCYes==s.length()) {
|
||||
ps=&s; // s is already in FCD
|
||||
} else {
|
||||
// unnormalized suffix as a read-only alias (does not copy characters)
|
||||
UnicodeString unnormalized=s.tempSubString(spanQCYes);
|
||||
// set the fcdString to the FCD prefix as a read-only alias
|
||||
fcdString.setTo(FALSE, s.getBuffer(), spanQCYes);
|
||||
// automatic copy-on-write, and append the FCD'ed suffix
|
||||
fcd.normalizeSecondAndAppend(fcdString, unnormalized, errorCode);
|
||||
ps=&fcdString;
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return; // report error
|
||||
}
|
||||
}
|
||||
// ... now process the string *ps which is in FCD ...
|
||||
}
|
||||
private:
|
||||
const Normalizer2 &nfkc;
|
||||
const Normalizer2 &fcd;
|
||||
};
|
||||
```
|
||||
|
@ -1,9 +1,8 @@
|
||||
---
|
||||
layout: default
|
||||
title: Unicode Basics
|
||||
permalink: /unicode
|
||||
nav_order: 3
|
||||
parent: Introduction
|
||||
parent: ICU
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
@ -11,6 +10,15 @@ License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# Unicode Basics
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Introduction to Unicode
|
||||
|
||||
|
@ -1,9 +1,24 @@
|
||||
---
|
||||
layout: default
|
||||
title: COBOL
|
||||
nav_order: 1
|
||||
parent: Use From...
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
-->
|
||||
|
||||
# How To Use ICU4C From COBOL
|
||||
{: .no_toc }
|
||||
|
||||
## Contents
|
||||
{: .no_toc .text-delta }
|
||||
|
||||
1. TOC
|
||||
{:toc}
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
|
@ -1,3 +1,9 @@
|
||||
---
|
||||
layout: default
|
||||
title: Use From...
|
||||
nav_order: 14
|
||||
has_children: true
|
||||
---
|
||||
<!--
|
||||
© 2020 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
Loading…
Reference in New Issue
Block a user