ICU-21176 Add aliases for terms "whitelist" and "blacklist" in data filter
See #1189
This commit is contained in:
parent
7997955f2d
commit
bf2c2c5ca7
@ -8,18 +8,29 @@
|
||||
]
|
||||
},
|
||||
// Test mixed feature filter and resource filter
|
||||
// Exlude translit data so we can run test for ICU-20673
|
||||
// Exclude translit data so we can run test for ICU-20673
|
||||
// Also test for "whitelist" versus "includelist"
|
||||
"featureFilters": {
|
||||
"misc": {
|
||||
"whitelist": ["supplementalData"]
|
||||
},
|
||||
"translit": "exclude"
|
||||
"translit": "exclude",
|
||||
"curr_tree": {
|
||||
"filterType": "locale",
|
||||
"includelist": ["my"]
|
||||
},
|
||||
"brkitr_rules": {
|
||||
"excludelist": ["line"]
|
||||
},
|
||||
"brkitr_dictionaries": {
|
||||
"blacklist": ["cjdict"]
|
||||
}
|
||||
},
|
||||
"resourceFilters": [
|
||||
{
|
||||
"categories": ["misc"],
|
||||
"files": {
|
||||
"whitelist": ["supplementalData"]
|
||||
"includelist": ["supplementalData"]
|
||||
},
|
||||
"rules": ["+/*"]
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ languages:
|
||||
{
|
||||
"localeFilter": {
|
||||
"filterType": "language",
|
||||
"whitelist": [
|
||||
"includelist": [
|
||||
"en",
|
||||
"de",
|
||||
"zh"
|
||||
@ -86,6 +86,11 @@ languages:
|
||||
|
||||
The *filterType* "language" only supports slicing by entire languages.
|
||||
|
||||
##### Terminology: Includelist, Excludelist, Whitelist, Blacklist
|
||||
|
||||
Prior to ICU 68, use `"whitelist"` and `"blacklist"` instead of `"includelist"`
|
||||
and `"excludelist"`, respectively. ICU 68 allows all four terms.
|
||||
|
||||
#### Filtering by Locale
|
||||
|
||||
For more control, use *filterType* "locale". Here is a *filters.hjson* file that
|
||||
@ -94,13 +99,15 @@ only the default script (e.g., Simplified Han for Chinese):
|
||||
|
||||
localeFilter: {
|
||||
filterType: locale
|
||||
whitelist: [
|
||||
includelist: [
|
||||
en
|
||||
de
|
||||
zh
|
||||
]
|
||||
}
|
||||
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
#### Adding Script Variants (includeScripts = true)
|
||||
|
||||
You may set the *includeScripts* option to true to include all scripts for a
|
||||
@ -112,7 +119,7 @@ Chinese are included:
|
||||
"localeFilter": {
|
||||
"filterType": "locale",
|
||||
"includeScripts": true,
|
||||
"whitelist": [
|
||||
"includelist": [
|
||||
"en",
|
||||
"de",
|
||||
"zh"
|
||||
@ -120,6 +127,8 @@ Chinese are included:
|
||||
}
|
||||
}
|
||||
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
If you wish to explicitly list the scripts, you may put the script code in the
|
||||
locale tag in the includelist, and you do not need the *includeScripts* option
|
||||
enabled. For example, in Hjson, to include Han Traditional ***but not Han
|
||||
@ -127,14 +136,16 @@ Simplified***:
|
||||
|
||||
localeFilter: {
|
||||
filterType: locale
|
||||
whitelist: [
|
||||
includelist: [
|
||||
en
|
||||
de
|
||||
zh_Hant
|
||||
]
|
||||
}
|
||||
|
||||
Note: the option *includeScripts* is only supported at the language level;
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
**Note:** the option *includeScripts* is only supported at the language level;
|
||||
i.e., in order to include all scripts for a particular language, you must
|
||||
specify the language alone, without a region tag.
|
||||
|
||||
@ -150,7 +161,7 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
|
||||
localeFilter: {
|
||||
filterType: locale
|
||||
includeChildren: false
|
||||
whitelist: [
|
||||
includelist: [
|
||||
en_US
|
||||
en_GB
|
||||
de_DE
|
||||
@ -158,6 +169,8 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
|
||||
]
|
||||
}
|
||||
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
Including dependencies, the above filter would include the following data files:
|
||||
|
||||
- root.txt
|
||||
@ -285,7 +298,7 @@ dictionaries:
|
||||
|
||||
featureFilters: {
|
||||
brkitr_dictionaries: {
|
||||
whitelist: [
|
||||
includelist: [
|
||||
burmesedict
|
||||
]
|
||||
}
|
||||
@ -295,7 +308,8 @@ Do *not* include directories or file extensions. They will be added
|
||||
automatically for you. Note that all files in a particular category have the
|
||||
same directory and extension.
|
||||
|
||||
You can use either a whitelist or a blacklist for the file name filter.
|
||||
You can use either `"includelist"` or `"excludelist"` for the file name filter.
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
##### Regex Filter
|
||||
|
||||
@ -305,7 +319,7 @@ To exclude filenames matching a certain regular expression, use *filterType*
|
||||
featureFilters: {
|
||||
brkitr_rules: {
|
||||
filterType: regex
|
||||
blacklist: [
|
||||
excludelist: [
|
||||
^.*_cj$
|
||||
]
|
||||
}
|
||||
@ -353,12 +367,14 @@ the common locales specified in *localeFilter*, you can do the following:
|
||||
featureFilters:
|
||||
curr_tree: {
|
||||
filterType: locale
|
||||
whitelist: [
|
||||
includelist: [
|
||||
it
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
You can exclude an entire `_tree` category without affecting other categories.
|
||||
For example, to exclude region display names:
|
||||
|
||||
@ -446,7 +462,7 @@ following (this example removes calendar data):
|
||||
{
|
||||
categories: ["misc"]
|
||||
files: {
|
||||
whitelist: ["supplementalData"]
|
||||
includelist: ["supplementalData"]
|
||||
}
|
||||
rules: [
|
||||
-/calendarData
|
||||
@ -454,6 +470,8 @@ following (this example removes calendar data):
|
||||
}
|
||||
]
|
||||
|
||||
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
|
||||
|
||||
#### Combining Multiple Resource Filter Specs
|
||||
|
||||
You can also list multiple resource filter objects in the *resourceFilters*
|
||||
@ -474,7 +492,7 @@ en-CA; this also makes use of the *files* option:
|
||||
categories: ["unit_tree"]
|
||||
files: {
|
||||
filterType: locale
|
||||
whitelist: ["en_US"]
|
||||
includelist: ["en_US"]
|
||||
}
|
||||
rules: [
|
||||
+/*/length/mile
|
||||
@ -484,7 +502,7 @@ en-CA; this also makes use of the *files* option:
|
||||
categories: ["unit_tree"]
|
||||
files: {
|
||||
filterType: locale
|
||||
whitelist: ["en_CA"]
|
||||
includelist: ["en_CA"]
|
||||
}
|
||||
rules: [
|
||||
+/*/length/kilometer
|
||||
|
@ -78,15 +78,22 @@ class ExclusionFilter(Filter):
|
||||
return False
|
||||
|
||||
|
||||
class WhitelistBlacklistFilter(Filter):
|
||||
class IncludeExcludeFilter(Filter):
|
||||
def __init__(self, json_data):
|
||||
if "whitelist" in json_data:
|
||||
self.is_whitelist = True
|
||||
self.whitelist = json_data["whitelist"]
|
||||
self.is_includelist = True
|
||||
self.includelist = json_data["whitelist"]
|
||||
elif "includelist" in json_data:
|
||||
self.is_includelist = True
|
||||
self.includelist = json_data["includelist"]
|
||||
elif "blacklist" in json_data:
|
||||
self.is_includelist = False
|
||||
self.excludelist = json_data["blacklist"]
|
||||
elif "excludelist" in json_data:
|
||||
self.is_includelist = False
|
||||
self.excludelist = json_data["excludelist"]
|
||||
else:
|
||||
assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
|
||||
self.is_whitelist = False
|
||||
self.blacklist = json_data["blacklist"]
|
||||
raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))
|
||||
|
||||
def match(self, file):
|
||||
file_stem = self._file_to_file_stem(file)
|
||||
@ -97,43 +104,43 @@ class WhitelistBlacklistFilter(Filter):
|
||||
pass
|
||||
|
||||
|
||||
class FileStemFilter(WhitelistBlacklistFilter):
|
||||
class FileStemFilter(IncludeExcludeFilter):
|
||||
def _should_include(self, file_stem):
|
||||
if self.is_whitelist:
|
||||
return file_stem in self.whitelist
|
||||
if self.is_includelist:
|
||||
return file_stem in self.includelist
|
||||
else:
|
||||
return file_stem not in self.blacklist
|
||||
return file_stem not in self.excludelist
|
||||
|
||||
|
||||
class LanguageFilter(WhitelistBlacklistFilter):
|
||||
class LanguageFilter(IncludeExcludeFilter):
|
||||
def _should_include(self, file_stem):
|
||||
language = file_stem.split("_")[0]
|
||||
if language == "root":
|
||||
# Always include root.txt
|
||||
return True
|
||||
if self.is_whitelist:
|
||||
return language in self.whitelist
|
||||
if self.is_includelist:
|
||||
return language in self.includelist
|
||||
else:
|
||||
return language not in self.blacklist
|
||||
return language not in self.excludelist
|
||||
|
||||
|
||||
class RegexFilter(WhitelistBlacklistFilter):
|
||||
class RegexFilter(IncludeExcludeFilter):
|
||||
def __init__(self, *args):
|
||||
# TODO(ICU-20301): Change this to: super().__init__(*args)
|
||||
super(RegexFilter, self).__init__(*args)
|
||||
if self.is_whitelist:
|
||||
self.whitelist = [re.compile(pat) for pat in self.whitelist]
|
||||
if self.is_includelist:
|
||||
self.includelist = [re.compile(pat) for pat in self.includelist]
|
||||
else:
|
||||
self.blacklist = [re.compile(pat) for pat in self.blacklist]
|
||||
self.excludelist = [re.compile(pat) for pat in self.excludelist]
|
||||
|
||||
def _should_include(self, file_stem):
|
||||
if self.is_whitelist:
|
||||
for pattern in self.whitelist:
|
||||
if self.is_includelist:
|
||||
for pattern in self.includelist:
|
||||
if pattern.match(file_stem):
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
for pattern in self.blacklist:
|
||||
for pattern in self.excludelist:
|
||||
if pattern.match(file_stem):
|
||||
return False
|
||||
return True
|
||||
@ -159,7 +166,12 @@ LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
|
||||
|
||||
class LocaleFilter(Filter):
|
||||
def __init__(self, json_data, io):
|
||||
if "whitelist" in json_data:
|
||||
self.locales_requested = list(json_data["whitelist"])
|
||||
elif "includelist" in json_data:
|
||||
self.locales_requested = list(json_data["includelist"])
|
||||
else:
|
||||
raise AssertionError("You must have an includelist in a locale filter")
|
||||
self.include_children = json_data.get("includeChildren", True)
|
||||
self.include_scripts = json_data.get("includeScripts", False)
|
||||
|
||||
|
@ -90,7 +90,7 @@
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
"$ref": "#/definitions/blacklistWhitelistFilterTypes"
|
||||
"$ref": "#/definitions/includeExcludeFilterTypes"
|
||||
},
|
||||
"whitelist": { "$ref": "#/definitions/stringList" }
|
||||
},
|
||||
@ -100,13 +100,33 @@
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
"$ref": "#/definitions/blacklistWhitelistFilterTypes"
|
||||
"$ref": "#/definitions/includeExcludeFilterTypes"
|
||||
},
|
||||
"blacklist": { "$ref": "#/definitions/stringList" }
|
||||
},
|
||||
"required": ["blacklist"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
"$ref": "#/definitions/includeExcludeFilterTypes"
|
||||
},
|
||||
"includelist": { "$ref": "#/definitions/stringList" }
|
||||
},
|
||||
"required": ["includelist"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
"$ref": "#/definitions/includeExcludeFilterTypes"
|
||||
},
|
||||
"excludelist": { "$ref": "#/definitions/stringList" }
|
||||
},
|
||||
"required": ["excludelist"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
@ -134,6 +154,23 @@
|
||||
"required": ["filterType", "whitelist"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
"type": "string",
|
||||
"enum": ["locale"]
|
||||
},
|
||||
"includeChildren": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"includeScripts": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"includelist": { "$ref": "#/definitions/stringList" }
|
||||
},
|
||||
"required": ["filterType", "includelist"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
{
|
||||
"properties": {
|
||||
"filterType": {
|
||||
@ -150,7 +187,7 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"blacklistWhitelistFilterTypes": {
|
||||
"includeExcludeFilterTypes": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"language",
|
||||
|
Loading…
Reference in New Issue
Block a user