ICU-21176 Add aliases for terms "whitelist" and "blacklist" in data filter

See #1189
This commit is contained in:
Shane F. Carr 2020-08-11 18:06:45 -05:00
parent 7997955f2d
commit bf2c2c5ca7
4 changed files with 119 additions and 41 deletions

View File

@ -8,18 +8,29 @@
]
},
// Test mixed feature filter and resource filter
// Exlude translit data so we can run test for ICU-20673
// Exclude translit data so we can run test for ICU-20673
// Also test for "whitelist" versus "includelist"
"featureFilters": {
"misc": {
"whitelist": ["supplementalData"]
},
"translit": "exclude"
"translit": "exclude",
"curr_tree": {
"filterType": "locale",
"includelist": ["my"]
},
"brkitr_rules": {
"excludelist": ["line"]
},
"brkitr_dictionaries": {
"blacklist": ["cjdict"]
}
},
"resourceFilters": [
{
"categories": ["misc"],
"files": {
"whitelist": ["supplementalData"]
"includelist": ["supplementalData"]
},
"rules": ["+/*"]
}

View File

@ -76,7 +76,7 @@ languages:
{
"localeFilter": {
"filterType": "language",
"whitelist": [
"includelist": [
"en",
"de",
"zh"
@ -86,6 +86,11 @@ languages:
The *filterType* "language" only supports slicing by entire languages.
##### Terminology: Includelist, Excludelist, Whitelist, Blacklist
Prior to ICU 68, use `"whitelist"` and `"blacklist"` instead of `"includelist"`
and `"excludelist"`, respectively. ICU 68 allows all four terms.
#### Filtering by Locale
For more control, use *filterType* "locale". Here is a *filters.hjson* file that
@ -94,13 +99,15 @@ only the default script (e.g., Simplified Han for Chinese):
localeFilter: {
filterType: locale
whitelist: [
includelist: [
en
de
zh
]
}
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
#### Adding Script Variants (includeScripts = true)
You may set the *includeScripts* option to true to include all scripts for a
@ -112,7 +119,7 @@ Chinese are included:
"localeFilter": {
"filterType": "locale",
"includeScripts": true,
"whitelist": [
"includelist": [
"en",
"de",
"zh"
@ -120,6 +127,8 @@ Chinese are included:
}
}
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
If you wish to explicitly list the scripts, you may put the script code in the
locale tag in the includelist, and you do not need the *includeScripts* option
enabled. For example, in Hjson, to include Han Traditional ***but not Han
@ -127,14 +136,16 @@ Simplified***:
localeFilter: {
filterType: locale
whitelist: [
includelist: [
en
de
zh_Hant
]
}
Note: the option *includeScripts* is only supported at the language level;
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
**Note:** the option *includeScripts* is only supported at the language level;
i.e., in order to include all scripts for a particular language, you must
specify the language alone, without a region tag.
@ -150,7 +161,7 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
localeFilter: {
filterType: locale
includeChildren: false
whitelist: [
includelist: [
en_US
en_GB
de_DE
@ -158,6 +169,8 @@ German (Switzerland), or Chinese (Taiwan, Han Traditional):
]
}
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
Including dependencies, the above filter would include the following data files:
- root.txt
@ -285,7 +298,7 @@ dictionaries:
featureFilters: {
brkitr_dictionaries: {
whitelist: [
includelist: [
burmesedict
]
}
@ -295,7 +308,8 @@ Do *not* include directories or file extensions. They will be added
automatically for you. Note that all files in a particular category have the
same directory and extension.
You can use either a whitelist or a blacklist for the file name filter.
You can use either `"includelist"` or `"excludelist"` for the file name filter.
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
##### Regex Filter
@ -305,7 +319,7 @@ To exclude filenames matching a certain regular expression, use *filterType*
featureFilters: {
brkitr_rules: {
filterType: regex
blacklist: [
excludelist: [
^.*_cj$
]
}
@ -353,12 +367,14 @@ the common locales specified in *localeFilter*, you can do the following:
featureFilters:
curr_tree: {
filterType: locale
whitelist: [
includelist: [
it
]
}
}
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
You can exclude an entire `_tree` category without affecting other categories.
For example, to exclude region display names:
@ -446,7 +462,7 @@ following (this example removes calendar data):
{
categories: ["misc"]
files: {
whitelist: ["supplementalData"]
includelist: ["supplementalData"]
}
rules: [
-/calendarData
@ -454,6 +470,8 @@ following (this example removes calendar data):
}
]
*If using ICU 67 or earlier, see note above regarding allowed keywords.*
#### Combining Multiple Resource Filter Specs
You can also list multiple resource filter objects in the *resourceFilters*
@ -474,7 +492,7 @@ en-CA; this also makes use of the *files* option:
categories: ["unit_tree"]
files: {
filterType: locale
whitelist: ["en_US"]
includelist: ["en_US"]
}
rules: [
+/*/length/mile
@ -484,7 +502,7 @@ en-CA; this also makes use of the *files* option:
categories: ["unit_tree"]
files: {
filterType: locale
whitelist: ["en_CA"]
includelist: ["en_CA"]
}
rules: [
+/*/length/kilometer

View File

@ -78,15 +78,22 @@ class ExclusionFilter(Filter):
return False
class WhitelistBlacklistFilter(Filter):
class IncludeExcludeFilter(Filter):
def __init__(self, json_data):
if "whitelist" in json_data:
self.is_whitelist = True
self.whitelist = json_data["whitelist"]
self.is_includelist = True
self.includelist = json_data["whitelist"]
elif "includelist" in json_data:
self.is_includelist = True
self.includelist = json_data["includelist"]
elif "blacklist" in json_data:
self.is_includelist = False
self.excludelist = json_data["blacklist"]
elif "excludelist" in json_data:
self.is_includelist = False
self.excludelist = json_data["excludelist"]
else:
assert "blacklist" in json_data, "Need either whitelist or blacklist: %s" % str(json_data)
self.is_whitelist = False
self.blacklist = json_data["blacklist"]
raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))
def match(self, file):
file_stem = self._file_to_file_stem(file)
@ -97,43 +104,43 @@ class WhitelistBlacklistFilter(Filter):
pass
class FileStemFilter(WhitelistBlacklistFilter):
class FileStemFilter(IncludeExcludeFilter):
def _should_include(self, file_stem):
if self.is_whitelist:
return file_stem in self.whitelist
if self.is_includelist:
return file_stem in self.includelist
else:
return file_stem not in self.blacklist
return file_stem not in self.excludelist
class LanguageFilter(WhitelistBlacklistFilter):
class LanguageFilter(IncludeExcludeFilter):
def _should_include(self, file_stem):
language = file_stem.split("_")[0]
if language == "root":
# Always include root.txt
return True
if self.is_whitelist:
return language in self.whitelist
if self.is_includelist:
return language in self.includelist
else:
return language not in self.blacklist
return language not in self.excludelist
class RegexFilter(WhitelistBlacklistFilter):
class RegexFilter(IncludeExcludeFilter):
def __init__(self, *args):
# TODO(ICU-20301): Change this to: super().__init__(*args)
super(RegexFilter, self).__init__(*args)
if self.is_whitelist:
self.whitelist = [re.compile(pat) for pat in self.whitelist]
if self.is_includelist:
self.includelist = [re.compile(pat) for pat in self.includelist]
else:
self.blacklist = [re.compile(pat) for pat in self.blacklist]
self.excludelist = [re.compile(pat) for pat in self.excludelist]
def _should_include(self, file_stem):
if self.is_whitelist:
for pattern in self.whitelist:
if self.is_includelist:
for pattern in self.includelist:
if pattern.match(file_stem):
return True
return False
else:
for pattern in self.blacklist:
for pattern in self.excludelist:
if pattern.match(file_stem):
return False
return True
@ -159,7 +166,12 @@ LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
class LocaleFilter(Filter):
def __init__(self, json_data, io):
self.locales_requested = list(json_data["whitelist"])
if "whitelist" in json_data:
self.locales_requested = list(json_data["whitelist"])
elif "includelist" in json_data:
self.locales_requested = list(json_data["includelist"])
else:
raise AssertionError("You must have an includelist in a locale filter")
self.include_children = json_data.get("includeChildren", True)
self.include_scripts = json_data.get("includeScripts", False)

View File

@ -90,7 +90,7 @@
{
"properties": {
"filterType": {
"$ref": "#/definitions/blacklistWhitelistFilterTypes"
"$ref": "#/definitions/includeExcludeFilterTypes"
},
"whitelist": { "$ref": "#/definitions/stringList" }
},
@ -100,13 +100,33 @@
{
"properties": {
"filterType": {
"$ref": "#/definitions/blacklistWhitelistFilterTypes"
"$ref": "#/definitions/includeExcludeFilterTypes"
},
"blacklist": { "$ref": "#/definitions/stringList" }
},
"required": ["blacklist"],
"additionalProperties": false
},
{
"properties": {
"filterType": {
"$ref": "#/definitions/includeExcludeFilterTypes"
},
"includelist": { "$ref": "#/definitions/stringList" }
},
"required": ["includelist"],
"additionalProperties": false
},
{
"properties": {
"filterType": {
"$ref": "#/definitions/includeExcludeFilterTypes"
},
"excludelist": { "$ref": "#/definitions/stringList" }
},
"required": ["excludelist"],
"additionalProperties": false
},
{
"properties": {
"filterType": {
@ -134,6 +154,23 @@
"required": ["filterType", "whitelist"],
"additionalProperties": false
},
{
"properties": {
"filterType": {
"type": "string",
"enum": ["locale"]
},
"includeChildren": {
"type": "boolean"
},
"includeScripts": {
"type": "boolean"
},
"includelist": { "$ref": "#/definitions/stringList" }
},
"required": ["filterType", "includelist"],
"additionalProperties": false
},
{
"properties": {
"filterType": {
@ -150,7 +187,7 @@
}
]
},
"blacklistWhitelistFilterTypes": {
"includeExcludeFilterTypes": {
"type": "string",
"enum": [
"language",