From ecd660579b645461e4a4a4d87f869096c5224023 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 29 Apr 2010 00:33:58 +0000 Subject: [PATCH] ICU-7144 handle deviation characters and disallowed ASCII characters in code X-SVN-Rev: 27999 --- tools/unicode/py/idna2nrm.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py index 58bb98c8d80..6d1eac03041 100755 --- a/tools/unicode/py/idna2nrm.py +++ b/tools/unicode/py/idna2nrm.py @@ -17,12 +17,22 @@ __author__ = "Markus Scherer" import re replacements = [ + # Pass through disallowed ASCII characters: Handled in code. + (re.compile(r"0000..002C ; disallowed"), "# 0000..002C (allow ASCII)"), + (re.compile(r"002F ; disallowed"), "# 002F (allow ASCII)"), + (re.compile(r"003A..0040 ; disallowed"), "# 003A..0040 (allow ASCII)"), + (re.compile(r"005B..0060 ; disallowed"), "# 005B..0060 (allow ASCII)"), + (re.compile(r"007B..00A0 ; disallowed #"), + "0080..00A0 >FFFD # (allow ASCII)"), + # Normal transformations. (re.compile(r"; disallowed "), ">FFFD"), (re.compile(r"; ignored "), ">"), (re.compile(r"^([^;]+) ; valid"), r"# \1valid"), (re.compile(r"; mapped ; "), ">"), - (re.compile(r"; deviation ; "), ">"), + (re.compile(r"^([^;]+) ; deviation"), r"# \1deviation"), (re.compile(r" +(\# [^\#]+)$"), r" \1"), + # Two versions of avoiding circular FFFD>FFFD mappings, + # depending on the version of the input file. (re.compile(r"\.\.FFFD"), "..FFFC"), (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC") ] @@ -44,9 +54,13 @@ for line in in_file: # s/; ignored />/ # s/^([^;]+) ; valid/# \1valid/ # s/; mapped ; />/ -# s/; deviation ; />/ +# s/^([^;]+) ; deviation/# \1deviation/ # s/ +(\# [^\#]+)$/ \1/ # +# Except: Disallowed ASCII characters are passed through; +# they are handled in code. +# Deviation characters are also handled in code. +# # A circular mapping FFFD>FFFD is avoided by rewriting the line that contains # ..FFFD to contain ..FFFC instead. #