Skip to content

Commit b665e66

Browse files
Updates for Serhiy
1 parent acb61d0 commit b665e66

1 file changed

Lines changed: 30 additions & 31 deletions

File tree

Tools/unicode/makeunicodedata.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ def makeunicodedata(unicode, trace):
208208
# extract database properties
209209
category = CATEGORY_NAMES.index(record.general_category)
210210
combining = int(record.canonical_combining_class)
211-
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
211+
bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_classes[char])
212212
mirrored = record.bidi_mirrored == "Y"
213213
normalizationquickcheck = record.quick_check
214214
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
@@ -217,10 +217,7 @@ def makeunicodedata(unicode, trace):
217217
normalizationquickcheck, graphemebreak, incb, extpict,
218218
)
219219
else:
220-
if unicode.bidi_defaults is not None:
221-
bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_defaults[char])
222-
else:
223-
bidirectional = 0
220+
bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_classes[char])
224221
if eastasianwidth or graphemebreak or extpict or bidirectional:
225222
item = (0, 0, bidirectional, 0, eastasianwidth,
226223
0, graphemebreak, 0, extpict)
@@ -496,7 +493,7 @@ def makeunicodetype(unicode, trace):
496493
if record:
497494
# extract database properties
498495
category = record.general_category
499-
bidirectional = record.bidi_class
496+
bidirectional = unicode.bidi_classes[char]
500497
properties = record.binary_properties
501498
flags = 0
502499
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
@@ -792,6 +789,8 @@ def merge_old_version(version, new, old):
792789
# category 0 is "unassigned"
793790
category_changes[i] = 0
794791
continue
792+
if old.bidi_classes[i] != new.bidi_classes[i]:
793+
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(old.bidi_classes[i])
795794
# check characters that differ
796795
if old.table[i] != new.table[i]:
797796
for k, field in enumerate(dataclasses.fields(UcdRecord)):
@@ -805,50 +804,48 @@ def merge_old_version(version, new, old):
805804
elif k == 2:
806805
category_changes[i] = CATEGORY_NAMES.index(value)
807806
elif k == 4:
808-
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
809-
elif k == 5:
810807
# We assume that all normalization changes are in 1:1 mappings
811808
assert " " not in value
812809
normalization_changes.append((i, value))
813-
elif k == 6:
810+
elif k == 5:
814811
# we only support changes where the old value is a single digit
815812
assert value in "0123456789"
816813
decimal_changes[i] = int(value)
817-
elif k == 8:
814+
elif k == 7:
818815
# Since 0 encodes "no change", the old value is better not 0
819816
if not value:
820817
numeric_changes[i] = -1
821818
else:
822819
numeric_changes[i] = float(value)
823820
assert numeric_changes[i] not in (0, -1)
824-
elif k == 9:
821+
elif k == 8:
825822
if value == 'Y':
826823
mirrored_changes[i] = '1'
827824
else:
828825
mirrored_changes[i] = '0'
829-
elif k == 11:
826+
elif k == 10:
830827
# change to ISO comment, ignore
831828
pass
832-
elif k == 12:
829+
elif k == 11:
833830
# change to simple uppercase mapping; ignore
834831
pass
835-
elif k == 13:
832+
elif k == 12:
836833
# change to simple lowercase mapping; ignore
837834
pass
838-
elif k == 14:
835+
elif k == 13:
839836
# change to simple titlecase mapping; ignore
840837
pass
841-
elif k == 15:
838+
elif k == 14:
842839
# change to east asian width
843840
east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
844-
elif k == 16:
841+
elif k == 15:
845842
# derived property changes; not yet
846843
pass
847-
elif k == 17:
844+
elif k == 16:
848845
# normalization quickchecks are not performed
849846
# for older versions
850847
pass
851-
elif k == 18:
848+
elif k == 17:
852849
# The Indic_Conjunct_Break property did not exist for
853850
# older versions
854851
pass
@@ -935,7 +932,7 @@ class UcdRecord:
935932
name: str
936933
general_category: str
937934
canonical_combining_class: str
938-
bidi_class: str
935+
# UnicodeData.bidi_classes
939936
decomposition_type: str
940937
decomposition_mapping: str
941938
numeric_type: str
@@ -967,7 +964,7 @@ class UcdRecord:
967964

968965

969966
def from_row(row: List[str]) -> UcdRecord:
970-
return UcdRecord(*row, None, set(), 0, "None")
967+
return UcdRecord(*row[:4], *row[5:], None, set(), 0, "None")
971968

972969

973970
# --------------------------------------------------------------------
@@ -982,14 +979,17 @@ class UnicodeData:
982979
def __init__(self, version, cjk_check=True):
983980
self.changed = []
984981
table = [None] * 0x110000
982+
bidi_classes = [None] * 0x110000
985983
for s in UcdFile(UNICODE_DATA, version):
986984
char = int(s[0], 16)
985+
bidi_classes[char] = s[4]
987986
table[char] = from_row(s)
988987

989988
cjk_ranges_found = []
990989

991990
# expand first-last ranges
992991
field = None
992+
bidi_val = None
993993
for i in range(0, 0x110000):
994994
# The file UnicodeData.txt has its own distinct way of
995995
# expressing ranges. See:
@@ -998,15 +998,17 @@ def __init__(self, version, cjk_check=True):
998998
if s:
999999
if s.name[-6:] == "First>":
10001000
s.name = ""
1001-
field = dataclasses.astuple(s)[:15]
1001+
field = dataclasses.astuple(s)[:14]
1002+
bidi_val = bidi_classes[i]
10021003
elif s.name[-5:] == "Last>":
10031004
if s.name.startswith("<CJK Ideograph"):
10041005
cjk_ranges_found.append((field[0],
10051006
s.codepoint))
10061007
s.name = ""
10071008
field = None
10081009
elif field:
1009-
table[i] = from_row(('%X' % i,) + field[1:])
1010+
bidi_classes[i] = bidi_val
1011+
table[i] = UcdRecord('%X' % i, *field[1:], None, set(), 0, "None")
10101012
if cjk_check and cjk_ranges != cjk_ranges_found:
10111013
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
10121014

@@ -1064,27 +1066,24 @@ def __init__(self, version, cjk_check=True):
10641066
table[i].east_asian_width = widths[i]
10651067
self.widths = widths
10661068

1067-
# Read DerivedBidiClass.txt for bidi defaults of unassigned codepoints
1069+
# Read DerivedBidiClass.txt for bidi classes
10681070
# see https://www.unicode.org/reports/tr44/#Missing_Conventions
10691071
if version != '3.2.0':
1070-
bidi_defaults = [None] * 0x110000
10711072
missing_re = re.compile(
10721073
r'# @missing: ([\dA-F]+)\.\.([\dA-F]+); (\w+)'
10731074
)
10741075
with open_data(DERIVED_BIDI_CLASS, version) as f:
10751076
for l in f:
1076-
m = missing_re.search(l)
1077+
m = missing_re.match(l)
10771078
if not m:
10781079
continue
10791080
start, end = int(m[1], 16), int(m[2], 16)
10801081
name = BIDI_LONG_NAMES[m[3]]
10811082
for i in range(start, end + 1):
1082-
bidi_defaults[i] = name
1083+
bidi_classes[i] = name
10831084
for char, (bidi,) in UcdFile(DERIVED_BIDI_CLASS, version).expanded():
1084-
bidi_defaults[char] = bidi
1085-
self.bidi_defaults = bidi_defaults
1086-
else:
1087-
self.bidi_defaults = None
1085+
bidi_classes[char] = bidi
1086+
self.bidi_classes = bidi_classes
10881087

10891088
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
10901089
if not propinfo:

0 commit comments

Comments
 (0)