@@ -208,7 +208,7 @@ def makeunicodedata(unicode, trace):
208208 # extract database properties
209209 category = CATEGORY_NAMES .index (record .general_category )
210210 combining = int (record .canonical_combining_class )
211- bidirectional = BIDIRECTIONAL_NAMES .index (record . bidi_class )
211+ bidirectional = BIDIRECTIONAL_NAMES .index (unicode . bidi_classes [ char ] )
212212 mirrored = record .bidi_mirrored == "Y"
213213 normalizationquickcheck = record .quick_check
214214 incb = INDIC_CONJUNCT_BREAK_NAMES .index (record .incb )
@@ -217,10 +217,7 @@ def makeunicodedata(unicode, trace):
217217 normalizationquickcheck , graphemebreak , incb , extpict ,
218218 )
219219 else :
220- if unicode .bidi_defaults is not None :
221- bidirectional = BIDIRECTIONAL_NAMES .index (unicode .bidi_defaults [char ])
222- else :
223- bidirectional = 0
220+ bidirectional = BIDIRECTIONAL_NAMES .index (unicode .bidi_classes [char ])
224221 if eastasianwidth or graphemebreak or extpict or bidirectional :
225222 item = (0 , 0 , bidirectional , 0 , eastasianwidth ,
226223 0 , graphemebreak , 0 , extpict )
@@ -496,7 +493,7 @@ def makeunicodetype(unicode, trace):
496493 if record :
497494 # extract database properties
498495 category = record .general_category
499- bidirectional = record . bidi_class
496+ bidirectional = unicode . bidi_classes [ char ]
500497 properties = record .binary_properties
501498 flags = 0
502499 if category in ["Lm" , "Lt" , "Lu" , "Ll" , "Lo" ]:
@@ -792,6 +789,8 @@ def merge_old_version(version, new, old):
792789 # category 0 is "unassigned"
793790 category_changes [i ] = 0
794791 continue
792+ if old .bidi_classes [i ] != new .bidi_classes [i ]:
793+ bidir_changes [i ] = BIDIRECTIONAL_NAMES .index (old .bidi_classes [i ])
795794 # check characters that differ
796795 if old .table [i ] != new .table [i ]:
797796 for k , field in enumerate (dataclasses .fields (UcdRecord )):
@@ -805,50 +804,48 @@ def merge_old_version(version, new, old):
805804 elif k == 2 :
806805 category_changes [i ] = CATEGORY_NAMES .index (value )
807806 elif k == 4 :
808- bidir_changes [i ] = BIDIRECTIONAL_NAMES .index (value )
809- elif k == 5 :
810807 # We assume that all normalization changes are in 1:1 mappings
811808 assert " " not in value
812809 normalization_changes .append ((i , value ))
813- elif k == 6 :
810+ elif k == 5 :
814811 # we only support changes where the old value is a single digit
815812 assert value in "0123456789"
816813 decimal_changes [i ] = int (value )
817- elif k == 8 :
814+ elif k == 7 :
818815 # Since 0 encodes "no change", the old value is better not 0
819816 if not value :
820817 numeric_changes [i ] = - 1
821818 else :
822819 numeric_changes [i ] = float (value )
823820 assert numeric_changes [i ] not in (0 , - 1 )
824- elif k == 9 :
821+ elif k == 8 :
825822 if value == 'Y' :
826823 mirrored_changes [i ] = '1'
827824 else :
828825 mirrored_changes [i ] = '0'
829- elif k == 11 :
826+ elif k == 10 :
830827 # change to ISO comment, ignore
831828 pass
832- elif k == 12 :
829+ elif k == 11 :
833830 # change to simple uppercase mapping; ignore
834831 pass
835- elif k == 13 :
832+ elif k == 12 :
836833 # change to simple lowercase mapping; ignore
837834 pass
838- elif k == 14 :
835+ elif k == 13 :
839836 # change to simple titlecase mapping; ignore
840837 pass
841- elif k == 15 :
838+ elif k == 14 :
842839 # change to east asian width
843840 east_asian_width_changes [i ] = EASTASIANWIDTH_NAMES .index (value )
844- elif k == 16 :
841+ elif k == 15 :
845842 # derived property changes; not yet
846843 pass
847- elif k == 17 :
844+ elif k == 16 :
848845 # normalization quickchecks are not performed
849846 # for older versions
850847 pass
851- elif k == 18 :
848+ elif k == 17 :
852849 # The Indic_Conjunct_Break property did not exist for
853850 # older versions
854851 pass
@@ -935,7 +932,7 @@ class UcdRecord:
935932 name : str
936933 general_category : str
937934 canonical_combining_class : str
938- bidi_class : str
935+ # UnicodeData.bidi_classes
939936 decomposition_type : str
940937 decomposition_mapping : str
941938 numeric_type : str
@@ -967,7 +964,7 @@ class UcdRecord:
967964
968965
969966def from_row (row : List [str ]) -> UcdRecord :
970- return UcdRecord (* row , None , set (), 0 , "None" )
967+ return UcdRecord (* row [: 4 ], * row [ 5 :] , None , set (), 0 , "None" )
971968
972969
973970# --------------------------------------------------------------------
@@ -982,14 +979,17 @@ class UnicodeData:
982979 def __init__ (self , version , cjk_check = True ):
983980 self .changed = []
984981 table = [None ] * 0x110000
982+ bidi_classes = [None ] * 0x110000
985983 for s in UcdFile (UNICODE_DATA , version ):
986984 char = int (s [0 ], 16 )
985+ bidi_classes [char ] = s [4 ]
987986 table [char ] = from_row (s )
988987
989988 cjk_ranges_found = []
990989
991990 # expand first-last ranges
992991 field = None
992+ bidi_val = None
993993 for i in range (0 , 0x110000 ):
994994 # The file UnicodeData.txt has its own distinct way of
995995 # expressing ranges. See:
@@ -998,15 +998,17 @@ def __init__(self, version, cjk_check=True):
998998 if s :
999999 if s .name [- 6 :] == "First>" :
10001000 s .name = ""
1001- field = dataclasses .astuple (s )[:15 ]
1001+ field = dataclasses .astuple (s )[:14 ]
1002+ bidi_val = bidi_classes [i ]
10021003 elif s .name [- 5 :] == "Last>" :
10031004 if s .name .startswith ("<CJK Ideograph" ):
10041005 cjk_ranges_found .append ((field [0 ],
10051006 s .codepoint ))
10061007 s .name = ""
10071008 field = None
10081009 elif field :
1009- table [i ] = from_row (('%X' % i ,) + field [1 :])
1010+ bidi_classes [i ] = bidi_val
1011+ table [i ] = UcdRecord ('%X' % i , * field [1 :], None , set (), 0 , "None" )
10101012 if cjk_check and cjk_ranges != cjk_ranges_found :
10111013 raise ValueError ("CJK ranges deviate: have %r" % cjk_ranges_found )
10121014
@@ -1064,27 +1066,24 @@ def __init__(self, version, cjk_check=True):
10641066 table [i ].east_asian_width = widths [i ]
10651067 self .widths = widths
10661068
1067- # Read DerivedBidiClass.txt for bidi defaults of unassigned codepoints
1069+ # Read DerivedBidiClass.txt for bidi classes
10681070 # see https://www.unicode.org/reports/tr44/#Missing_Conventions
10691071 if version != '3.2.0' :
1070- bidi_defaults = [None ] * 0x110000
10711072 missing_re = re .compile (
10721073 r'# @missing: ([\dA-F]+)\.\.([\dA-F]+); (\w+)'
10731074 )
10741075 with open_data (DERIVED_BIDI_CLASS , version ) as f :
10751076 for l in f :
1076- m = missing_re .search (l )
1077+ m = missing_re .match (l )
10771078 if not m :
10781079 continue
10791080 start , end = int (m [1 ], 16 ), int (m [2 ], 16 )
10801081 name = BIDI_LONG_NAMES [m [3 ]]
10811082 for i in range (start , end + 1 ):
1082- bidi_defaults [i ] = name
1083+ bidi_classes [i ] = name
10831084 for char , (bidi ,) in UcdFile (DERIVED_BIDI_CLASS , version ).expanded ():
1084- bidi_defaults [char ] = bidi
1085- self .bidi_defaults = bidi_defaults
1086- else :
1087- self .bidi_defaults = None
1085+ bidi_classes [char ] = bidi
1086+ self .bidi_classes = bidi_classes
10881087
10891088 for char , (propname , * propinfo ) in UcdFile (DERIVED_CORE_PROPERTIES , version ).expanded ():
10901089 if not propinfo :
0 commit comments