Skip to content

Commit 4ec0fc7

Browse files
committed
Revert "reorg internal state"
This reverts commit 5b3c150.
1 parent 5b3c150 commit 4ec0fc7

2 files changed

Lines changed: 24 additions & 23 deletions

File tree

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
215215

216216
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
217217
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
218-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,725 | 6,650 | 3,364 | 2,732 | 3,485 |
218+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,708 | 6,659 | 3,363 | 2,739 | 3,490 |
219219
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
220220
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
221221
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
@@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
231231

232232
| Name | Bytecode size | Bytecode size (gzip)* |
233233
|------------------------------|--------------:|----------------------:|
234-
| `unicode-segmenter/grapheme` | 20,229 | 11,393 |
234+
| `unicode-segmenter/grapheme` | 20,259 | 11,417 |
235235
| `graphemer` | 134,089 | 31,766 |
236236
| `grapheme-splitter` | 63,946 | 19,162 |
237237

src/grapheme.js

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -45,47 +45,47 @@ const BMP_MAX = 0xFFFF;
4545
* @return {GraphemeSegmenter} iterator for grapheme cluster segments
4646
*/
4747
export function* graphemeSegments(input) {
48+
let cp = input.codePointAt(0);
49+
50+
// do nothing on empty string
51+
if (cp == null) return;
52+
53+
/** Current cursor position. */
54+
let cursor = cp <= BMP_MAX ? 1 : 2;
55+
4856
/** Total length of the input string. */
4957
let len = input.length;
5058

51-
// do nothing on empty string
52-
if (len === 0) return;
59+
/** Category of codepoint immediately preceding cursor */
60+
let catBefore = cat(cp);
5361

54-
let cp = /** @type {number}*/ (input.codePointAt(0));
62+
/** @type {GraphemeCategoryNum} Category of the codepoint at `cursor` (immediately following `catBefore`'s codepoint). */
63+
let catAfter = 0;
5564

56-
/** Memoize the beginning code point of the segment. */
57-
let _hd = cp;
65+
/** The number of RIS codepoints preceding `cursor`. */
66+
let risCount = 0;
5867

5968
/**
6069
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
6170
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
6271
*/
6372
let emoji = false;
6473

65-
/** The number of RI codepoints preceding `cursor`. */
66-
let riCount = 0;
67-
6874
/** InCB=Consonant - segment started with Indic consonant */
6975
let consonant = false;
7076

7177
/** InCB=Linker - seen a linker after consonant */
7278
let linker = false;
7379

74-
/** Category of codepoint immediately preceding cursor */
75-
let catBefore = cat(cp);
80+
let index = 0;
7681

77-
/** Memoize the beginning category of the segment */
82+
/** Beginning category of a segment */
7883
let _catBegin = catBefore;
7984

80-
/** @type {GraphemeCategoryNum} Category of the codepoint at `cursor` (immediately following `catBefore`'s codepoint). */
81-
let catAfter = 0;
82-
83-
let index = 0;
84-
let cursor = 0;
85+
/** Memoize the beginning code point of the segment. */
86+
let _hd = cp;
8587

8688
while (cursor < len) {
87-
cursor += cp <= BMP_MAX ? 1 : 2;
88-
8989
cp = /** @type {number} */ (input.codePointAt(cursor));
9090
catAfter = cat(cp);
9191

@@ -117,8 +117,8 @@ export function* graphemeSegments(input) {
117117
}
118118
// GB12, GB13: RI × RI (odd count means no break)
119119
else if (catBefore === 10 && catAfter === 10) {
120-
// riCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
121-
boundary = riCount++ % 2 === 1;
120+
// risCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
121+
boundary = risCount++ % 2 === 1;
122122
}
123123
// GB6: L × (L | V | LV | LVT)
124124
else if (catBefore === 5) {
@@ -150,7 +150,7 @@ export function* graphemeSegments(input) {
150150

151151
// Reset segment state
152152
emoji = false;
153-
riCount = 0;
153+
risCount = 0;
154154
index = cursor;
155155
_catBegin = catAfter;
156156
_hd = cp;
@@ -181,6 +181,7 @@ export function* graphemeSegments(input) {
181181
}
182182
}
183183

184+
cursor += cp <= BMP_MAX ? 1 : 2;
184185
catBefore = catAfter;
185186
}
186187

0 commit comments

Comments
 (0)