11#[ cfg( test) ]
22use alloc:: { vec, vec:: Vec } ;
3- use anyhow:: { Result , bail} ;
43use core:: char;
54use core:: fmt;
5+ use core:: result:: Result ;
66use core:: str;
77use unicode_xid:: UnicodeXID ;
88
@@ -166,6 +166,9 @@ pub enum Token {
166166#[ derive( Eq , PartialEq , Debug ) ]
167167#[ allow( dead_code) ]
168168pub enum Error {
169+ ControlCodepoint ( u32 , char ) ,
170+ DeprecatedCodepoint ( u32 , char ) ,
171+ ForbiddenCodepoint ( u32 , char ) ,
169172 InvalidCharInId ( u32 , char ) ,
170173 IdPartEmpty ( u32 ) ,
171174 InvalidEscape ( u32 , char ) ,
@@ -179,7 +182,7 @@ pub enum Error {
179182}
180183
181184impl < ' a > Tokenizer < ' a > {
182- pub fn new ( input : & ' a str , span_offset : u32 ) -> Result < Tokenizer < ' a > > {
185+ pub fn new ( input : & ' a str , span_offset : u32 ) -> Result < Tokenizer < ' a > , Error > {
183186 detect_invalid_input ( input) ?;
184187
185188 let mut t = Tokenizer {
@@ -194,7 +197,7 @@ impl<'a> Tokenizer<'a> {
194197 Ok ( t)
195198 }
196199
197- pub fn expect_semicolon ( & mut self ) -> Result < ( ) > {
200+ pub fn expect_semicolon ( & mut self ) -> Result < ( ) , Error > {
198201 self . expect ( Token :: Semicolon ) ?;
199202 Ok ( ( ) )
200203 }
@@ -205,13 +208,13 @@ impl<'a> Tokenizer<'a> {
205208 & self . input [ start..end]
206209 }
207210
208- pub fn parse_id ( & self , span : Span ) -> Result < & ' a str > {
211+ pub fn parse_id ( & self , span : Span ) -> Result < & ' a str , Error > {
209212 let ret = self . get_span ( span) ;
210213 validate_id ( span. start ( ) , & ret) ?;
211214 Ok ( ret)
212215 }
213216
214- pub fn parse_explicit_id ( & self , span : Span ) -> Result < & ' a str > {
217+ pub fn parse_explicit_id ( & self , span : Span ) -> Result < & ' a str , Error > {
215218 let token = self . get_span ( span) ;
216219 let id_part = token. strip_prefix ( '%' ) . unwrap ( ) ;
217220 validate_id ( span. start ( ) , id_part) ?;
@@ -456,13 +459,11 @@ impl<'a> Iterator for CrlfFold<'a> {
456459 }
457460}
458461
459- fn detect_invalid_input ( input : & str ) -> Result < ( ) > {
462+ fn detect_invalid_input ( input : & str ) -> Result < ( ) , Error > {
460463 // Disallow specific codepoints.
461- let mut line = 1 ;
462- for ch in input. chars ( ) {
464+ for ( pos, ch) in input. char_indices ( ) {
463465 match ch {
464- '\n' => line += 1 ,
465- '\r' | '\t' => { }
466+ '\n' | '\r' | '\t' => { }
466467
467468 // Bidirectional override codepoints can be used to craft source code that
468469 // appears to have a different meaning than its actual meaning. See
@@ -471,11 +472,7 @@ fn detect_invalid_input(input: &str) -> Result<()> {
471472 // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472473 '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473474 | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474- bail ! (
475- "Input contains bidirectional override codepoint {:?} at line {}" ,
476- ch. escape_unicode( ) ,
477- line
478- ) ;
475+ return Err ( Error :: ForbiddenCodepoint ( u32:: try_from ( pos) . unwrap ( ) , ch) ) ;
479476 }
480477
481478 // Disallow several characters which are deprecated or discouraged in Unicode.
@@ -487,18 +484,14 @@ fn detect_invalid_input(input: &str) -> Result<()> {
487484 // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
488485 '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
489486 | '\u{17b4}' | '\u{17b5}' => {
490- bail ! (
491- "Codepoint {:?} at line {} is discouraged by Unicode" ,
492- ch. escape_unicode( ) ,
493- line
494- ) ;
487+ return Err ( Error :: DeprecatedCodepoint ( u32:: try_from ( pos) . unwrap ( ) , ch) ) ;
495488 }
496489
497490 // Disallow control codes other than the ones explicitly recognized above,
498491 // so that viewing a wit file on a terminal doesn't have surprising side
499492 // effects or appear to have a different meaning than its actual meaning.
500493 ch if ch. is_control ( ) => {
501- bail ! ( "Control code '{}' at line {}" , ch . escape_unicode ( ) , line ) ;
494+ return Err ( Error :: ControlCodepoint ( u32 :: try_from ( pos ) . unwrap ( ) , ch ) ) ;
502495 }
503496
504497 _ => { }
@@ -635,9 +628,41 @@ impl Token {
635628
// Marks the tokenizer's `Error` as a standard error type; no methods are
// overridden, so the trait's provided defaults apply.
636629impl core:: error:: Error for Error { }
637630
631+ impl Error {
632+ /// Returns the byte offset in the source map where this error occurred.
633+ pub fn position ( & self ) -> u32 {
634+ match self {
635+ Error :: ControlCodepoint ( at, _)
636+ | Error :: DeprecatedCodepoint ( at, _)
637+ | Error :: ForbiddenCodepoint ( at, _)
638+ | Error :: InvalidCharInId ( at, _)
639+ | Error :: IdPartEmpty ( at)
640+ | Error :: InvalidEscape ( at, _)
641+ | Error :: Unexpected ( at, _)
642+ | Error :: UnterminatedComment ( at) => * at,
643+ Error :: Wanted { at, .. } => * at,
644+ }
645+ }
646+ }
647+
638648impl fmt:: Display for Error {
639649 fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
640650 match self {
651+ Error :: ControlCodepoint ( _, ch) => write ! ( f, "Control code '{}'" , ch. escape_unicode( ) ) ,
652+ Error :: DeprecatedCodepoint ( _, ch) => {
653+ write ! (
654+ f,
655+ "Codepoint {:?} is discouraged by Unicode" ,
656+ ch. escape_unicode( )
657+ )
658+ }
659+ Error :: ForbiddenCodepoint ( _, ch) => {
660+ write ! (
661+ f,
662+ "Input contains bidirectional override codepoint {:?}" ,
663+ ch. escape_unicode( )
664+ )
665+ }
641666 Error :: Unexpected ( _, ch) => write ! ( f, "unexpected character {ch:?}" ) ,
642667 Error :: UnterminatedComment ( _) => write ! ( f, "unterminated block comment" ) ,
643668 Error :: Wanted {
@@ -712,7 +737,7 @@ fn test_validate_id() {
712737
713738#[ test]
714739fn test_tokenizer ( ) {
715- fn collect ( s : & str ) -> Result < Vec < Token > > {
740+ fn collect ( s : & str ) -> Result < Vec < Token > , Error > {
716741 let mut t = Tokenizer :: new ( s, 0 ) ?;
717742 let mut tokens = Vec :: new ( ) ;
718743 while let Some ( token) = t. next ( ) ? {
0 commit comments