Skip to content

Commit 599f7da

Browse files
Remove anyhow from the WIT lexer (#2461)
* Remove anyhow from the WIT lexer
* Add a doc to the position method
1 parent ea10d4f commit 599f7da

File tree

2 files changed

+48
-30
lines changed

2 files changed

+48
-30
lines changed

crates/wit-parser/src/ast.rs

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1843,14 +1843,7 @@ impl SourceMap {
18431843
}
18441844

18451845
if let Some(lex) = err.downcast_ref::<lex::Error>() {
1846-
let pos = match lex {
1847-
lex::Error::Unexpected(at, _)
1848-
| lex::Error::UnterminatedComment(at)
1849-
| lex::Error::Wanted { at, .. }
1850-
| lex::Error::InvalidCharInId(at, _)
1851-
| lex::Error::IdPartEmpty(at)
1852-
| lex::Error::InvalidEscape(at, _) => *at,
1853-
};
1846+
let pos = lex.position();
18541847
let msg = self.highlight_err(pos, None, lex);
18551848
bail!("{msg}")
18561849
}

crates/wit-parser/src/ast/lex.rs

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#[cfg(test)]
22
use alloc::{vec, vec::Vec};
3-
use anyhow::{Result, bail};
43
use core::char;
54
use core::fmt;
5+
use core::result::Result;
66
use core::str;
77
use unicode_xid::UnicodeXID;
88

@@ -166,6 +166,9 @@ pub enum Token {
166166
#[derive(Eq, PartialEq, Debug)]
167167
#[allow(dead_code)]
168168
pub enum Error {
169+
ControlCodepoint(u32, char),
170+
DeprecatedCodepoint(u32, char),
171+
ForbiddenCodepoint(u32, char),
169172
InvalidCharInId(u32, char),
170173
IdPartEmpty(u32),
171174
InvalidEscape(u32, char),
@@ -179,7 +182,7 @@ pub enum Error {
179182
}
180183

181184
impl<'a> Tokenizer<'a> {
182-
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
185+
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
183186
detect_invalid_input(input)?;
184187

185188
let mut t = Tokenizer {
@@ -194,7 +197,7 @@ impl<'a> Tokenizer<'a> {
194197
Ok(t)
195198
}
196199

197-
pub fn expect_semicolon(&mut self) -> Result<()> {
200+
pub fn expect_semicolon(&mut self) -> Result<(), Error> {
198201
self.expect(Token::Semicolon)?;
199202
Ok(())
200203
}
@@ -205,13 +208,13 @@ impl<'a> Tokenizer<'a> {
205208
&self.input[start..end]
206209
}
207210

208-
pub fn parse_id(&self, span: Span) -> Result<&'a str> {
211+
pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
209212
let ret = self.get_span(span);
210213
validate_id(span.start(), &ret)?;
211214
Ok(ret)
212215
}
213216

214-
pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
217+
pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
215218
let token = self.get_span(span);
216219
let id_part = token.strip_prefix('%').unwrap();
217220
validate_id(span.start(), id_part)?;
@@ -456,13 +459,11 @@ impl<'a> Iterator for CrlfFold<'a> {
456459
}
457460
}
458461

459-
fn detect_invalid_input(input: &str) -> Result<()> {
462+
fn detect_invalid_input(input: &str) -> Result<(), Error> {
460463
// Disallow specific codepoints.
461-
let mut line = 1;
462-
for ch in input.chars() {
464+
for (pos, ch) in input.char_indices() {
463465
match ch {
464-
'\n' => line += 1,
465-
'\r' | '\t' => {}
466+
'\n' | '\r' | '\t' => {}
466467

467468
// Bidirectional override codepoints can be used to craft source code that
468469
// appears to have a different meaning than its actual meaning. See
@@ -471,11 +472,7 @@ fn detect_invalid_input(input: &str) -> Result<()> {
471472
// [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
472473
'\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
473474
| '\u{2067}' | '\u{2068}' | '\u{2069}' => {
474-
bail!(
475-
"Input contains bidirectional override codepoint {:?} at line {}",
476-
ch.escape_unicode(),
477-
line
478-
);
475+
return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
479476
}
480477

481478
// Disallow several characters which are deprecated or discouraged in Unicode.
@@ -487,18 +484,14 @@ fn detect_invalid_input(input: &str) -> Result<()> {
487484
// Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
488485
'\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
489486
| '\u{17b4}' | '\u{17b5}' => {
490-
bail!(
491-
"Codepoint {:?} at line {} is discouraged by Unicode",
492-
ch.escape_unicode(),
493-
line
494-
);
487+
return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
495488
}
496489

497490
// Disallow control codes other than the ones explicitly recognized above,
498491
// so that viewing a wit file on a terminal doesn't have surprising side
499492
// effects or appear to have a different meaning than its actual meaning.
500493
ch if ch.is_control() => {
501-
bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
494+
return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
502495
}
503496

504497
_ => {}
@@ -635,9 +628,41 @@ impl Token {
635628

636629
impl core::error::Error for Error {}
637630

631+
impl Error {
632+
/// Returns the byte offset in the source map where this error occurred.
633+
pub fn position(&self) -> u32 {
634+
match self {
635+
Error::ControlCodepoint(at, _)
636+
| Error::DeprecatedCodepoint(at, _)
637+
| Error::ForbiddenCodepoint(at, _)
638+
| Error::InvalidCharInId(at, _)
639+
| Error::IdPartEmpty(at)
640+
| Error::InvalidEscape(at, _)
641+
| Error::Unexpected(at, _)
642+
| Error::UnterminatedComment(at) => *at,
643+
Error::Wanted { at, .. } => *at,
644+
}
645+
}
646+
}
647+
638648
impl fmt::Display for Error {
639649
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
640650
match self {
651+
Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
652+
Error::DeprecatedCodepoint(_, ch) => {
653+
write!(
654+
f,
655+
"Codepoint {:?} is discouraged by Unicode",
656+
ch.escape_unicode()
657+
)
658+
}
659+
Error::ForbiddenCodepoint(_, ch) => {
660+
write!(
661+
f,
662+
"Input contains bidirectional override codepoint {:?}",
663+
ch.escape_unicode()
664+
)
665+
}
641666
Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
642667
Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
643668
Error::Wanted {
@@ -712,7 +737,7 @@ fn test_validate_id() {
712737

713738
#[test]
714739
fn test_tokenizer() {
715-
fn collect(s: &str) -> Result<Vec<Token>> {
740+
fn collect(s: &str) -> Result<Vec<Token>, Error> {
716741
let mut t = Tokenizer::new(s, 0)?;
717742
let mut tokens = Vec::new();
718743
while let Some(token) = t.next()? {

0 commit comments

Comments
 (0)