diff --git a/src/lex.rs b/src/lex.rs
index dd79ea3..1ec4f33 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -1,27 +1,6 @@
 use crate::error::LexError;
 use std::{collections::VecDeque, fmt, ops::Range, str::Chars};
 
-fn is_glob(c: char) -> bool {
-    match c {
-        '*' | '?' => true,
-        _ => false,
-    }
-}
-
-fn is_special(c: char) -> bool {
-    match c {
-        '?' => true,
-        _ => false,
-    }
-}
-
-fn is_keyword(s: &str) -> bool {
-    match s {
-        "for" => true,
-        _ => false,
-    }
-}
-
 /// The position of a specific glyph within a corpus of text. We use this for rendering error
 /// messages and communicating to the user the location of errors.
 #[derive(PartialEq, Clone, Copy)]
@@ -82,6 +61,16 @@ pub struct Topoglyph {
     bytes: Range<usize>,
 }
 
+impl Topoglyph {
+    fn is_word(&self) -> bool {
+        self.glyph.is_alphanumeric() || self.glyph == '.'
+    }
+
+    fn is_glob(&self) -> bool {
+        self.is_word() || self.glyph == '*'
+    }
+}
+
 impl fmt::Debug for Topoglyph {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
         write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position)
@@ -294,10 +283,25 @@ impl From<Vec<Topoglyph>> for Lexeme {
 #[allow(dead_code)]
 #[derive(Debug, PartialEq, Clone)]
 pub enum Token {
-    String(Lexeme),
+    /// A bare word: a sequence of characters without any quotes. A bare word is never a glob.
+    Word(Lexeme),
+
+    /// A bare word containing one or more of the special characters ? or *.
     Glob(Lexeme),
 }
 
+impl Token {
+    fn same(&self, other: &Self) -> bool {
+        use Token::*;
+
+        match (self, other) {
+            (Word(a), Word(b)) => a.text() == b.text(),
+            (Glob(a), Glob(b)) => a.text() == b.text(),
+            _ => false,
+        }
+    }
+}
+
 pub struct Tokenizer<'text> {
     source: Topoglypher<'text>,
 }
@@ -314,7 +318,8 @@ impl<'text> Tokenizer<'text> {
         let next = self.source.next()?;
 
         match next.glyph {
-            _ if next.glyph.is_alphabetic() => Some(self.lex_bare_string(vec![next])),
+            _ if next.is_word() => Some(self.lex_bare_string(vec![next])),
+            _ if next.is_glob() => Some(self.lex_glob(vec![next])),
             '\\' => match self.source.pop() {
                 Ok(escaped) => Some(self.lex_bare_string(vec![escaped])),
                 Err(e) => Some(Err(e)),
@@ -330,12 +335,15 @@
         while let Some(next) = self.source.peek() {
             match next.glyph {
                 _ if next.glyph.is_whitespace() => break,
-                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
+                _ if next.is_word() => progress.push(self.source.pop()?),
+                _ if next.is_glob() => {
+                    progress.push(self.source.pop()?);
+                    return self.lex_glob(progress);
+                }
                 '\\' => {
                     self.source.pop()?;
                     progress.push(self.source.pop()?);
                 }
-                '*' | '?' => return self.lex_glob(progress),
                 _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
             }
         }
@@ -343,7 +351,7 @@
         if progress.is_empty() {
             Err(LexError::UnexpectedEOF)
         } else {
-            Ok(Token::String(progress.into()))
+            Ok(Token::Word(progress.into()))
         }
     }
@@ -351,8 +359,7 @@
         while let Some(next) = self.source.peek() {
             match next.glyph {
                 _ if next.glyph.is_whitespace() => break,
-                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
-                '*' | '?' => progress.push(self.source.pop()?),
+                _ if next.is_glob() => progress.push(self.source.pop()?),
                 '\\' => {
                     self.source.pop()?;
                     progress.push(self.source.pop()?);
@@ -441,46 +448,61 @@ impl<'text> Iterator for Lexer<'text> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::iter::zip;
+
+    fn lexeme(txt: &str) -> Lexeme {
+        let x: Vec<Topoglyph> = txt
+            .chars()
+            .map(|c| Topoglyph {
+                glyph: c,
+                position: Position::start(),
+                bytes: 0..0,
+            })
+            .collect();
+        x.into()
+    }
+
+    fn glob(txt: &str) -> Token {
+        Token::Glob(lexeme(txt))
+    }
+
+    fn word(txt: &str) -> Token {
+        Token::Word(lexeme(txt))
+    }
 
-    /// this macro allows us to specify a set of inputs that we expect to lex successfully.
     macro_rules! accept {
-        ($($name:ident: $line:literal ;)+) => {$(
-            #[test]
-            fn $name() {
-                println!("testing that we can lex the following input text:\n\t{}", $line);
-                let lexer = Tokenizer::new($line);
-                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
-                match tokens {
-                    Ok(tokens) => {
-                        println!("output tokens: {tokens:?}");
-                    }
-                    Err(e) => {
-                        println!("output error: {e:?}");
-                        panic!("Encounter an unexpected lex error");
-                    }
-                }
-            }
-        )*};
+        ( $($test_name:ident $input_text:literal [ $( $token:expr )* ])+) => {$(
+            #[test]
+            fn $test_name() -> Result<(), LexError> {
+                #[allow(unused_mut)]
+                let mut expected: Vec<Token> = Vec::new();
+                $( expected.push($token); )*
+
+                let lexed = lex($input_text)?;
+                assert_eq!(expected.len(), lexed.len());
+
+                for pair in zip(expected, lexed) {
+                    assert!(pair.0.same(&pair.1));
+                }
+
+                Ok(())
+            }
+        )*};
     }
 
-    /// this macro allows us to specify a set of inputs that we expect to fail to lex successfully.
     macro_rules! reject {
-        ($($name:ident: $line:literal ;)+) => {$(
-            #[test]
-            fn $name() {
-                println!("testing that we will fail to lex the following input text:\n\t{}", $line);
-                let tokens = lex($line);
-                match tokens {
-                    Ok(tokens) => {
-                        println!("output tokens: {tokens:?}");
-                        panic!("Did not encounter an expected lex error");
-                    }
-                    Err(e) => {
-                        println!("output error: {e:?}");
-                    }
-                }
-            }
-        )*};
+        ($($test_name:ident: $input_text:literal;)+) => {$(
+            #[test]
+            fn $test_name() {
+                match lex($input_text) {
+                    Ok(tokens) => {
+                        println!("output tokens: {tokens:?}");
+                        panic!("Did not encounter an expected lex error");
+                    }
+                    Err(e) => println!("output error: {e:?}"),
+                }
+            }
+        )*};
     }
 
     reject! {
@@ -491,9 +513,6 @@
         // input makes no sense
         trailing_slash: r"one two three \";
 
-        // Globs aren't done yet
-        glob: "*";
-
         // Vars aren't done yet
         var: "@name";
 
@@ -505,19 +524,24 @@
     }
 
     accept! {
-        empty: "";
-
-        spaces: " ";
-
-        identifier: "a";
-
-        identifier_2: " a";
-
-        identifier_3: "a ";
-
-        identifier_4: " a ";
-
-        multi_idents: "one two three four ";
+        empty "" []
+        spaces " " []
+        identifier "a" [ word("a") ]
+        identifier_2 " a" [ word("a") ]
+        identifier_3 "a " [ word("a") ]
+        identifier_4 " a " [ word("a") ]
+        file_name "poop.exe" [ word("poop.exe") ]
+        multi_idents "one two three four " [
+            word("one")
+            word("two")
+            word("three")
+            word("four")
+        ]
+        glob_1 "*" [ glob("*") ]
+        glob_2 " * " [ glob("*") ]
+        glob_3 "x*" [ glob("x*") ]
+        glob_4 "*x" [ glob("*x") ]
+        glob_5 "*.py" [ glob("*.py") ]
     }
 }
diff --git a/src/parse2.rs b/src/parse2.rs
index 98f5688..e520113 100644
--- a/src/parse2.rs
+++ b/src/parse2.rs
@@ -139,7 +139,7 @@ impl<'text> Parser<'text> {
     fn step_start(&mut self) -> Result {
         assert!(matches!(self.cursor.value(), Value::Start));
         match self.source.peek()? {
-            Some(Token::String(_)) => {
+            Some(Token::Word(_)) => {
                 self.cursor.push(Value::Statement)?;
                 let token = self.source.next().unwrap()?;
                 self.cursor.push(Value::Terminal(token))?;
@@ -157,7 +157,7 @@ impl<'text> Parser<'text> {
     fn step_statement(&mut self) -> Result {
         assert!(matches!(self.cursor.value(), Value::Statement));
         match self.source.peek()? {
-            Some(Token::String(_) | Token::Glob(_)) => {
+            Some(Token::Word(_) | Token::Glob(_)) => {
                 let token = self.source.next().unwrap()?;
                 self.cursor.push(Value::Terminal(token))?;
                 self.cursor.up()?;