lex tests actually test the tokens now

11 months ago · cb53fb9195
parent cd51f4cce1
commit cb53fb9195
2 changed files with 102 additions and 78 deletions
--- a/src/lex.rs
+++ b/src/lex.rs
@ -1,27 +1,6 @@
 use crate::error::LexError;
 use std::{collections::VecDeque, fmt, ops::Range, str::Chars};

-fn is_glob(c: char) -> bool {
-    match c {
-        '*' | '?' => true,
-        _ => false,
-    }
-}
-
-fn is_special(c: char) -> bool {
-    match c {
-        '?' => true,
-        _ => false,
-    }
-}
-
-fn is_keyword(s: &str) -> bool {
-    match s {
-        "for" => true,
-        _ => false,
-    }
-}
-
 /// The position of a specific glyph within a corpus of text. We use this for rendering error
 /// messages and communicating to the user the location of errors.
 #[derive(PartialEq, Clone, Copy)]
@ -82,6 +61,16 @@ pub struct Topoglyph {
    bytes: Range<u64>,
 }

+impl Topoglyph {
+    fn is_word(&self) -> bool {
+        self.glyph.is_alphanumeric() || self.glyph == '.'
+    }
+
+    fn is_glob(&self) -> bool {
+        self.is_word() || self.glyph == '*'
+    }
+}
+
 impl fmt::Debug for Topoglyph {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position)
@ -294,10 +283,25 @@ impl From<Vec<Topoglyph>> for Lexeme {
 #[allow(dead_code)]
 #[derive(Debug, PartialEq, Clone)]
 pub enum Token {
-    String(Lexeme),
+    /// A bare word: a sequence of characters without any quotes. A bare word is never a glob.
+    Word(Lexeme),
+
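+    /// A bare word containing one or more of the special characters `?` or `*`.
+    /// NOTE(review): `Topoglyph::is_glob` currently only recognizes `*`; confirm whether `?` is handled.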
    Glob(Lexeme),
 }

+impl Token {
+    fn same(&self, other: &Self) -> bool {
+        use Token::*;
+
+        match (self, other) {
+            (Word(a), Word(b)) => a.text() == b.text(),
+            (Glob(a), Glob(b)) => a.text() == b.text(),
+            _ => false,
+        }
+    }
+}
+
 pub struct Tokenizer<'text> {
    source: Topoglypher<'text>,
 }
@ -314,7 +318,8 @@ impl<'text> Tokenizer<'text> {
        let next = self.source.next()?;

        match next.glyph {
-            _ if next.glyph.is_alphabetic() => Some(self.lex_bare_string(vec![next])),
+            _ if next.is_word() => Some(self.lex_bare_string(vec![next])),
+            _ if next.is_glob() => Some(self.lex_glob(vec![next])),
            '\\' => match self.source.pop() {
                Ok(escaped) => Some(self.lex_bare_string(vec![escaped])),
                Err(e) => Some(Err(e)),
@ -330,12 +335,15 @@ impl<'text> Tokenizer<'text> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                _ if next.glyph.is_whitespace() => break,
-                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
+                _ if next.is_word() => progress.push(self.source.pop()?),
+                _ if next.is_glob() => {
+                    progress.push(self.source.pop()?);
+                    return self.lex_glob(progress);
+                }
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
                }
-                '*' | '?' => return self.lex_glob(progress),
                _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
            }
        }
@ -343,7 +351,7 @@ impl<'text> Tokenizer<'text> {
        if progress.is_empty() {
            Err(LexError::UnexpectedEOF)
        } else {
-            Ok(Token::String(progress.into()))
+            Ok(Token::Word(progress.into()))
        }
    }

@ -351,8 +359,7 @@ impl<'text> Tokenizer<'text> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                _ if next.glyph.is_whitespace() => break,
-                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
-                '*' | '?' => progress.push(self.source.pop()?),
+                _ if next.is_glob() => progress.push(self.source.pop()?),
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
@ -441,46 +448,61 @@ impl<'text> Iterator for Lexer<'text> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use std::iter::zip;
+
+    fn lexeme(txt: &str) -> Lexeme {
+        let x: Vec<Topoglyph> = txt
+            .chars()
+            .map(|c| Topoglyph {
+                glyph: c,
+                position: Position::start(),
+                bytes: 0..0,
+            })
+            .collect();
+        x.into()
+    }
+
+    fn glob(txt: &str) -> Token {
+        Token::Glob(lexeme(txt))
+    }
+
+    fn word(txt: &str) -> Token {
+        Token::Word(lexeme(txt))
+    }

-    /// this macro allows us to specify a set of inputs that we expect to lex successfully.
    macro_rules! accept {
-        ($($name:ident: $line:literal ;)+) => {$(
-            #[test]
-            fn $name() {
-                println!("testing that we can lex the following input text:\n\t{}", $line);
-                let lexer = Tokenizer::new($line);
-                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
-                match tokens {
-                    Ok(tokens) => {
-                        println!("output tokens: {tokens:?}");
-                    }
-                    Err(e) => {
-                        println!("output error: {e:?}");
-                        panic!("Encounter an unexpected lex error");
-                    }
-                }
+    ( $($test_name:ident $input_text:literal [ $( $token:expr )* ])+) => {$(
+        #[test]
+        fn $test_name() -> Result<(), LexError> {
+            #[allow(unused_mut)]
+            let mut expected: Vec<Token> = Vec::new();
+            $( expected.push($token); )*
+
+            let lexed = lex($input_text)?;
+            assert_eq!(expected.len(), lexed.len());
+
+            for pair in zip(expected, lexed) {
+                assert!(pair.0.same(&pair.1));
            }
-        )*};
+
+            Ok(())
+        }
+    )*};
    }

-    /// this macro allows us to specify a set of inputs that we expect to fail to lex successfully.
    macro_rules! reject {
-        ($($name:ident: $line:literal ;)+) => {$(
-            #[test]
-            fn $name() {
-                println!("testing that we will fail to lex the following input text:\n\t{}", $line);
-                let tokens = lex($line);
-                match tokens {
-                    Ok(tokens) => {
-                        println!("output tokens: {tokens:?}");
-                        panic!("Did not encounter an expected lex error");
-                    }
-                    Err(e) => {
-                        println!("output error: {e:?}");
-                    }
+    ($($test_name:ident:  $input_text:literal;)+) => {$(
+        #[test]
+        fn $test_name() {
+            match lex($input_text) {
+                Ok(tokens) => {
+                    println!("output tokens: {tokens:?}");
+                    panic!("Did not encounter an expected lex error");
                }
+                Err(e) => println!("output error: {e:?}"),
            }
-        )*};
+        }
+    )*};
    }

    reject! {
@ -491,9 +513,6 @@ mod tests {
        // input makes no sense
        trailing_slash: r"one two three \";

-        // Globs aren't done yet
-        glob: "*";
-
        // Vars aren't done yet
        var: "@name";

@ -505,19 +524,24 @@ mod tests {
    }

    accept! {
-        empty: "";
-
-        spaces: "    ";
-
-        identifier: "a";
-
-        identifier_2: "  a";
-
-        identifier_3: "a  ";
-
-        identifier_4: "  a  ";
-
-        multi_idents: "one two    three			four  ";
+        empty "" []
+        spaces "    " []
+        identifier "a" [ word("a") ]
+        identifier_2 "  a" [ word("a") ]
+        identifier_3 "a  " [ word("a") ]
+        identifier_4 "  a  " [ word("a") ]
+        file_name "poop.exe" [ word("poop.exe") ]
+        multi_idents "one two    three			four  " [
+            word("one")
+            word("two")
+            word("three")
+            word("four")
+        ]
+        glob_1 "*" [ glob("*") ]
+        glob_2 " * " [ glob("*") ]
+        glob_3 "x*" [ glob("x*") ]
+        glob_4 "*x" [ glob("*x") ]
+        glob_5 "*.py" [ glob("*.py") ]
    }
 }

--- a/src/parse2.rs
+++ b/src/parse2.rs
@ -139,7 +139,7 @@ impl<'text> Parser<'text> {
    fn step_start(&mut self) -> Result<bool, ParseError> {
        assert!(matches!(self.cursor.value(), Value::Start));
        match self.source.peek()? {
-            Some(Token::String(_)) => {
+            Some(Token::Word(_)) => {
                self.cursor.push(Value::Statement)?;
                let token = self.source.next().unwrap()?;
                self.cursor.push(Value::Terminal(token))?;
@ -157,7 +157,7 @@ impl<'text> Parser<'text> {
    fn step_statement(&mut self) -> Result<bool, ParseError> {
        assert!(matches!(self.cursor.value(), Value::Statement));
        match self.source.peek()? {
-            Some(Token::String(_) | Token::Glob(_)) => {
+            Some(Token::Word(_) | Token::Glob(_)) => {
                let token = self.source.next().unwrap()?;
                self.cursor.push(Value::Terminal(token))?;
                self.cursor.up()?;