|
|
|
@ -1,27 +1,6 @@
|
|
|
|
|
use crate::error::LexError;
|
|
|
|
|
use std::{collections::VecDeque, fmt, ops::Range, str::Chars};
|
|
|
|
|
|
|
|
|
|
/// Reports whether `c` is one of the glob characters `*` or `?`.
///
/// Returns `false` for every other character.
fn is_glob(c: char) -> bool {
    matches!(c, '*' | '?')
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether `c` is a special character.
///
/// `?` is the only character currently listed; everything else yields `false`.
fn is_special(c: char) -> bool {
    matches!(c, '?')
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether `s` is a keyword.
///
/// The comparison is exact and case-sensitive; `"for"` is the only keyword
/// currently listed.
fn is_keyword(s: &str) -> bool {
    matches!(s, "for")
}
|
|
|
|
|
|
|
|
|
|
/// The position of a specific glyph within a corpus of text. We use this for rendering error
|
|
|
|
|
/// messages and communicating to the user the location of errors.
|
|
|
|
|
#[derive(PartialEq, Clone, Copy)]
|
|
|
|
@ -82,6 +61,16 @@ pub struct Topoglyph {
|
|
|
|
|
bytes: Range<u64>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Topoglyph {
|
|
|
|
|
fn is_word(&self) -> bool {
|
|
|
|
|
self.glyph.is_alphanumeric() || self.glyph == '.'
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn is_glob(&self) -> bool {
|
|
|
|
|
self.is_word() || self.glyph == '*'
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl fmt::Debug for Topoglyph {
|
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
|
|
|
|
write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position)
|
|
|
|
@ -294,10 +283,25 @@ impl From<Vec<Topoglyph>> for Lexeme {
|
|
|
|
|
/// A lexical token. Every variant carries the `Lexeme` holding its text.
#[allow(dead_code)]
|
|
|
|
|
#[derive(Debug, PartialEq, Clone)]
|
|
|
|
|
pub enum Token {
|
|
|
|
|
/// A lexeme treated as string text.
/// NOTE(review): how this differs from `Word` is not visible in this chunk; confirm.
String(Lexeme),
|
|
|
|
|
/// A bare word: a sequence of characters without any quotes. A bare word is never a glob.
|
|
|
|
|
Word(Lexeme),
|
|
|
|
|
|
|
|
|
|
|
/// A bare word containing one or more of the special characters `?` or `*`.
|
|
|
|
|
Glob(Lexeme),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Token {
|
|
|
|
|
fn same(&self, other: &Self) -> bool {
|
|
|
|
|
use Token::*;
|
|
|
|
|
|
|
|
|
|
match (self, other) {
|
|
|
|
|
(Word(a), Word(b)) => a.text() == b.text(),
|
|
|
|
|
(Glob(a), Glob(b)) => a.text() == b.text(),
|
|
|
|
|
_ => false,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Lexes text into `Token`s, reading its glyphs from a `Topoglypher`.
pub struct Tokenizer<'text> {
|
|
|
|
|
// Glyph stream; the lexing code visible in this file consumes it via `next`, `peek` and `pop`.
source: Topoglypher<'text>,
|
|
|
|
|
}
|
|
|
|
@ -314,7 +318,8 @@ impl<'text> Tokenizer<'text> {
|
|
|
|
|
let next = self.source.next()?;
|
|
|
|
|
|
|
|
|
|
match next.glyph {
|
|
|
|
|
_ if next.glyph.is_alphabetic() => Some(self.lex_bare_string(vec![next])),
|
|
|
|
|
_ if next.is_word() => Some(self.lex_bare_string(vec![next])),
|
|
|
|
|
_ if next.is_glob() => Some(self.lex_glob(vec![next])),
|
|
|
|
|
'\\' => match self.source.pop() {
|
|
|
|
|
Ok(escaped) => Some(self.lex_bare_string(vec![escaped])),
|
|
|
|
|
Err(e) => Some(Err(e)),
|
|
|
|
@ -330,12 +335,15 @@ impl<'text> Tokenizer<'text> {
|
|
|
|
|
while let Some(next) = self.source.peek() {
|
|
|
|
|
match next.glyph {
|
|
|
|
|
_ if next.glyph.is_whitespace() => break,
|
|
|
|
|
_ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
|
|
|
|
|
_ if next.is_word() => progress.push(self.source.pop()?),
|
|
|
|
|
_ if next.is_glob() => {
|
|
|
|
|
progress.push(self.source.pop()?);
|
|
|
|
|
return self.lex_glob(progress);
|
|
|
|
|
}
|
|
|
|
|
'\\' => {
|
|
|
|
|
self.source.pop()?;
|
|
|
|
|
progress.push(self.source.pop()?);
|
|
|
|
|
}
|
|
|
|
|
'*' | '?' => return self.lex_glob(progress),
|
|
|
|
|
_ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -343,7 +351,7 @@ impl<'text> Tokenizer<'text> {
|
|
|
|
|
if progress.is_empty() {
|
|
|
|
|
Err(LexError::UnexpectedEOF)
|
|
|
|
|
} else {
|
|
|
|
|
Ok(Token::String(progress.into()))
|
|
|
|
|
Ok(Token::Word(progress.into()))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -351,8 +359,7 @@ impl<'text> Tokenizer<'text> {
|
|
|
|
|
while let Some(next) = self.source.peek() {
|
|
|
|
|
match next.glyph {
|
|
|
|
|
_ if next.glyph.is_whitespace() => break,
|
|
|
|
|
_ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
|
|
|
|
|
'*' | '?' => progress.push(self.source.pop()?),
|
|
|
|
|
_ if next.is_glob() => progress.push(self.source.pop()?),
|
|
|
|
|
'\\' => {
|
|
|
|
|
self.source.pop()?;
|
|
|
|
|
progress.push(self.source.pop()?);
|
|
|
|
@ -441,46 +448,61 @@ impl<'text> Iterator for Lexer<'text> {
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
use std::iter::zip;
|
|
|
|
|
|
|
|
|
|
fn lexeme(txt: &str) -> Lexeme {
|
|
|
|
|
let x: Vec<Topoglyph> = txt
|
|
|
|
|
.chars()
|
|
|
|
|
.map(|c| Topoglyph {
|
|
|
|
|
glyph: c,
|
|
|
|
|
position: Position::start(),
|
|
|
|
|
bytes: 0..0,
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
x.into()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn glob(txt: &str) -> Token {
|
|
|
|
|
Token::Glob(lexeme(txt))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn word(txt: &str) -> Token {
|
|
|
|
|
Token::Word(lexeme(txt))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// this macro allows us to specify a set of inputs that we expect to lex successfully.
|
|
|
|
|
macro_rules! accept {
|
|
|
|
|
($($name:ident: $line:literal ;)+) => {$(
|
|
|
|
|
#[test]
|
|
|
|
|
fn $name() {
|
|
|
|
|
println!("testing that we can lex the following input text:\n\t{}", $line);
|
|
|
|
|
let lexer = Tokenizer::new($line);
|
|
|
|
|
let tokens: Result<Vec<Token>, LexError> = lexer.collect();
|
|
|
|
|
match tokens {
|
|
|
|
|
Ok(tokens) => {
|
|
|
|
|
println!("output tokens: {tokens:?}");
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
println!("output error: {e:?}");
|
|
|
|
|
panic!("Encounter an unexpected lex error");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
( $($test_name:ident $input_text:literal [ $( $token:expr )* ])+) => {$(
|
|
|
|
|
#[test]
|
|
|
|
|
fn $test_name() -> Result<(), LexError> {
|
|
|
|
|
#[allow(unused_mut)]
|
|
|
|
|
let mut expected: Vec<Token> = Vec::new();
|
|
|
|
|
$( expected.push($token); )*
|
|
|
|
|
|
|
|
|
|
let lexed = lex($input_text)?;
|
|
|
|
|
assert_eq!(expected.len(), lexed.len());
|
|
|
|
|
|
|
|
|
|
for pair in zip(expected, lexed) {
|
|
|
|
|
assert!(pair.0.same(&pair.1));
|
|
|
|
|
}
|
|
|
|
|
)*};
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
)*};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// this macro allows us to specify a set of inputs that we expect to fail to lex successfully.
|
|
|
|
|
macro_rules! reject {
|
|
|
|
|
($($name:ident: $line:literal ;)+) => {$(
|
|
|
|
|
#[test]
|
|
|
|
|
fn $name() {
|
|
|
|
|
println!("testing that we will fail to lex the following input text:\n\t{}", $line);
|
|
|
|
|
let tokens = lex($line);
|
|
|
|
|
match tokens {
|
|
|
|
|
Ok(tokens) => {
|
|
|
|
|
println!("output tokens: {tokens:?}");
|
|
|
|
|
panic!("Did not encounter an expected lex error");
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
println!("output error: {e:?}");
|
|
|
|
|
}
|
|
|
|
|
($($test_name:ident: $input_text:literal;)+) => {$(
|
|
|
|
|
#[test]
|
|
|
|
|
fn $test_name() {
|
|
|
|
|
match lex($input_text) {
|
|
|
|
|
Ok(tokens) => {
|
|
|
|
|
println!("output tokens: {tokens:?}");
|
|
|
|
|
panic!("Did not encounter an expected lex error");
|
|
|
|
|
}
|
|
|
|
|
Err(e) => println!("output error: {e:?}"),
|
|
|
|
|
}
|
|
|
|
|
)*};
|
|
|
|
|
}
|
|
|
|
|
)*};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
reject! {
|
|
|
|
@ -491,9 +513,6 @@ mod tests {
|
|
|
|
|
// input makes no sense
|
|
|
|
|
trailing_slash: r"one two three \";
|
|
|
|
|
|
|
|
|
|
// Globs aren't done yet
|
|
|
|
|
glob: "*";
|
|
|
|
|
|
|
|
|
|
// Vars aren't done yet
|
|
|
|
|
var: "@name";
|
|
|
|
|
|
|
|
|
@ -505,19 +524,24 @@ mod tests {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
accept! {
|
|
|
|
|
empty: "";
|
|
|
|
|
|
|
|
|
|
spaces: " ";
|
|
|
|
|
|
|
|
|
|
identifier: "a";
|
|
|
|
|
|
|
|
|
|
identifier_2: " a";
|
|
|
|
|
|
|
|
|
|
identifier_3: "a ";
|
|
|
|
|
|
|
|
|
|
identifier_4: " a ";
|
|
|
|
|
|
|
|
|
|
multi_idents: "one two three four ";
|
|
|
|
|
empty "" []
|
|
|
|
|
spaces " " []
|
|
|
|
|
identifier "a" [ word("a") ]
|
|
|
|
|
identifier_2 " a" [ word("a") ]
|
|
|
|
|
identifier_3 "a " [ word("a") ]
|
|
|
|
|
identifier_4 " a " [ word("a") ]
|
|
|
|
|
file_name "poop.exe" [ word("poop.exe") ]
|
|
|
|
|
multi_idents "one two three four " [
|
|
|
|
|
word("one")
|
|
|
|
|
word("two")
|
|
|
|
|
word("three")
|
|
|
|
|
word("four")
|
|
|
|
|
]
|
|
|
|
|
glob_1 "*" [ glob("*") ]
|
|
|
|
|
glob_2 " * " [ glob("*") ]
|
|
|
|
|
glob_3 "x*" [ glob("x*") ]
|
|
|
|
|
glob_4 "*x" [ glob("*x") ]
|
|
|
|
|
glob_5 "*.py" [ glob("*.py") ]
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|