I'm redoing the lexing and parsing
parent 4a0db72d4e
commit d152c4092a
@@ -0,0 +1,466 @@
use crate::error::LexError;
use std::{collections::VecDeque, fmt, ops::Range, str::Chars};

fn is_glob(c: char) -> bool {
    match c {
        '*' | '?' => true,
        _ => false,
    }
}

fn is_special(c: char) -> bool {
    match c {
        '?' => true,
        _ => false,
    }
}

fn is_keyword(s: &str) -> bool {
    match s {
        "for" => true,
        _ => false,
    }
}

/// The position of a specific glyph within a corpus of text
#[derive(PartialEq, Clone, Copy)]
pub struct Position {
    /// The visual line in which this glyph appears in the source text
    line: u64,

    /// The visual column in which this glyph appears in the source text
    column: u64,
}

impl Position {
    fn start() -> Self {
        Self { line: 0, column: 0 }
    }

    fn incr(&mut self) -> Position {
        let p = *self;
        self.column += 1;
        p
    }

    fn incr_line(&mut self) -> Position {
        let p = *self;
        self.column = 0;
        self.line += 1;
        p
    }
}
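
// Illustrative sketch, not part of the commit itself: `incr` and `incr_line` hand back the
// position *before* advancing, which is what lets the Topoglypher below stamp each glyph with
// the spot it was read from. The module and test names here are invented for the example.
#[cfg(test)]
mod position_examples {
    use super::*;

    #[test]
    fn incr_returns_the_position_before_advancing() {
        let mut pos = Position::start();
        assert_eq!(pos.incr(), Position { line: 0, column: 0 });
        assert_eq!(pos.incr(), Position { line: 0, column: 1 });
        // A newline resets the column and moves to the next line.
        assert_eq!(pos.incr_line(), Position { line: 0, column: 2 });
        assert_eq!(pos.incr(), Position { line: 1, column: 0 });
    }
}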

impl fmt::Debug for Position {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        write!(f, "{line}:{column}", line = self.line, column = self.column)
    }
}

/// A [Topoglyph] is a wrapper around a basic Rust [char] that includes information about where
/// that char appears in the source text. Where the char only describes the
/// [glyph](https://en.wikipedia.org/wiki/Glyph) (i.e., the graphical symbol), a topoglyph
/// includes both the glyph and its position, to be used to describe the locations of parsed
/// elements within a source text. Two glyphs appearing at different locations within a source
/// text would correspond to two distinct topoglyphs.
#[derive(PartialEq, Clone)]
pub struct Topoglyph {
    /// the unicode code point of the glyph
    glyph: char,

    /// The visual position in which the glyph appears; i.e., the human-comprehensible location
    /// of the glyph in the source text
    position: Position,

    /// The byte offsets corresponding to this topoglyph in the source data; i.e., the
    /// machine-comprehensible location of the glyph in the source text
    bytes: Range<u64>,
}

impl fmt::Debug for Topoglyph {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position)
    }
}

/// A topoglypher produces [topoglyphs](Topoglyph) for a source text; i.e., it is an iterator of
/// topoglyphs. The topoglypher is used to control reading from the source text and keeps a
/// lookahead buffer of topoglyphs that have not been processed. While a [Lexer] is responsible
/// for the creation and iteration of [tokens](Token), a topoglypher is responsible for the
/// creation and iteration of topoglyphs.
struct Topoglypher<'text> {
    source: Chars<'text>,
    next_position: Position,
    bytes_read: u64,
    lookahead: VecDeque<Topoglyph>,
}

impl<'text> Topoglypher<'text> {
    fn new(source: &'text str) -> Self {
        Self {
            source: source.chars(),
            next_position: Position::start(),
            bytes_read: 0,
            lookahead: VecDeque::new(),
        }
    }

    /// Pulls glyphs from the underlying source until the lookahead buffer holds `n` of them
    /// (or the source runs out), reporting whether at least `n` glyphs are now buffered.
    fn feed(&mut self, n: usize) -> bool {
        while self.lookahead.len() < n {
            let c = match self.source.next() {
                Some(c) => c,
                None => break,
            };

            let len = c.len_utf8();
            let start = self.bytes_read;
            self.bytes_read += len as u64;
            let position = if c == '\n' {
                self.next_position.incr_line()
            } else {
                self.next_position.incr()
            };
            self.lookahead.push_back(Topoglyph {
                glyph: c,
                position,
                bytes: Range {
                    start,
                    end: self.bytes_read,
                },
            });
        }
        self.lookahead.len() >= n
    }

    fn peek(&mut self) -> Option<&Topoglyph> {
        self.peek_at(0)
    }

    fn pop(&mut self) -> Result<Topoglyph, LexError> {
        self.next().ok_or(LexError::UnexpectedEOF)
    }

    fn peek_at(&mut self, idx: usize) -> Option<&Topoglyph> {
        self.feed(idx + 1);
        self.lookahead.get(idx)
    }

    fn next_is<F>(&mut self, pred: F) -> bool
    where
        F: FnOnce(&Topoglyph) -> bool,
    {
        self.peek().map(pred).unwrap_or(false)
    }

    fn is_empty(&mut self) -> bool {
        self.peek().is_none()
    }

    pub fn yeet_while<F>(&mut self, mut pred: F)
    where
        F: FnMut(&Topoglyph) -> bool,
    {
        while let Some(g) = self.peek() {
            if pred(g) {
                self.next();
            } else {
                return;
            }
        }
    }

    fn yeet_whitespace(&mut self) {
        self.yeet_while(|tg| tg.glyph.is_whitespace());
    }

    fn keep_word(&mut self) -> Result<Lexeme, LexError> {
        let gs = self.keep_until(|g| g.glyph.is_whitespace());
        if gs.is_empty() {
            return Err(LexError::ExpectedWordCharacter);
        }
        Ok(Lexeme::from(gs))
    }

    fn keep_while<F>(&mut self, mut pred: F) -> Vec<Topoglyph>
    where
        F: FnMut(&Topoglyph) -> bool,
    {
        let mut keep = Vec::new();

        while let Some(g) = self.peek() {
            if pred(g) {
                keep.push(g.clone());
                self.next();
            } else {
                break;
            }
        }

        keep
    }

    fn keep_until<F>(&mut self, mut pred: F) -> Vec<Topoglyph>
    where
        F: FnMut(&Topoglyph) -> bool,
    {
        self.keep_while(|g| !pred(g))
    }
}

impl<'text> Iterator for Topoglypher<'text> {
    type Item = Topoglyph;

    fn next(&mut self) -> Option<Self::Item> {
        self.feed(1);
        self.lookahead.pop_front()
    }
}
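
// Illustrative sketch, not part of the commit itself: walking a two-line input with the
// Topoglypher and checking the position and byte range recorded for each glyph. The module
// and test names are invented for the example.
#[cfg(test)]
mod topoglypher_examples {
    use super::*;

    #[test]
    fn tracks_lines_columns_and_byte_offsets() {
        let mut glyphs = Topoglypher::new("a\nb");

        // Peeking fills the lookahead buffer without consuming anything.
        assert_eq!(glyphs.peek_at(2).map(|g| g.glyph), Some('b'));

        let a = glyphs.next().unwrap();
        assert_eq!(a.glyph, 'a');
        assert_eq!(a.position, Position { line: 0, column: 0 });
        assert_eq!(a.bytes, 0..1);

        let newline = glyphs.next().unwrap();
        assert_eq!(newline.position, Position { line: 0, column: 1 });

        let b = glyphs.next().unwrap();
        assert_eq!(b.position, Position { line: 1, column: 0 });
        assert_eq!(b.bytes, 2..3);

        assert!(glyphs.next().is_none());
    }
}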

/// A Lexeme is the text of a given Token, without respect to that Token's type, but with respect
/// to where the text appears relative to some source code. This is, simply, a string that
/// contains the addresses of each of its characters with respect to some source text.
#[derive(PartialEq)]
pub struct Lexeme {
    elems: Vec<Topoglyph>,
}

impl Lexeme {
    fn span(&self) -> Option<Range<Position>> {
        if self.elems.is_empty() {
            return None;
        }
        Some(Range {
            start: self.elems[0].position,
            end: self.elems[self.elems.len() - 1].position,
        })
    }

    fn text(&self) -> String {
        self.elems.as_slice().iter().map(|tg| tg.glyph).collect()
    }
}

impl fmt::Debug for Lexeme {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        let span = match self.span() {
            Some(span) => span,
            None => return write!(f, "<empty Lexeme>"),
        };

        write!(
            f,
            "<{text} @{start_line}:{start_column}-{end_line}:{end_column}>",
            start_line = span.start.line,
            start_column = span.start.column,
            end_line = span.end.line,
            end_column = span.end.column,
            text = self.text(),
        )
    }
}

impl fmt::Display for Lexeme {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        write!(f, "{}", self.text())
    }
}

impl From<Vec<Topoglyph>> for Lexeme {
    fn from(v: Vec<Topoglyph>) -> Self {
        Self { elems: v }
    }
}
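
// Illustrative sketch, not part of the commit itself: building a Lexeme from the glyphs of a
// short input and reading back its text and span. The module and test names are invented for
// the example.
#[cfg(test)]
mod lexeme_examples {
    use super::*;

    #[test]
    fn text_and_span_of_a_word() {
        let glyphs: Vec<Topoglyph> = Topoglypher::new("abc").collect();
        let lexeme = Lexeme::from(glyphs);

        assert_eq!(lexeme.text(), "abc");

        // The span runs from the first glyph's position to the last glyph's position.
        let span = lexeme.span().expect("a non-empty lexeme has a span");
        assert_eq!(span.start, Position { line: 0, column: 0 });
        assert_eq!(span.end, Position { line: 0, column: 2 });
    }
}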

#[allow(dead_code)]
#[derive(Debug, PartialEq)]
pub enum Token {
    BareString(Lexeme),
    Glob(Lexeme),
}

struct Lexer<'text> {
    source: Topoglypher<'text>,
}

impl<'text> Lexer<'text> {
    fn new(text: &'text str) -> Self {
        Self {
            source: Topoglypher::new(text),
        }
    }

    fn next_token(&mut self) -> Option<Result<Token, LexError>> {
        self.source.yeet_whitespace();
        let next = self.source.next()?;

        match next.glyph {
            _ if next.glyph.is_alphabetic() => Some(self.lex_bare_string(vec![next])),
            '\\' => match self.source.pop() {
                Ok(escaped) => Some(self.lex_bare_string(vec![escaped])),
                Err(e) => Some(Err(e)),
            },
            '@' => Some(self.lex_var(vec![next])),
            '\'' => Some(self.lex_raw_string(vec![next])),
            '"' => Some(self.lex_interp_string(vec![next])),
            _ => Some(Err(LexError::UnexpectedCharacter(next))),
        }
    }

    fn lex_bare_string(&mut self, mut progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                _ if next.glyph.is_whitespace() => break,
                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
                }
                '*' | '?' => return self.lex_glob(progress),
                _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
            }
        }

        if progress.is_empty() {
            Err(LexError::UnexpectedEOF)
        } else {
            Ok(Token::BareString(progress.into()))
        }
    }

    fn lex_glob(&mut self, mut progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                _ if next.glyph.is_whitespace() => break,
                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
                '*' | '?' => progress.push(self.source.pop()?),
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
                }
                _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
            }
        }

        if progress.is_empty() {
            Err(LexError::UnexpectedEOF)
        } else {
            Ok(Token::Glob(progress.into()))
        }
    }

    fn lex_raw_string(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("raw strings not done yet"))
    }

    fn lex_interp_string(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("interpreted strings not done yet"))
    }

    fn lex_var(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("variables are not done yet"))
    }
}

impl<'text> Iterator for Lexer<'text> {
    type Item = Result<Token, LexError>;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}
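
// Illustrative sketch, not part of the commit itself: lexing a couple of hand-picked inputs to
// show the two token kinds that currently come out of the Lexer, and how backslash escapes fold
// into a bare string. The module and test names are invented for the example.
#[cfg(test)]
mod lexer_examples {
    use super::*;

    #[test]
    fn bare_strings_and_globs() {
        let tokens: Vec<Token> = Lexer::new("cat notes*")
            .collect::<Result<Vec<Token>, LexError>>()
            .expect("this input should lex");

        assert_eq!(tokens.len(), 2);
        match &tokens[0] {
            Token::BareString(lexeme) => assert_eq!(lexeme.text(), "cat"),
            other => panic!("expected a bare string, got {other:?}"),
        }
        match &tokens[1] {
            // A bare string switches to a glob as soon as a glob character appears.
            Token::Glob(lexeme) => assert_eq!(lexeme.text(), "notes*"),
            other => panic!("expected a glob, got {other:?}"),
        }
    }

    #[test]
    fn escaped_whitespace_stays_in_one_bare_string() {
        let tokens: Vec<Token> = Lexer::new(r"one\ two")
            .collect::<Result<Vec<Token>, LexError>>()
            .expect("this input should lex");

        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            // The backslash is dropped and the escaped space is kept literally.
            Token::BareString(lexeme) => assert_eq!(lexeme.text(), "one two"),
            other => panic!("expected a bare string, got {other:?}"),
        }
    }
}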

#[cfg(test)]
mod tests {
    use super::*;

    /// This macro allows us to specify a set of inputs that we expect to lex successfully.
    macro_rules! accept {
        ($($name:ident: $line:literal ;)+) => {$(
            #[test]
            fn $name() {
                println!("testing that we can lex the following input text:\n\t{}", $line);
                let lexer = Lexer::new($line);
                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
                match tokens {
                    Ok(tokens) => {
                        println!("output tokens: {tokens:?}");
                    }
                    Err(e) => {
                        println!("output error: {e:?}");
                        panic!("Encountered an unexpected lex error");
                    }
                }
            }
        )*};
    }

    /// This macro allows us to specify a set of inputs that we expect to fail to lex.
    macro_rules! reject {
        ($($name:ident: $line:literal ;)+) => {$(
            #[test]
            fn $name() {
                println!("testing that we will fail to lex the following input text:\n\t{}", $line);
                let lexer = Lexer::new($line);
                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
                match tokens {
                    Ok(tokens) => {
                        println!("output tokens: {tokens:?}");
                        panic!("Did not encounter an expected lex error");
                    }
                    Err(e) => {
                        println!("output error: {e:?}");
                    }
                }
            }
        )*};
    }

    reject! {
        // A slash on its own makes no sense
        lonely_slash: r"\";

        // A slash is an escape character, so starting the escape sequence and then ending the
        // input makes no sense
        trailing_slash: r"one two three \";

        // A token can't start with a glob character yet
        glob: "*";

        // Vars aren't done yet
        var: "@name";

        // Single-quoted strings aren't done yet
        strings: r"echo 'one' two";

        // Double-quoted strings aren't done yet
        double_quoted_strings: r#"echo "one" two"#;
    }

    accept! {
        empty: "";

        spaces: " ";

        identifier: "a";

        identifier_2: " a";

        identifier_3: "a ";

        identifier_4: " a ";

        multi_idents: "one two three four ";
    }
}

/*

Run a program or command named a, which is on the PATH

> a

Run a program or command named a, which is in the current directory

> ./a

*/