I'm redoing the lexing and parsing
parent
4a0db72d4e
commit
d152c4092a
@ -0,0 +1,466 @@
|
||||
use crate::error::LexError;
|
||||
use std::{collections::VecDeque, fmt, ops::Range, str::Chars};
|
||||
|
||||
/// Reports whether `c` is a glyph with special meaning inside a glob
/// pattern: `*` or `?`.
// Not referenced by the lexer yet; allowance mirrors the one on `Token`.
#[allow(dead_code)]
fn is_glob(c: char) -> bool {
    // `matches!` replaces the match-returning-bool boilerplate
    // (clippy::match_like_matches_macro).
    matches!(c, '*' | '?')
}
|
||||
|
||||
/// Reports whether `c` is a special (non-literal) glyph. Currently only
/// `?` is considered special.
// Not referenced by the lexer yet; allowance mirrors the one on `Token`.
#[allow(dead_code)]
fn is_special(c: char) -> bool {
    // A single-pattern match-to-bool is just an equality test.
    c == '?'
}
|
||||
|
||||
/// Reports whether `s` is a reserved keyword. Currently only `for` is
/// reserved.
// Not referenced by the lexer yet; allowance mirrors the one on `Token`.
#[allow(dead_code)]
fn is_keyword(s: &str) -> bool {
    // A single-pattern match-to-bool is just an equality test.
    s == "for"
}
|
||||
|
||||
/// The position of a specific glyph within a corpus of text
// `Eq` is derivable for free here (both fields are `u64`, which has a total
// equality), and makes `Position` usable wherever full equivalence is
// required (e.g. as a key).
#[derive(PartialEq, Eq, Clone, Copy)]
pub struct Position {
    /// The visual line in which this glyph appears in the source text
    line: u64,

    /// The visual column in which this glyph appears in the source text
    column: u64,
}
|
||||
|
||||
impl Position {
    /// The position of the first glyph of any source text: line 0, column 0.
    fn start() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Advances this position one column to the right, returning the
    /// position as it was *before* the advance (i.e. the position of the
    /// glyph just consumed).
    fn incr(&mut self) -> Position {
        let before = *self;
        self.column += 1;
        before
    }

    /// Advances this position to the start of the next line, returning the
    /// position as it was *before* the advance (i.e. the position of the
    /// newline just consumed).
    fn incr_line(&mut self) -> Position {
        let before = *self;
        self.line += 1;
        self.column = 0;
        before
    }
}
|
||||
|
||||
impl fmt::Debug for Position {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "{line}:{column}", line = self.line, column = self.column)
|
||||
}
|
||||
}
|
||||
|
||||
/// A [Topoglyph] is a wrapper around a basic Rust [char] that includes information about where that
/// char appears in the source text. Where the char only describes the
/// [glyph](https://en.wikipedia.org/wiki/Glyph) (i.e., the graphical symbol), a topoglyph
/// includes both the glyph and its position, to be used to describe the locations of parsed
/// elements within a source text. Two glyphs appearing at different locations within a source text
/// would correspond to two distinct topoglyphs.
#[derive(PartialEq, Clone)]
pub struct Topoglyph {
    /// the unicode code point of the glyph
    glyph: char,

    /// The visual position in which the glyph appears; i.e., the human-comprehensible location
    /// of the glyph in the source text
    position: Position,

    /// The byte offsets corresponding to this topoglyph in the source data; i.e., the
    /// machine-comprehensible location of the glyph in the source text.
    /// Spans `len_utf8()` bytes, so multi-byte glyphs cover multi-byte ranges.
    bytes: Range<u64>,
}
|
||||
|
||||
impl fmt::Debug for Topoglyph {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position)
|
||||
}
|
||||
}
|
||||
|
||||
/// A topoglypher produces [topoglyphs](Topoglyph) for a source text; i.e., it is an iterator of
/// topoglyphs. The topoglypher is used to control reading from the source text and keeps a
/// lookahead buffer of topoglyphs that have not been processed. While a [Lexer] is responsible
/// for the creation and iteration of [tokens](Token), a topoglypher is responsible for the
/// creation and iteration of topoglyphs.
struct Topoglypher<'text> {
    /// the not-yet-read characters of the source text
    source: Chars<'text>,

    /// the position that will be assigned to the next glyph pulled from `source`
    next_position: Position,

    /// total bytes consumed from `source` so far; used to compute each
    /// topoglyph's byte range
    bytes_read: u64,

    /// glyphs already pulled from `source` but not yet yielded to the caller
    lookahead: VecDeque<Topoglyph>,
}
|
||||
|
||||
impl<'text> Topoglypher<'text> {
|
||||
fn new(source: &'text str) -> Self {
|
||||
Self {
|
||||
source: source.chars(),
|
||||
next_position: Position::start(),
|
||||
bytes_read: 0,
|
||||
lookahead: VecDeque::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn feed(&mut self, n: usize) -> bool {
|
||||
while self.lookahead.len() < n {
|
||||
let c = match self.source.next() {
|
||||
Some(c) => c,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let len = c.len_utf8();
|
||||
let start = self.bytes_read;
|
||||
self.bytes_read += len as u64;
|
||||
let position = if c == '\n' {
|
||||
self.next_position.incr_line()
|
||||
} else {
|
||||
self.next_position.incr()
|
||||
};
|
||||
self.lookahead.push_back(Topoglyph {
|
||||
glyph: c,
|
||||
position,
|
||||
bytes: Range {
|
||||
start,
|
||||
end: self.bytes_read,
|
||||
},
|
||||
})
|
||||
}
|
||||
self.lookahead.len() == n
|
||||
}
|
||||
|
||||
fn peek(&mut self) -> Option<&Topoglyph> {
|
||||
self.peek_at(0)
|
||||
}
|
||||
|
||||
fn pop(&mut self) -> Result<Topoglyph, LexError> {
|
||||
self.next().ok_or(LexError::UnexpectedEOF)
|
||||
}
|
||||
|
||||
fn peek_at(&mut self, idx: usize) -> Option<&Topoglyph> {
|
||||
self.feed(idx + 1);
|
||||
self.lookahead.get(idx)
|
||||
}
|
||||
|
||||
fn next_is<F>(&mut self, pred: F) -> bool
|
||||
where
|
||||
F: FnOnce(&Topoglyph) -> bool,
|
||||
{
|
||||
self.peek().map(pred).unwrap_or(false)
|
||||
}
|
||||
|
||||
fn is_empty(&mut self) -> bool {
|
||||
self.peek().is_none()
|
||||
}
|
||||
|
||||
pub fn yeet_while<F>(&mut self, mut pred: F)
|
||||
where
|
||||
F: FnMut(&Topoglyph) -> bool,
|
||||
{
|
||||
while let Some(g) = self.peek() {
|
||||
if pred(&g) {
|
||||
self.next();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn yeet_whitespace(&mut self) {
|
||||
self.yeet_while(|tg| tg.glyph.is_whitespace());
|
||||
}
|
||||
|
||||
fn keep_word(&mut self) -> Result<Lexeme, LexError> {
|
||||
let gs = self.keep_until(|g| g.glyph.is_whitespace());
|
||||
if gs.is_empty() {
|
||||
return Err(LexError::ExpectedWordCharacter);
|
||||
}
|
||||
Ok(Lexeme::from(gs))
|
||||
}
|
||||
|
||||
fn keep_while<F>(&mut self, mut pred: F) -> Vec<Topoglyph>
|
||||
where
|
||||
F: FnMut(&Topoglyph) -> bool,
|
||||
{
|
||||
let mut keep = Vec::new();
|
||||
|
||||
while let Some(g) = self.peek() {
|
||||
if pred(&g) {
|
||||
keep.push(g.clone());
|
||||
self.next();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
keep
|
||||
}
|
||||
|
||||
fn keep_until<F>(&mut self, mut pred: F) -> Vec<Topoglyph>
|
||||
where
|
||||
F: FnMut(&Topoglyph) -> bool,
|
||||
{
|
||||
self.keep_while(|g| !pred(g))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'text> Iterator for Topoglypher<'text> {
    type Item = Topoglyph;

    /// Yields the next topoglyph from the lookahead buffer, refilling the
    /// buffer from the source text first; None once the source is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.feed(1);
        self.lookahead.pop_front()
    }
}
|
||||
|
||||
/// A Lexeme is the text of a given Token, without respect to that Token's type, but with respect
/// to where the text appears relative to some source code. This is, simply, a string that contains
/// the addresses of each of its characters with respect to some source text.
#[derive(PartialEq)]
pub struct Lexeme {
    /// the positioned glyphs making up this lexeme, in source order
    elems: Vec<Topoglyph>,
}
|
||||
|
||||
impl Lexeme {
|
||||
fn span(&self) -> Option<Range<Position>> {
|
||||
if self.elems.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(Range {
|
||||
start: self.elems[0].position,
|
||||
end: self.elems[self.elems.len() - 1].position,
|
||||
})
|
||||
}
|
||||
|
||||
fn text(&self) -> String {
|
||||
self.elems.as_slice().iter().map(|tg| tg.glyph).collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Lexeme {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
let span = match self.span() {
|
||||
Some(span) => span,
|
||||
None => return write!(f, "<empty Lexeme>"),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
"<{text} @{start_line}:{start_column}-{end_line}:{end_column}>",
|
||||
start_line = span.start.line,
|
||||
start_column = span.start.column,
|
||||
end_line = span.end.line,
|
||||
end_column = span.end.column,
|
||||
text = self.text(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Lexeme {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "{}", self.text())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<Topoglyph>> for Lexeme {
|
||||
fn from(v: Vec<Topoglyph>) -> Self {
|
||||
Self { elems: v }
|
||||
}
|
||||
}
|
||||
|
||||
/// The kinds of tokens produced by the [Lexer], each carrying the
/// [Lexeme] (text plus source positions) it was lexed from.
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
pub enum Token {
    /// an unquoted word, e.g. a command name or argument
    BareString(Lexeme),

    /// a word containing at least one glob character (`*` or `?`)
    Glob(Lexeme),
}
|
||||
|
||||
/// Turns a source text into a stream of [tokens](Token), reading positioned
/// glyphs from an underlying [Topoglypher].
struct Lexer<'text> {
    /// the glyph stream this lexer consumes
    source: Topoglypher<'text>,
}
|
||||
|
||||
impl<'text> Lexer<'text> {
    /// Creates a lexer positioned at the very beginning of `text`.
    fn new(text: &'text str) -> Self {
        Self {
            source: Topoglypher::new(text),
        }
    }

    /// Produces the next token, skipping any leading whitespace. Returns
    /// None once the source is exhausted, and Some(Err(..)) when the text
    /// cannot be tokenized.
    fn next_token(&mut self) -> Option<Result<Token, LexError>> {
        self.source.yeet_whitespace();
        let next = self.source.next()?;

        // The first glyph decides which token family we are lexing; it is
        // carried forward as the in-progress token text.
        match next.glyph {
            _ if next.glyph.is_alphabetic() => Some(self.lex_bare_string(vec![next])),
            // A backslash escapes whatever follows it: the escaped glyph
            // (not the backslash itself) begins a bare string. End of input
            // immediately after the backslash is an error.
            '\\' => match self.source.pop() {
                Ok(escaped) => Some(self.lex_bare_string(vec![escaped])),
                Err(e) => Some(Err(e)),
            },
            '@' => Some(self.lex_var(vec![next])),
            '\'' => Some(self.lex_raw_string(vec![next])),
            '"' => Some(self.lex_interp_string(vec![next])),
            // Anything else — including a leading glob character — is
            // rejected here.
            _ => Some(Err(LexError::UnexpectedCharacter(next))),
        }
    }

    /// Lexes a bare (unquoted) string, continuing from the glyphs already
    /// accumulated in `progress`. Encountering a glob character upgrades the
    /// whole token to a glob.
    fn lex_bare_string(&mut self, mut progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                // whitespace ends the token and is left unconsumed for the
                // next token's leading-whitespace skip
                _ if next.glyph.is_whitespace() => break,
                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
                // drop the backslash, keep the glyph it escapes
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
                }
                // a glob character means this token is actually a glob
                '*' | '?' => return self.lex_glob(progress),
                _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
            }
        }

        // NOTE(review): every current caller seeds `progress` with at least
        // one glyph, so this branch looks unreachable today — kept as a
        // defensive check.
        if progress.is_empty() {
            Err(LexError::UnexpectedEOF)
        } else {
            Ok(Token::BareString(progress.into()))
        }
    }

    /// Lexes a glob token, continuing from the glyphs already accumulated in
    /// `progress`. Same shape as [lex_bare_string](Self::lex_bare_string)
    /// except glob characters are kept rather than triggering an upgrade.
    fn lex_glob(&mut self, mut progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        while let Some(next) = self.source.peek() {
            match next.glyph {
                _ if next.glyph.is_whitespace() => break,
                _ if next.glyph.is_alphanumeric() => progress.push(self.source.pop()?),
                '*' | '?' => progress.push(self.source.pop()?),
                // drop the backslash, keep the glyph it escapes
                '\\' => {
                    self.source.pop()?;
                    progress.push(self.source.pop()?);
                }
                _ => return Err(LexError::UnexpectedCharacter(self.source.pop()?)),
            }
        }

        // defensive: callers always pass a non-empty `progress` today
        if progress.is_empty() {
            Err(LexError::UnexpectedEOF)
        } else {
            Ok(Token::Glob(progress.into()))
        }
    }

    /// Lexing of single-quoted (raw) strings — not implemented yet.
    fn lex_raw_string(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("raw strings not done yet"))
    }

    /// Lexing of double-quoted (interpreted) strings — not implemented yet.
    fn lex_interp_string(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("interpreted strings not done yet"))
    }

    /// Lexing of `@`-prefixed variables — not implemented yet.
    fn lex_var(&mut self, _progress: Vec<Topoglyph>) -> Result<Token, LexError> {
        Err(LexError::not_yet("variables are not done yet"))
    }
}
|
||||
|
||||
impl<'text> Iterator for Lexer<'text> {
    type Item = Result<Token, LexError>;

    /// Delegates to [next_token](Lexer::next_token): None at end of input,
    /// Some(Err(..)) on a lex failure.
    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// this macro allows us to specify a set of inputs that we expect to lex successfully.
    macro_rules! accept {
        ($($name:ident: $line:literal ;)+) => {$(
            #[test]
            fn $name() {
                println!("testing that we can lex the following input text:\n\t{}", $line);
                let lexer = Lexer::new($line);
                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
                match tokens {
                    Ok(tokens) => {
                        println!("output tokens: {tokens:?}");
                    }
                    Err(e) => {
                        println!("output error: {e:?}");
                        panic!("Encounter an unexpected lex error");
                    }
                }
            }
        )*};
    }

    /// this macro allows us to specify a set of inputs that we expect to fail to lex successfully.
    macro_rules! reject {
        ($($name:ident: $line:literal ;)+) => {$(
            #[test]
            fn $name() {
                println!("testing that we will fail to lex the following input text:\n\t{}", $line);
                let lexer = Lexer::new($line);
                let tokens: Result<Vec<Token>, LexError> = lexer.collect();
                match tokens {
                    Ok(tokens) => {
                        println!("output tokens: {tokens:?}");
                        panic!("Did not encounter an expected lex error");
                    }
                    Err(e) => {
                        println!("output error: {e:?}");
                    }
                }
            }
        )*};
    }

    reject! {
        // A slash on its own makes no sense
        lonely_slash: r"\";

        // A slash is an escape character, so starting the escape sequence and then ending the
        // input makes no sense
        trailing_slash: r"one two three \";

        // A token may not *begin* with a glob character: next_token has no
        // arm for '*', so a bare glob is rejected
        glob: "*";

        // Vars aren't done yet
        var: "@name";

        // Single-quoted strings aren't done yet
        strings: r"echo 'one' two";

        // Double-quoted (interpreted) strings aren't done yet either
        double_quoted_strings: r#"echo "one" two"#;
    }

    accept! {
        empty: "";

        spaces: " ";

        identifier: "a";

        identifier_2: " a";

        identifier_3: "a ";

        identifier_4: " a ";

        multi_idents: "one two three four ";
    }
}
|
||||
|
||||
/*
|
||||
|
||||
Run a program or command named a, which is on the PATH
|
||||
|
||||
> a
|
||||
|
||||
Run a program or command named a, which is in the current directory
|
||||
|
||||
> ./a
|
||||
|
||||
*/
|
Loading…
Reference in New Issue