diff --git a/Cargo.toml b/Cargo.toml index bcc833c..d52ad06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ thiserror = "1.0" macros = { path = "macros" } dirs = "4" -log = "0.4" +log = { version = "0.4", features = [ "max_level_off", "release_max_level_off" ] } [dependencies.windows] version = "0.44.0" diff --git a/src/builtins.rs b/src/builtins.rs new file mode 100644 index 0000000..0c4090c --- /dev/null +++ b/src/builtins.rs @@ -0,0 +1,3 @@ +pub trait BuiltinFn { + fn call(&self, args: Vec<&str>); +} diff --git a/src/error.rs b/src/error.rs index ae5ac73..fa22456 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,4 +1,4 @@ -use crate::lex::{Token, Topoglyph}; +use crate::{lex::Token, topo::Glyph}; use std::io; use thiserror::Error; use windows::Win32::Foundation::{GetLastError, BOOL}; @@ -21,7 +21,7 @@ pub enum LexError { ExpectedWordCharacter, #[error("unexpected character: {0:?}")] - UnexpectedCharacter(Topoglyph), + UnexpectedCharacter(Glyph), #[error("unexpected eof")] UnexpectedEOF, diff --git a/src/lex.rs b/src/lex.rs index 89f9389..f270261 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -1,228 +1,12 @@ -use crate::error::LexError; +use crate::{ + error::LexError, + topo::{Glyph, Glyphs, Position}, +}; use std::{collections::VecDeque, fmt, ops::Range, str::Chars}; -/// The position of a specific glyph within a corpus of text. We use this for rendering error -/// messages and communicating to the user the location of errors. -#[derive(PartialEq, Clone, Copy)] -pub struct Position { - /// The visual line in which this glyph appears in the source text - line: u64, - - /// The visual column in which this glyph appears in the source text - column: u64, -} - -impl Position { - fn start() -> Self { - Self { line: 0, column: 0 } - } - - /// Increments position by column, going from the current line,column position to the next - /// column on the same line. - fn incr(&mut self) -> Position { - let p = *self; - self.column += 1; - p - } - - /// Increments the position by line, going from the current line,column position to the - /// beginning of the next line. - fn incr_line(&mut self) -> Position { - let p = *self; - self.column = 0; - self.line += 1; - p - } -} - -impl fmt::Debug for Position { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - write!(f, "{line}:{column}", line = self.line, column = self.column) - } -} - -/// A [Topoglyph] is a wrapper around a basic Rust [char] that includes information about where that -/// char appears in the source text. Where the char only describes the -/// [glyph](https://en.wikipedia.org/wiki/Glyph) (i.e., the graphical symbol), a topoglyph -/// includes both the glyph and its position, to be used to describe the locations of parsed -/// elements within a source text. Two glyphs appearing at different locations within a source text -/// would correspond to two distinct topoglyphs. -#[derive(PartialEq, Clone)] -pub struct Topoglyph { - /// the unicode code point of the glyph - glyph: char, - - /// The visual position in which the glyph appears; i.e., the human-comprehensible location - /// of the glyph in the source text - position: Position, - - /// The byte offsets corresponding to this topoglyph in the source data; i.e., the - /// machine-comprehensible location of the glyph in the source text - bytes: Range, -} - -impl Topoglyph { - fn is_word(&self) -> bool { - self.glyph.is_alphanumeric() || self.glyph == '.' - } - - fn is_glob(&self) -> bool { - self.is_word() || self.glyph == '*' - } -} - -impl fmt::Debug for Topoglyph { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - write!(f, "{char}@{pos:?}", char = self.glyph, pos = self.position) - } -} - -/// A topoglypher produces [topoglyphs](Topoglyph) for a source text; i.e., it is an iterator of -/// topoglyphs. The topoglypher is used to control reading from the source text and keeps a -/// lookahead buffer of topoglyphs that have not been processed. While a [Lexer] is responsible -/// for the creation and iteration of [tokens](Token), a topoglypher is responsible for the -/// creation and iteration of topoglyphs. -struct Topoglypher<'text> { - source: Chars<'text>, - next_position: Position, - bytes_read: u64, - lookahead: VecDeque, -} - -impl<'text> Topoglypher<'text> { - fn new(source: &'text str) -> Self { - Self { - source: source.chars(), - next_position: Position::start(), - bytes_read: 0, - lookahead: VecDeque::new(), - } - } - - /// reads the next n characters from the source text into our lookahead buffer - fn fill_lookahead(&mut self, n: usize) -> bool { - while self.lookahead.len() < n { - let c = match self.source.next() { - Some(c) => c, - None => break, - }; - - let len = c.len_utf8(); - let start = self.bytes_read; - self.bytes_read += len as u64; - let position = if c == '\n' { - self.next_position.incr_line() - } else { - self.next_position.incr() - }; - self.lookahead.push_back(Topoglyph { - glyph: c, - position, - bytes: Range { - start, - end: self.bytes_read, - }, - }) - } - self.lookahead.len() == n - } - - /// returns a reference to the next character from the source text, advancing our internal - /// lookahead buffer if necessary. Returns None if we're already at the end of our source text. - fn peek(&mut self) -> Option<&Topoglyph> { - self.peek_at(0) - } - - /// takes the next character from our input text - fn pop(&mut self) -> Result { - self.next().ok_or(LexError::UnexpectedEOF) - } - - /// returns a reference to a character in our lookahead buffer at a given position. This allows - /// us to perform a lookahead read without consuming any tokens, maintaining our current - /// position and keeping our unconsumed characters safe. - fn peek_at(&mut self, idx: usize) -> Option<&Topoglyph> { - self.fill_lookahead(idx + 1); - self.lookahead.get(idx) - } - - /// checks whether or not the next character in our source text matches some predicate - fn next_is(&mut self, pred: F) -> bool - where - F: FnOnce(&Topoglyph) -> bool, - { - self.peek().map(pred).unwrap_or(false) - } - - /// checks whether or not we're already at the end of our input text. If we're already at the - /// end of our input text, we do not expect any future reads to produce new characters. - fn at_eof(&mut self) -> bool { - self.peek().is_none() - } - - /// discards characters from our current position so long as the upcoming characters match some - /// predicate. This is called yeet_while instead of skip_while in order to avoid conflicting - /// with the - /// [skip_while](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.skip_while) - /// method of the stdlib Iterator trait. - pub fn yeet_while(&mut self, mut pred: F) - where - F: FnMut(&Topoglyph) -> bool, - { - while let Some(g) = self.peek() { - if pred(&g) { - self.next(); - } else { - return; - } - } - } - - fn yeet_whitespace(&mut self) { - self.yeet_while(|tg| tg.glyph.is_whitespace()); - } - - fn keep_word(&mut self) -> Result { - let gs = self.keep_until(|g| g.glyph.is_whitespace()); - if gs.is_empty() { - return Err(LexError::ExpectedWordCharacter); - } - Ok(Lexeme::from(gs)) - } - - fn keep_while(&mut self, mut pred: F) -> Vec - where - F: FnMut(&Topoglyph) -> bool, - { - let mut keep = Vec::new(); - - while let Some(g) = self.peek() { - if pred(&g) { - keep.push(g.clone()); - self.next(); - } else { - break; - } - } - - keep - } - - fn keep_until(&mut self, mut pred: F) -> Vec - where - F: FnMut(&Topoglyph) -> bool, - { - self.keep_while(|g| !pred(g)) - } -} - -impl<'text> Iterator for Topoglypher<'text> { - type Item = Topoglyph; - - fn next(&mut self) -> Option { - self.fill_lookahead(1); - self.lookahead.pop_front() - } +/// splits a corpus into Tokens. +pub fn lex(source: &str) -> Result, LexError> { + Lexer::new(source).collect() } /// A Lexeme is the text of a given Token, without respect to that Token's type, but with respect @@ -230,7 +14,7 @@ impl<'text> Iterator for Topoglypher<'text> { /// the addresses of each of its characters with respect to some source text. #[derive(PartialEq, Clone)] pub struct Lexeme { - elems: Vec, + elems: Vec, } impl Lexeme { @@ -274,8 +58,8 @@ impl fmt::Display for Lexeme { } } -impl From> for Lexeme { - fn from(v: Vec) -> Self { +impl From> for Lexeme { + fn from(v: Vec) -> Self { Self { elems: v } } } @@ -303,13 +87,13 @@ impl Token { } pub struct Tokenizer<'text> { - source: Topoglypher<'text>, + source: Glyphs<'text>, } impl<'text> Tokenizer<'text> { pub fn new(text: &'text str) -> Self { Self { - source: Topoglypher::new(text), + source: Glyphs::new(text), } } @@ -331,7 +115,7 @@ impl<'text> Tokenizer<'text> { } } - fn lex_bare_string(&mut self, mut progress: Vec) -> Result { + fn lex_bare_string(&mut self, mut progress: Vec) -> Result { while let Some(next) = self.source.peek() { match next.glyph { _ if next.glyph.is_whitespace() => break, @@ -355,7 +139,7 @@ impl<'text> Tokenizer<'text> { } } - fn lex_glob(&mut self, mut progress: Vec) -> Result { + fn lex_glob(&mut self, mut progress: Vec) -> Result { while let Some(next) = self.source.peek() { match next.glyph { _ if next.glyph.is_whitespace() => break, @@ -375,15 +159,15 @@ impl<'text> Tokenizer<'text> { } } - fn lex_raw_string(&mut self, _progress: Vec) -> Result { + fn lex_raw_string(&mut self, _progress: Vec) -> Result { Err(LexError::not_yet("raw strings not done yet")) } - fn lex_interp_string(&mut self, _progress: Vec) -> Result { + fn lex_interp_string(&mut self, _progress: Vec) -> Result { Err(LexError::not_yet("interpreted strings not done yet")) } - fn lex_var(&mut self, _progress: Vec) -> Result { + fn lex_var(&mut self, _progress: Vec) -> Result { Err(LexError::not_yet("variables are not done yet")) } } @@ -396,10 +180,6 @@ impl<'text> Iterator for Tokenizer<'text> { } } -pub fn lex(source: &str) -> Result, LexError> { - Tokenizer::new(source).collect() -} - pub struct Lexer<'text> { source: Tokenizer<'text>, lookahead: VecDeque, @@ -451,9 +231,9 @@ mod tests { use std::iter::zip; fn lexeme(txt: &str) -> Lexeme { - let x: Vec = txt + let x: Vec = txt .chars() - .map(|c| Topoglyph { + .map(|c| Glyph { glyph: c, position: Position::start(), bytes: 0..0, diff --git a/src/main.rs b/src/main.rs index a0061a2..fac1718 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +mod builtins; mod error; mod ext; mod input; @@ -10,6 +11,7 @@ mod parse; mod prompt; mod shell; mod syntax; +mod topo; use crate::log::*; use prompt::Prompt; @@ -55,7 +57,12 @@ fn main() -> Result<()> { shell.output.newline()?; let s = shell.line.pop(); info!("◇ {}", s); - match syntax::x(&s) { + if let Ok(tokens) = lex::lex(&s) { + for t in tokens { + debug!(" {:?}", t); + } + } + match syntax::parse(&s) { Ok(tree) => { debug!(" {:?}", tree); let mut state = syntax::State::new(); diff --git a/src/parse.rs b/src/parse.rs index 265e5eb..c601158 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -3,6 +3,7 @@ use crate::lex::{Lexer, Token}; use std::{ cell::RefCell, collections::VecDeque, + io::Write, rc::{Rc, Weak}, sync::atomic::AtomicUsize, }; @@ -152,6 +153,13 @@ impl Cursor { idx: 0, } } + + pub fn render_textree(&self, w: &mut W, depth: u32) { + write!(w, "{:?} {pad:?}", self.target.value, pad = depth * 2); + for child in self.iter_children() { + child.render_textree(w, depth + 1); + } + } } pub struct Parser<'text> { diff --git a/src/shell.rs b/src/shell.rs index abfc9cd..b2c0695 100644 --- a/src/shell.rs +++ b/src/shell.rs @@ -3,7 +3,7 @@ use crate::{ input, line::Line, log::*, - output, + output, syntax, }; use std::path::{Path, PathBuf}; @@ -15,6 +15,7 @@ pub struct Shell { pub input: input::Reader, pub output: output::Writer, pub line: Line, + pub state: syntax::State, } impl Shell { @@ -23,6 +24,7 @@ impl Shell { input: input::Reader::new()?, output: output::Writer::stdout()?, line: Line::new(), + state: syntax::State::new(), }) } @@ -49,7 +51,7 @@ impl Shell { } pub fn seek_right(&mut self) -> Result<()> { - info!("» seek right"); + info!("»"); let n = self.line.seek_right(); if n > 0 { // move right by the distance seeked @@ -59,7 +61,7 @@ impl Shell { } pub fn seek_left(&mut self) -> Result<()> { - info!("» seek left"); + info!("«"); let n = self.line.seek_left(); if n > 0 { // move left by the distance seeked diff --git a/src/syntax.rs b/src/syntax.rs index 94f7216..da1fad1 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -1,15 +1,26 @@ use crate::{ + builtins::BuiltinFn, error::{ExecError, ParseError}, lex::{Lexer, Token}, - parse, + log::debug, + parse, syntax, +}; +use std::{ + collections::{HashMap, HashSet}, + process, }; -use std::{collections::HashSet, process}; -pub struct State {} +pub struct State { + builtins: HashMap<&'static str, Box>, + variables: HashMap<&'static str, syntax::Value>, +} impl State { pub fn new() -> Self { - Self {} + Self { + builtins: HashMap::new(), + variables: HashMap::new(), + } } } @@ -108,7 +119,6 @@ impl Eval for Command { } } -// ????? waht am i doing now struct TreeBuilder { visited: HashSet, } @@ -160,16 +170,11 @@ impl TreeBuilder { } } -fn build(mut source: parse::Cursor) -> Result { - source.up_to_root(); - let mut builder = TreeBuilder::new(); - builder.descend(&mut source) -} - -pub fn x(source: &str) -> Result { +pub fn parse(source: &str) -> Result { let tokens = Lexer::new(source); let parser = parse::Parser::new(tokens); let mut parse_tree = parser.parse()?; + debug!("parse tree: {parse_tree:?}"); let mut builder = TreeBuilder::new(); builder.descend(&mut parse_tree) } @@ -177,11 +182,10 @@ pub fn x(source: &str) -> Result { #[cfg(test)] mod test { use super::*; - use crate::lex::lex; #[test] fn hi() -> Result<(), ParseError> { - let e = x("ls one two three")?; + let e = parse("ls one two three")?; print!("{:?}", e); todo!() //Ok(()) diff --git a/src/topo.rs b/src/topo.rs new file mode 100644 index 0000000..316acf7 --- /dev/null +++ b/src/topo.rs @@ -0,0 +1,172 @@ +use crate::error::LexError; +use std::{collections::VecDeque, fmt, ops::Range, str::Chars}; + +/// The position of a specific glyph within a corpus of text. We use this for rendering error +/// messages and communicating to the user the location of errors. +#[derive(Debug, PartialEq, Clone, Copy)] +pub struct Position { + /// The visual line in which this glyph appears in the source text + pub line: u64, + + /// The visual column in which this glyph appears in the source text + pub column: u64, +} + +impl Position { + pub fn start() -> Self { + Self { line: 0, column: 0 } + } + + /// Increments position by column, going from the current line,column position to the next + /// column on the same line. + pub fn incr(&mut self) -> Position { + let p = *self; + self.column += 1; + p + } + + /// Increments the position by line, going from the current line,column position to the + /// beginning of the next line. + pub fn incr_line(&mut self) -> Position { + let p = *self; + self.column = 0; + self.line += 1; + p + } +} + +#[derive(PartialEq, Clone)] +pub struct Glyph { + /// the unicode code point of the glyph + pub glyph: char, + + /// The visual position in which the glyph appears; i.e., the human-comprehensible location + /// of the glyph in the source text + pub position: Position, + + /// The byte offsets corresponding to this topoglyph in the source data; i.e., the + /// machine-comprehensible location of the glyph in the source text + pub bytes: Range, +} + +impl Glyph { + pub fn is_word(&self) -> bool { + self.glyph.is_alphanumeric() || self.glyph == '.' + } + + pub fn is_glob(&self) -> bool { + self.is_word() || self.glyph == '*' + } +} + +impl fmt::Debug for Glyph { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + write!( + f, + "[{char} ({pos:?})]", + char = self.glyph, + pos = self.position + ) + } +} +/// Glyphs produces [glyphs](Glyph) for a source text; i.e., it is an iterator of [Glyph] values. +/// Glyphs is used to control reading from the source text and keeps a lookahead buffer of glyphs +/// that have not been processed. While a [crate::lex::Lexer] is responsible for the creation and +/// iteration of [tokens](crate::lex::Token), Glyphs is responsible for the creation and iteration +/// of glyphs. +pub struct Glyphs<'text> { + source: Chars<'text>, + next_position: Position, + bytes_read: u64, + lookahead: VecDeque, +} + +impl<'text> Glyphs<'text> { + pub fn new(source: &'text str) -> Self { + // neat + Self { + source: source.chars(), + next_position: Position::start(), + bytes_read: 0, + lookahead: VecDeque::new(), + } + } + + /// reads the next n characters from the source text into our lookahead buffer + fn fill_lookahead(&mut self, n: usize) -> bool { + while self.lookahead.len() < n { + let c = match self.source.next() { + Some(c) => c, + None => break, + }; + + let len = c.len_utf8(); + let start = self.bytes_read; + self.bytes_read += len as u64; + let position = if c == '\n' { + self.next_position.incr_line() + } else { + self.next_position.incr() + }; + self.lookahead.push_back(Glyph { + glyph: c, + position, + bytes: Range { + start, + end: self.bytes_read, + }, + }) + } + self.lookahead.len() == n + } + + /// returns a reference to the next character from the source text, advancing our internal + /// lookahead buffer if necessary. Returns None if we're already at the end of our source text. + pub fn peek(&mut self) -> Option<&Glyph> { + self.peek_at(0) + } + + /// takes the next character from our input text + pub fn pop(&mut self) -> Result { + self.next().ok_or(LexError::UnexpectedEOF) + } + + /// returns a reference to a character in our lookahead buffer at a given position. This allows + /// us to perform a lookahead read without consuming any tokens, maintaining our current + /// position and keeping our unconsumed characters safe. + fn peek_at(&mut self, idx: usize) -> Option<&Glyph> { + self.fill_lookahead(idx + 1); + self.lookahead.get(idx) + } + + /// discards characters from our current position so long as the upcoming characters match some + /// predicate. This is called yeet_while instead of skip_while in order to avoid conflicting + /// with the + /// [skip_while](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.skip_while) + /// method of the stdlib Iterator trait. + pub fn yeet_while(&mut self, mut pred: F) + where + F: FnMut(&Glyph) -> bool, + { + while let Some(g) = self.peek() { + if pred(&g) { + self.next(); + } else { + return; + } + } + } + + pub fn yeet_whitespace(&mut self) { + self.yeet_while(|tg| tg.glyph.is_whitespace()); + } +} + +impl<'text> Iterator for Glyphs<'text> { + type Item = Glyph; + + fn next(&mut self) -> Option { + self.fill_lookahead(1); + self.lookahead.pop_front() + } +}