From cd51f4cce1bf961e411852bdbddf7df52f6c2855 Mon Sep 17 00:00:00 2001 From: Jordan Orelli Date: Sun, 28 Jan 2024 18:34:30 -0600 Subject: [PATCH] i'm redoing all the parsing lol --- README.md | 97 +++++++++++++++++- src/error.rs | 21 +++- src/lex.rs | 100 +++++++++++++++--- src/log.rs | 12 +-- src/main.rs | 5 + src/parse.rs | 4 - src/parse2.rs | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 487 insertions(+), 30 deletions(-) create mode 100644 src/parse2.rs diff --git a/README.md b/README.md index 23041b1..79ced73 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,98 @@ # clyde -A command-line shell. Initial development is for Windows because Windows has the worst starting point when it comes to CLI shells. \ No newline at end of file +A command-line shell. Initial development is for Windows because Windows has +the worst starting point when it comes to CLI shells. Assuming nothing here +works and it doesn't even compile. + +## Background + +The needs of this project reflect my professional experience working on +multiplayer video games. Prior to working in the games industry, I worked in +the web industry, primarily writing server software. The working practices of +programmers in these industries differ substantially. The large differences in +these working practices create a substantial and difficult social divide +within studios that make multiplayer games. These social differences make an +already-difficult category of software even more difficult to develop. The +goal of this project is to reduce the tooling divide that exists between +client and server developers at multiplayer game studios, with the expectation +that shrinking this tooling gap can reduce the social divide that exists +between game developers and server/infrastructure developers at game studios. + +### Windows is a hard requirement + +The first gap that appears within the greater software developer landscape is +the question of using Windows at all. 
Let us dispense with this question +immediately: Windows is a necessary tool within the game development industry. +Supporting Windows is a hard requirement of this project. + +Many necessary tools, such as tools provided by game console manufacturers and +tools that integrate with the dominant game engines (Unity and Unreal) only +support Windows. If your reaction to this is "don't use those tools" or "make +your game in Godot", please just stop. This is a very common reaction from +professional programmers who have never worked in gamedev, who don't +understand the constraints that most game developers face. Unfortunately a +large portion of programming discussion happens on Hacker News, Reddit, and +Lobsters, each of which uses a democratic structure. + +I will not attempt to convince you that Windows is broadly +unavoidable in professional game development and that Windows is a necessary +tool for game development studios, even studios that employ engineers for whom +Windows is not necessary for their individual role. If you want to advocate +for Linux superiority, please go bother Nintendo, Sony, Microsoft, etc., not +individual game developers. If you really can't get over this and feel the +need to yell at me, please walk a few hundred yards into a dense forest and +scream your complaints into the wilderness to feel better. + +The command-line environments built into Windows are incredibly primitive. +These environments are so primitive that millions of professional programmers +who work primarily on Windows are led to believe that command-line +environments themselves are *inherently* primitive. + +The limitations of built-in +command-line environments on Windows are so severe that there are entire +categories of professional computer programmers who believe that they are "not +terminal people", often without realizing that the terminal on Linux and MacOS +is a vastly different experience and different tooling ecosystem than it is on +Windows. 
Many professional Windows developers believe that CLI environments +are inherently primitive, because their only exposure to CLI environments is +through the built-in Windows shells. + +Windows ships with two built-in +shells. The first shell is the Command Prompt, which we'll refer to as +[cmd.exe](https://en.wikipedia.org/wiki/Cmd.exe) as it is colloquially known, +even when running inside of +[terminal.exe](https://en.wikipedia.org/wiki/Windows_Terminal). The second +shell is [PowerShell](https://en.wikipedia.org/wiki/PowerShell). + +### Insufficiency of cmd.exe + +The insufficiency of cmd.exe is widely understood by people with experience +with Unix-like operating systems such as Linux and MacOS, which typically +include either [bash](https://en.wikipedia.org/wiki/Bash_(Unix_shell)) or +[zsh](https://en.wikipedia.org/wiki/Z_shell). This insufficiency is severe +enough that it drives people off of Windows entirely. + +In some industries, such as games, Windows is a required development +environment, because many tools are available only on Windows. Free software +purists who want to insist that you can make video games on Linux, please feel +free to leave now, this project is not for you. + +## Terminology + +- **executable**: a shell built-in or an executable file (i.e., an .exe file) + +## Examples + + $> a + +Find an executable named `a` somewhere on `PATH` and execute it in the +foreground. + + $> ./a + +Use an executable named `a` within the current directory and execute it in the +foreground. `a` must be a file in the current directory. If `a` would refer to +both a shell built-in and a file in the current directory, use the file in the +current directory. 
+ + $> diff --git a/src/error.rs b/src/error.rs index 889c2d1..74346d3 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,4 +1,4 @@ -use crate::lex::Topoglyph; +use crate::lex::{Token, Topoglyph}; use std::io; use thiserror::Error; use windows::Win32::Foundation::{GetLastError, BOOL}; @@ -44,8 +44,23 @@ impl LexError { #[derive(Debug, Error)] pub enum ParseError { - #[error("Unexpected Token")] - UnexpectedToken, + #[error("lex error")] + LexError(#[from] LexError), + + #[error("Unexpected Token: {0:?}")] + UnexpectedToken(Token), + + #[error("Illegal attempt to climb parse tree while already at root")] + AtRootAlready, + + #[error("Illegal attempt to climb parse tree when target parent has already been dropped")] + ParentIsGone, + + #[error("Illegal attempt to double-borrow a node")] + BorrowError(#[from] std::cell::BorrowMutError), + + #[error("Illegal attempt to push a value as a child to a terminal value")] + PushOntoTerminal, } impl Error { diff --git a/src/lex.rs b/src/lex.rs index 9d86457..dd79ea3 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -22,7 +22,8 @@ fn is_keyword(s: &str) -> bool { } } -/// The position of a specific glyph within a corpus of text +/// The position of a specific glyph within a corpus of text. We use this for rendering error +/// messages and communicating to the user the location of errors. #[derive(PartialEq, Clone, Copy)] pub struct Position { /// The visual line in which this glyph appears in the source text @@ -37,12 +38,16 @@ impl Position { Self { line: 0, column: 0 } } + /// Increments position by column, going from the current line,column position to the next + /// column on the same line. fn incr(&mut self) -> Position { let p = *self; self.column += 1; p } + /// Increments the position by line, going from the current line,column position to the + /// beginning of the next line. 
fn incr_line(&mut self) -> Position { let p = *self; self.column = 0; @@ -105,7 +110,8 @@ impl<'text> Topoglypher<'text> { } } - fn feed(&mut self, n: usize) -> bool { + /// reads the next n characters from the source text into our lookahead buffer + fn fill_lookahead(&mut self, n: usize) -> bool { while self.lookahead.len() < n { let c = match self.source.next() { Some(c) => c, @@ -132,19 +138,26 @@ impl<'text> Topoglypher<'text> { self.lookahead.len() == n } + /// returns a reference to the next character from the source text, advancing our internal + /// lookahead buffer if necessary. Returns None if we're already at the end of our source text. fn peek(&mut self) -> Option<&Topoglyph> { self.peek_at(0) } + /// takes the next character from our input text fn pop(&mut self) -> Result { self.next().ok_or(LexError::UnexpectedEOF) } + /// returns a reference to a character in our lookahead buffer at a given position. This allows + /// us to perform a lookahead read without consuming any tokens, maintaining our current + /// position and keeping our unconsumed characters safe. fn peek_at(&mut self, idx: usize) -> Option<&Topoglyph> { - self.feed(idx + 1); + self.fill_lookahead(idx + 1); self.lookahead.get(idx) } + /// checks whether or not the next character in our source text matches some predicate fn next_is(&mut self, pred: F) -> bool where F: FnOnce(&Topoglyph) -> bool, @@ -152,10 +165,17 @@ impl<'text> Topoglypher<'text> { self.peek().map(pred).unwrap_or(false) } - fn is_empty(&mut self) -> bool { + /// checks whether or not we're already at the end of our input text. If we're already at the + /// end of our input text, we do not expect any future reads to produce new characters. + fn at_eof(&mut self) -> bool { self.peek().is_none() } + /// discards characters from our current position so long as the upcoming characters match some + /// predicate. 
This is called yeet_while instead of skip_while in order to avoid conflicting + /// with the + /// [skip_while](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.skip_while) + /// method of the stdlib Iterator trait. pub fn yeet_while(&mut self, mut pred: F) where F: FnMut(&Topoglyph) -> bool, @@ -211,7 +231,7 @@ impl<'text> Iterator for Topoglypher<'text> { type Item = Topoglyph; fn next(&mut self) -> Option { - self.feed(1); + self.fill_lookahead(1); self.lookahead.pop_front() } } @@ -219,7 +239,7 @@ impl<'text> Iterator for Topoglypher<'text> { /// A Lexeme is the text of a given Token, without respect to that Token's type, but with respect /// to where the text appears relative to some source code. This is, simply, a string that contains /// the addresses of each of its characters with respect to some source text. -#[derive(PartialEq)] +#[derive(PartialEq, Clone)] pub struct Lexeme { elems: Vec, } @@ -272,18 +292,18 @@ impl From> for Lexeme { } #[allow(dead_code)] -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum Token { - BareString(Lexeme), + String(Lexeme), Glob(Lexeme), } -struct Lexer<'text> { +pub struct Tokenizer<'text> { source: Topoglypher<'text>, } -impl<'text> Lexer<'text> { - fn new(text: &'text str) -> Self { +impl<'text> Tokenizer<'text> { + pub fn new(text: &'text str) -> Self { Self { source: Topoglypher::new(text), } @@ -323,7 +343,7 @@ impl<'text> Lexer<'text> { if progress.is_empty() { Err(LexError::UnexpectedEOF) } else { - Ok(Token::BareString(progress.into())) + Ok(Token::String(progress.into())) } } @@ -361,7 +381,7 @@ impl<'text> Lexer<'text> { } } -impl<'text> Iterator for Lexer<'text> { +impl<'text> Iterator for Tokenizer<'text> { type Item = Result; fn next(&mut self) -> Option { @@ -369,6 +389,55 @@ impl<'text> Iterator for Lexer<'text> { } } +pub fn lex(source: &str) -> Result, LexError> { + Tokenizer::new(source).collect() +} + +pub struct Lexer<'text> { + source: Tokenizer<'text>, + lookahead: 
VecDeque, +} + +impl<'text> Lexer<'text> { + pub fn new(source: &'text str) -> Self { + Self { + source: Tokenizer::new(source), + lookahead: VecDeque::new(), + } + } + + fn fill_lookahead(&mut self, n: usize) -> Result { + while self.lookahead.len() < n { + let token = match self.source.next() { + Some(res) => res?, + None => return Ok(false), + }; + self.lookahead.push_back(token); + } + Ok(true) + } + + pub fn peek_at(&mut self, idx: usize) -> Result, LexError> { + self.fill_lookahead(idx + 1)?; + Ok(self.lookahead.get(idx)) + } + + pub fn peek(&mut self) -> Result, LexError> { + self.peek_at(0) + } +} + +impl<'text> Iterator for Lexer<'text> { + type Item = Result; + + fn next(&mut self) -> Option { + match self.lookahead.pop_front() { + Some(token) => Some(Ok(token)), + None => self.source.next(), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -379,7 +448,7 @@ mod tests { #[test] fn $name() { println!("testing that we can lex the following input text:\n\t{}", $line); - let lexer = Lexer::new($line); + let lexer = Tokenizer::new($line); let tokens: Result, LexError> = lexer.collect(); match tokens { Ok(tokens) => { @@ -400,8 +469,7 @@ mod tests { #[test] fn $name() { println!("testing that we will fail to lex the following input text:\n\t{}", $line); - let lexer = Lexer::new($line); - let tokens: Result, LexError> = lexer.collect(); + let tokens = lex($line); match tokens { Ok(tokens) => { println!("output tokens: {tokens:?}"); diff --git a/src/log.rs b/src/log.rs index 5103b78..e5aa230 100644 --- a/src/log.rs +++ b/src/log.rs @@ -1,5 +1,5 @@ use crate::error::Error; -pub use log::{debug, error, info, set_logger, set_max_level, trace, warn, LevelFilter}; +pub use log::{debug, info, set_logger, set_max_level, warn, LevelFilter}; use std::{ fs::File, @@ -49,19 +49,19 @@ where match record.level() { log::Level::Error => { _ = write!(out, "\x1b[31m{}\x1b[0m\n", record.args()); - }, + } log::Level::Warn => { _ = write!(out, "\x1b[33m{}\x1b[0m\n", record.args()); - }, + } 
log::Level::Info => { _ = write!(out, "\x1b[37m{}\x1b[0m\n", record.args()); - }, + } log::Level::Debug => { _ = write!(out, "\x1b[90m{}\x1b[0m\n", record.args()); - }, + } log::Level::Trace => { _ = write!(out, "\x1b[36m{}\x1b[0m\n", record.args()); - }, + } } } } diff --git a/src/main.rs b/src/main.rs index 279f3ff..3c74a7a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod line; mod log; mod output; mod parse; +mod parse2; mod prompt; mod shell; @@ -181,3 +182,7 @@ fn main() -> Result<()> { } } } + +/* + +*/ diff --git a/src/parse.rs b/src/parse.rs index de15dc5..9554754 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -47,10 +47,6 @@ impl Node { children: Vec::new(), } } - - // pub fn visit(self) -> Tree { - // self.into() - // } } impl fmt::Debug for Node { diff --git a/src/parse2.rs b/src/parse2.rs new file mode 100644 index 0000000..98f5688 --- /dev/null +++ b/src/parse2.rs @@ -0,0 +1,278 @@ +use crate::error::ParseError; +use crate::lex::{Lexer, Token}; +use std::{ + cell::RefCell, + collections::VecDeque, + rc::{Rc, Weak}, +}; + +#[derive(PartialEq)] +pub enum Value { + /// The start symbol of our parse tree. Each parse tree is rooted in a node whose value is the + /// start symbol. This is the only node in the tree that should utilize the start symbol. + Start, + + Statement, + + Terminal(Token), +} + +impl Value { + fn is_terminal(&self) -> bool { + matches!(self, Value::Terminal(_)) + } +} + +/// A node in a parse tree. +pub struct Node { + /// A node may or may not have a parent node. If a node does not have a parent node, that node + /// is the root node of a tree. + parent: Option>, + + /// The value of the element at this node + value: Value, + + /// A node may or may not have children. Since an empty vector is a valid vector, a node + /// without children is represented as having an empty children vector. A node having an empty + /// list of children is a leaf node in a tree. 
+ children: RefCell>>, +} + +impl Node { + fn new() -> Cursor { + let root = Node { + parent: None, + value: Value::Start, + children: RefCell::new(Vec::new()), + }; + + let root = Rc::new(root); + Cursor { + target: Rc::clone(&root), + root, + } + } +} + +/// Cursor values expose access to a parse tree. +struct Cursor { + target: Rc, + root: Rc, +} + +impl Cursor { + /// Climbs one level up a parse tree. The cursor is re-pointed from its current target node to + /// the parent of its current target node. This method fails if the cursor is already at the + /// root node of the parse tree. + fn up(&mut self) -> Result<(), ParseError> { + match &self.target.parent { + None => Err(ParseError::AtRootAlready), + Some(parent) => match parent.upgrade() { + Some(parent) => { + self.target = parent; + Ok(()) + } + None => Err(ParseError::ParentIsGone), + }, + } + } + + /// Adds a value to the children of the current target node, then descends to select that + /// child. + fn push(&mut self, v: Value) -> Result<(), ParseError> { + if self.target.value.is_terminal() { + return Err(ParseError::PushOntoTerminal); + } + let node = Node { + parent: Some(Rc::downgrade(&self.target)), + value: v, + children: RefCell::new(Vec::new()), + }; + let node = Rc::new(node); + self.target + .children + .try_borrow_mut()? + .push(Rc::clone(&node)); + self.target = node; + Ok(()) + } + + fn is_root(&self) -> bool { + self.target.parent.is_none() + } + + fn into_root(self) -> Rc { + Rc::clone(&self.root) + } + + fn value(&self) -> &Value { + &self.target.value + } +} + +struct Parser<'text> { + source: Lexer<'text>, + cursor: Cursor, +} + +impl<'text> Parser<'text> { + pub fn new(source: Lexer<'text>) -> Self { + Self { + source, + cursor: Node::new(), + } + } + + pub fn parse(mut self) -> Result, ParseError> { + while self.step()? 
{} + Ok(self.cursor.into_root()) + } + + fn step(&mut self) -> Result { + match self.cursor.value() { + Value::Start => self.step_start(), + Value::Statement => self.step_statement(), + Value::Terminal(_) => panic!(), + } + } + + fn step_start(&mut self) -> Result { + assert!(matches!(self.cursor.value(), Value::Start)); + match self.source.peek()? { + Some(Token::String(_)) => { + self.cursor.push(Value::Statement)?; + let token = self.source.next().unwrap()?; + self.cursor.push(Value::Terminal(token))?; + self.cursor.up()?; + Ok(true) + } + Some(Token::Glob(_)) => { + let token = self.source.next().unwrap()?; + Err(ParseError::UnexpectedToken(token)) + } + None => Ok(false), + } + } + + fn step_statement(&mut self) -> Result { + assert!(matches!(self.cursor.value(), Value::Statement)); + match self.source.peek()? { + Some(Token::String(_) | Token::Glob(_)) => { + let token = self.source.next().unwrap()?; + self.cursor.push(Value::Terminal(token))?; + self.cursor.up()?; + Ok(true) + } + None => Ok(false), + } + } +} + +fn parse(source: &str) -> Result, ParseError> { + let tokens = Lexer::new(source); + let parser = Parser::new(tokens); + parser.parse() +} + +#[cfg(test)] +mod test { + use super::*; + use crate::lex::lex; + + #[test] + fn root() { + let mut cursor = Node::new(); + assert!(cursor.up().is_err()); + assert!(cursor.target.value == Value::Start); + assert!(cursor.is_root()); + } + + #[test] + fn single_val() { + let mut cursor = Node::new(); + + let mut tokens = lex(" ls ").unwrap(); + let ls = tokens.pop().unwrap(); + assert!(cursor.push(Value::Statement).is_ok()); + assert!(cursor.push(Value::Terminal(ls.clone())).is_ok()); + assert!(cursor.push(Value::Terminal(ls.clone())).is_err()); + assert!(cursor.target.value == Value::Terminal(ls)); + assert!(!cursor.is_root()); + assert!(cursor.up().is_ok()); + assert!(cursor.up().is_ok()); + assert!(cursor.is_root()); + assert!(cursor.up().is_err()); + assert!(cursor.value() == &Value::Start); + let root = 
cursor.into_root(); + assert!(root.value == Value::Start); + } + + #[test] + fn test_parse() -> Result<(), ParseError> { + parse("ls")?; + // parse("*")?; + // parse("x* ls")?; + Ok(()) + } +} + +/* + +> ls + + start + statement + ls + +> ls ; + + start + statement + ls + ; + +> ls ; ls + + start + statement + ls + ; + statement + ls + +> ls one two three + + start + statement + ls + one + two + three + +> ls > files.txt ; echo files.txt + + start + statement + ls + > + files.txt + ; + statement + echo + files.txt + +> if exists ~/.vimrc : echo you have a vimrc + +> if $x == 3: echo hi + + start + if + expression + $x + == + 3 + : + statement + echo + hi +*/