From 83cc98907e67db3dfbb7240bb150c7a0129835b4 Mon Sep 17 00:00:00 2001
From: benoitlx
Date: Tue, 28 Jan 2025 20:45:34 +0100
Subject: [PATCH 1/5] wip: error-handling for tokenizer

---
 Cargo.toml     |  3 ++-
 src/main.rs    | 53 +++++++++++++++++++++++++++++++++++++++++---------
 tests/test.asm | 10 ++++++++++
 3 files changed, 56 insertions(+), 10 deletions(-)
 create mode 100644 tests/test.asm
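
Notes: the TokenError/TokenizerError pair below is the usual thiserror + miette
combination: thiserror derives std::error::Error, and miette's Diagnostic derive
attaches the offending source text plus a labeled span so the "fancy" reporter
can underline it. A minimal, self-contained sketch of the same pattern (the
error name, file name, and span here are illustrative, not part of this patch):

use miette::{Diagnostic, NamedSource, Result, SourceSpan};
use thiserror::Error;

#[derive(Error, Debug, Diagnostic)]
#[error("Unrecognized token")]
#[diagnostic(code(example::bad_token), help("check the token list"))]
struct ExampleError {
    // the full input, so the reporter can print a snippet around the error
    #[source_code]
    src: NamedSource<String>,
    // (offset, length) of the offending slice
    #[label("problem here")]
    span: SourceSpan,
}

fn main() -> Result<()> {
    let source = "A <-- 1".to_string();
    // `?` converts ExampleError into a miette::Report; the fancy handler
    // then renders the snippet with the labeled span underlined.
    Err(ExampleError {
        src: NamedSource::new("example.asm", source),
        span: (2, 3).into(),
    })?;
    Ok(())
}
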
diff --git a/Cargo.toml b/Cargo.toml
index 470e8ad..4d44534 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,8 @@ edition = "2021"
 
 [dependencies]
 logos = "0.15.0"
-miette = "7.4.0"
+miette = { version = "7.4.0", features = ["fancy"] }
+thiserror = "2.0.11"
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
diff --git a/src/main.rs b/src/main.rs
index c06fec6..c574dae 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,29 +1,64 @@
 mod constants;
 mod tokenizer;
 
-use tokenizer::Token;
 use logos::Logos;
+use miette::{Diagnostic, NamedSource, SourceSpan};
+use std::env;
 use std::fs::File;
 use std::io::prelude::*;
-use std::env;
+use thiserror::Error;
+use tokenizer::Token;
+
+#[derive(Error, Debug, Diagnostic)]
+#[error("Unrecognized token")]
+#[diagnostic(code(oops), url("https://rezoleo.fr"), help("Try with A *A V *V or C for a register"))]
+pub struct TokenError {
+    #[source_code]
+    src: NamedSource<String>,
+
+    #[label("problem here")]
+    bad_bit: SourceSpan,
+}
+
+#[derive(Error, Diagnostic, Debug)]
+pub enum TokenizerError {
+    #[error(transparent)]
+    #[diagnostic(code(tokenizer::io_error))]
+    IoError(#[from] std::io::Error),
 
-fn main() -> std::io::Result<()> {
-    let args: Vec<String> = env::args().collect();
+    #[error(transparent)]
+    #[diagnostic(transparent)]
+    TokenError(#[from] TokenError),
+}
+
+use miette::Result;
+fn tokenizer_app(args: std::env::Args) -> Result<(), TokenizerError> {
+    let args: Vec<String> = args.collect();
 
-    let mut file = File::open(&args[1])?;
+    let filename: &str = &args[1];
+
+    let mut file = File::open(filename)?;
     let mut contents = String::new();
 
     file.read_to_string(&mut contents)?;
+    let mut lex = Token::lexer(contents.as_str());
 
-    let lex = Token::lexer(contents.as_str());
-
-    for result in lex {
+    while let Some(result) = lex.next() {
         match result {
             Ok(token) => println!("{:#?}", token),
-            Err(_) => panic!("Err occurred"),
+            Err(_) => Err(TokenError {
+                src: NamedSource::new(filename, contents.clone()),
+                bad_bit: lex.span().into()
+            })?
         }
     }
 
     Ok(())
 }
+
+fn main() -> Result<()> {
+    let _ = tokenizer_app(env::args())?;
+
+    Ok(())
+}
diff --git a/tests/test.asm b/tests/test.asm
new file mode 100644
index 0000000..078c405
--- /dev/null
+++ b/tests/test.asm
@@ -0,0 +1,10 @@
+A <-- 1 + C
+A <- 256
+*A <- C
+A >= 3
+A
+D
+
+thisis_a_label:
+
+0x333
\ No newline at end of file
From 4fe1fc5595c42abbff26395e19669a24b98354ea Mon Sep 17 00:00:00 2001
From: benoitlx
Date: Tue, 28 Jan 2025 21:56:23 +0100
Subject: [PATCH 2/5] refactor: error definition separated from main

---
 src/error_handling.rs | 28 ++++++++++++++++++++++++++++
 src/main.rs           | 33 ++++++---------------------------
 2 files changed, 34 insertions(+), 27 deletions(-)
 create mode 100644 src/error_handling.rs
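
Notes: the extracted module keeps a two-layer shape: leaf errors own the source
text and the span, while the top-level enum only forwards them. A minimal sketch
of how #[error(transparent)] / #[diagnostic(transparent)] forwarding behaves
(the names are illustrative, not from this patch):

use miette::Diagnostic;
use thiserror::Error;

#[derive(Error, Debug, Diagnostic)]
#[error("leaf failure")]
struct LeafError;

#[derive(Error, Debug, Diagnostic)]
enum TopError {
    // transparent: Display and Diagnostic output both come from the wrapped error
    #[error(transparent)]
    #[diagnostic(transparent)]
    Leaf(#[from] LeafError),

    // std::io::Error has no Diagnostic impl, so only Display is forwarded here
    #[error(transparent)]
    Io(#[from] std::io::Error),
}

fn main() {
    let top: TopError = LeafError.into(); // conversion generated by #[from]
    assert_eq!(top.to_string(), "leaf failure");
}
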
diff --git a/src/error_handling.rs b/src/error_handling.rs
new file mode 100644
index 0000000..042ba93
--- /dev/null
+++ b/src/error_handling.rs
@@ -0,0 +1,28 @@
+use miette::{Diagnostic, NamedSource, SourceSpan};
+use thiserror::Error;
+
+#[derive(Error, Debug, Diagnostic)]
+#[error("Unrecognized token")]
+#[diagnostic(
+    code(tokenizer::no_matching_pattern),
+    url("https://my-incredible-doc.fr"),
+    help("TODO provide the closest pattern")
+)]
+pub struct UnrecognizedToken {
+    #[source_code]
+    pub src: NamedSource<String>,
+
+    #[label("This doesn't match any Token pattern")]
+    pub src_span: SourceSpan,
+}
+
+#[derive(Error, Diagnostic, Debug)]
+pub enum TokenizerError {
+    #[error(transparent)]
+    #[diagnostic(code(tokenizer::io_error), help("try this filename:"))]
+    IoError(#[from] std::io::Error),
+
+    #[error(transparent)]
+    #[diagnostic(transparent)]
+    TokenError(#[from] UnrecognizedToken),
+}
diff --git a/src/main.rs b/src/main.rs
index c574dae..00ba6b3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,36 +1,15 @@
 mod constants;
+mod error_handling;
 mod tokenizer;
 
+use error_handling::{TokenizerError, UnrecognizedToken};
 use logos::Logos;
-use miette::{Diagnostic, NamedSource, SourceSpan};
+use miette::NamedSource;
 use std::env;
 use std::fs::File;
 use std::io::prelude::*;
-use thiserror::Error;
 use tokenizer::Token;
 
-#[derive(Error, Debug, Diagnostic)]
-#[error("Unrecognized token")]
-#[diagnostic(code(oops), url("https://rezoleo.fr"), help("Try with A *A V *V or C for a register"))]
-pub struct TokenError {
-    #[source_code]
-    src: NamedSource<String>,
-
-    #[label("problem here")]
-    bad_bit: SourceSpan,
-}
-
-#[derive(Error, Diagnostic, Debug)]
-pub enum TokenizerError {
-    #[error(transparent)]
-    #[diagnostic(code(tokenizer::io_error))]
-    IoError(#[from] std::io::Error),
-
-    #[error(transparent)]
-    #[diagnostic(transparent)]
-    TokenError(#[from] TokenError),
-}
-
 use miette::Result;
 fn tokenizer_app(args: std::env::Args) -> Result<(), TokenizerError> {
     let args: Vec<String> = args.collect();
@@ -47,10 +26,10 @@ fn tokenizer_app(args: std::env::Args) -> Result<(), TokenizerError> {
     while let Some(result) = lex.next() {
         match result {
             Ok(token) => println!("{:#?}", token),
-            Err(_) => Err(TokenError {
+            Err(_) => Err(UnrecognizedToken{
                 src: NamedSource::new(filename, contents.clone()),
-                bad_bit: lex.span().into()
-            })?
+                src_span: lex.span().into(),
+            })?,
         }
     }
 
From 356fb1ceb30bf1681f0ec1b2254e245f1d206451 Mon Sep 17 00:00:00 2001
From: benoitlx
Date: Fri, 31 Jan 2025 17:51:50 +0100
Subject: [PATCH 3/5] feat!: better error handling

- catch multiple label definitions when lexing

---
 src/constants.rs              |   1 +
 src/error_handling.rs         |  28 -----
 src/lexer/lexer.rs            | 206 ++++++++++++++++++++++++++++++++++
 src/lexer/lexer_error.rs      |  92 +++++++++++++++
 src/lexer/token_definition.rs |  89 +++++++++++++++
 src/main.rs                   |  39 +------
 src/tokenizer.rs              | 140 -----------------------
 tests/test.asm                |  15 ++-
 8 files changed, 398 insertions(+), 212 deletions(-)
 delete mode 100644 src/error_handling.rs
 create mode 100644 src/lexer/lexer.rs
 create mode 100644 src/lexer/lexer_error.rs
 create mode 100644 src/lexer/token_definition.rs
 delete mode 100644 src/tokenizer.rs
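
Notes: load values are capped because a load only has 15 payload bits, so
parse_value rejects anything above MAX_LOAD_VALUE = 2^15 on top of the u16
overflow that from_str_radix already reports as PosOverflow. A standalone
sketch of that check, with the miette error types simplified to strings:

use std::num::IntErrorKind;

const MAX_LOAD_VALUE: u16 = 2_u16.pow(15);

fn parse_value(prefix: &str, base: u32, slice: &str) -> Result<u16, String> {
    let raw_bits = slice.trim_start_matches(prefix);
    match u16::from_str_radix(raw_bits, base) {
        // fits in u16 but exceeds the 15-bit load limit
        Ok(n) if n > MAX_LOAD_VALUE => Err(format!("{n} exceeds the load limit")),
        // does not even fit in u16
        Err(e) if *e.kind() == IntErrorKind::PosOverflow => {
            Err("literal does not fit in 16 bits".into())
        }
        Ok(n) => Ok(n),
        Err(e) => Err(e.to_string()),
    }
}

fn main() {
    assert_eq!(parse_value("0x", 16, "0x5fa4"), Ok(0x5fa4));
    assert!(parse_value("", 10, "99999").is_err()); // PosOverflow
    assert!(parse_value("0x", 16, "0xffff").is_err()); // above 2^15
}
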
diff --git a/src/constants.rs b/src/constants.rs
index 96fa0e9..dc24172 100644
--- a/src/constants.rs
+++ b/src/constants.rs
@@ -1 +1,2 @@
+#[allow(dead_code)]
 pub const MAX_LOAD_VALUE: u16 = 2_u16.pow(15);
\ No newline at end of file
diff --git a/src/error_handling.rs b/src/error_handling.rs
deleted file mode 100644
index 042ba93..0000000
--- a/src/error_handling.rs
+++ /dev/null
@@ -1,28 +0,0 @@
-use miette::{Diagnostic, NamedSource, SourceSpan};
-use thiserror::Error;
-
-#[derive(Error, Debug, Diagnostic)]
-#[error("Unrecognized token")]
-#[diagnostic(
-    code(tokenizer::no_matching_pattern),
-    url("https://my-incredible-doc.fr"),
-    help("TODO provide the closest pattern")
-)]
-pub struct UnrecognizedToken {
-    #[source_code]
-    pub src: NamedSource<String>,
-
-    #[label("This doesn't match any Token pattern")]
-    pub src_span: SourceSpan,
-}
-
-#[derive(Error, Diagnostic, Debug)]
-pub enum TokenizerError {
-    #[error(transparent)]
-    #[diagnostic(code(tokenizer::io_error), help("try this filename:"))]
-    IoError(#[from] std::io::Error),
-
-    #[error(transparent)]
-    #[diagnostic(transparent)]
-    TokenError(#[from] UnrecognizedToken),
-}
diff --git a/src/lexer/lexer.rs b/src/lexer/lexer.rs
new file mode 100644
index 0000000..621c7d1
--- /dev/null
+++ b/src/lexer/lexer.rs
@@ -0,0 +1,206 @@
+#[path = "../constants.rs"]
+mod constants;
+mod lexer_error;
+
+use lexer_error::{AppError, LexingError, UnrecognizedToken};
+use logos::{Lexer, Logos};
+use miette::NamedSource;
+use std::io::Read;
+
+pub fn lex_from_file(filename: &str) -> miette::Result<(), AppError> {
+    if let Ok(mut file) = std::fs::File::open(filename) {
+        let mut content = String::new();
+
+        let _ = file.read_to_string(&mut content);
+
+        let mut lex = Token::lexer_with_extras(content.as_str(), filename.to_owned());
+
+        while let Some(result) = lex.next() {
+            match result {
+                Ok(token) => println!("{:#?}", token),
+                Err(e) => match e {
+                    LexingError::Utoken(_) => {
+                        Err(AppError::A(LexingError::Utoken(UnrecognizedToken {
+                            src: NamedSource::new(filename, content.clone()),
+                            src_span: lex.span().into(),
+                        })))?
+                    }
+                    any_error => Err(AppError::A(any_error))?,
+                },
+            }
+        }
+
+        return Ok(());
+    }
+    Err(AppError::IoError)
+}
+
+#[derive(Logos, Debug, PartialEq, Clone)]
+#[logos(skip r"[ \t\n\f]+")]
+#[logos(error = LexingError)]
+#[logos(extras = String)]
+enum Token {
+    // Operations
+    #[token("+")]
+    #[token("ADD")]
+    Add,
+
+    #[token("-")]
+    #[token("SUB")]
+    Sub,
+
+    #[token("&")]
+    #[token("AND")]
+    And,
+
+    #[token("|")]
+    #[token("OR")]
+    Or,
+
+    #[token("^")]
+    #[token("XOR")]
+    Xor,
+
+    #[token("~")]
+    #[token("NOT")]
+    Not,
+
+    #[token("<-")]
+    Assignment,
+
+    // Branch
+    #[token("JMP")]
+    Jmp,
+
+    #[token(">")]
+    Gt,
+
+    #[token("<")]
+    Lt,
+
+    #[token("==")]
+    Eq,
+
+    #[token("!=")]
+    Neq,
+
+    #[token(">=")]
+    Gtoeq,
+
+    #[token("<=")]
+    Ltoeq,
+
+    // Registers
+    #[token("A")]
+    A,
+
+    #[token("*A")]
+    StarA,
+
+    #[token("V")]
+    V,
+
+    #[token("*V")]
+    StarV,
+
+    #[token("C")]
+    C,
+
+    // Values
+    #[regex("[0-9]+", |lex| parse_value("", 10, lex))]
+    #[regex("(0x|0X){1}[a-fA-F0-9]+", |lex| parse_value("0x", 16, lex))]
+    #[regex("(0b|0B){1}(0|1)+", |lex| parse_value("0b", 2, lex))]
+    Number(u16),
+
+    // Labels
+    #[regex("[a-zA-Z_]+:", parse_label)]
+    Label(String),
+}
+
+fn parse_label(lex: &mut Lexer<Token>) -> Result<String, lexer_error::ParseLabelError> {
+    let slice = lex.slice().replace(":", "");
+
+    for maybe_token in lex.clone().spanned() {
+        match maybe_token.0 {
+            Ok(Token::Label(s)) if s == slice.clone() => Err(lexer_error::ParseLabelError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: lex.span().into(),
+                previous_label_span: maybe_token.1.into(),
+            })?,
+            Ok(_) | Err(_) => (),
+        }
+    }
+
+    Ok(slice)
+}
+
+fn parse_value(
+    prefix: &str,
+    base: u32,
+    lex: &mut Lexer<Token>,
+) -> Result<u16, lexer_error::ParseValueError> {
+    use constants::MAX_LOAD_VALUE;
+    use lexer_error::LoadValueOverflowError;
+    use lexer_error::ParseValueError;
+    use std::num::IntErrorKind::PosOverflow;
+
+    let slice = lex.slice();
+    let raw_bits = slice.trim_start_matches(prefix);
+
+    return match u16::from_str_radix(raw_bits, base) {
+        Ok(n) if n > MAX_LOAD_VALUE => {
+            Err(ParseValueError::OverflowError(LoadValueOverflowError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: lex.span().into(),
+            }))
+        }
+        Err(e) if *e.kind() == PosOverflow => {
+            println!("value should fit in 16 bits");
+            Err(ParseValueError::OverflowError(LoadValueOverflowError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: lex.span().into(),
+            }))
+        }
+        Ok(n) => Ok(n),
+        Err(e) => Err(ParseValueError::ParseIntError(e)),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_add_token() {
+        let mut lex = Token::lexer("+ ADD");
+        assert_eq!(lex.next(), Some(Ok(Token::Add)));
+        assert_eq!(lex.next(), Some(Ok(Token::Add)));
+    }
+
+    #[test]
+    fn test_labels_token() {
+        let mut lex = Token::lexer("some_label:");
+        assert_eq!(
+            lex.next(),
+            Some(Ok(Token::Label(String::from("some_label"))))
+        );
+
+        let mut lex = Token::lexer("SOME_LABEL:");
+        assert_eq!(
+            lex.next(),
+            Some(Ok(Token::Label(String::from("SOME_LABEL"))))
+        );
+    }
+
+    #[test]
+    fn test_values() {
+        let inputs = ["554", "0x5fa4", "0b1000110"];
+        let expected_numbers = [554, 0x5fa4, 0b1000110];
+
+        for (l, r) in std::iter::zip(inputs, expected_numbers) {
+            let mut lex = Token::lexer(l);
+
+            assert_eq!(lex.next(), Some(Ok(Token::Number(r))))
+        }
+    }
+}
diff --git a/src/lexer/lexer_error.rs b/src/lexer/lexer_error.rs
new file mode 100644
index 0000000..dace221
--- /dev/null
+++ b/src/lexer/lexer_error.rs
@@ -0,0 +1,92 @@
+#[path = "../constants.rs"]
"../constants.rs"] +mod constants; + +use miette::{Diagnostic, NamedSource, SourceSpan}; +use thiserror::Error; + +#[derive(Error, Diagnostic, Debug, Clone, PartialEq)] +#[error(transparent)] +#[diagnostic(transparent)] +pub enum LexingError { + Utoken(#[from] UnrecognizedToken), + + LabelError(#[from] ParseLabelError), + + ValueError(#[from] ParseValueError), +} + +impl Default for LexingError { + fn default() -> Self { + LexingError::Utoken(UnrecognizedToken { + src: NamedSource::new("", String::new()), + src_span: (0, 1).into(), + }) + } +} + +#[derive(Error, Debug, Diagnostic, Clone, PartialEq)] +#[error("Unrecognized Token")] +#[diagnostic( + code(token_definition::Token), + help("See the list of tokens in src/lexer/token_definition.rs (todo: give the closest token to the slice given)") +)] +pub struct UnrecognizedToken { + #[source_code] + pub src: NamedSource, + + #[label("This doesn't match any token")] + pub src_span: SourceSpan, +} + +#[derive(Error, Debug, Diagnostic, Clone, PartialEq)] +#[error("Multiple Definitions of the same label")] +#[diagnostic(code(lexer::parse_label))] +pub struct ParseLabelError { + #[source_code] + pub src: NamedSource, + + #[label("Can't declare this label")] + pub previous_label_span: SourceSpan, + + #[label("This label is already defined here")] + pub src_span: SourceSpan, +} + +#[derive(Error, Diagnostic, Debug, PartialEq, Clone)] +pub enum ParseValueError { + #[error(transparent)] + #[diagnostic( + code(lexer::parse_value), + help("try finding clues in std::num::IntErrorKind") + )] + ParseIntError(#[from] std::num::ParseIntError), + + #[error(transparent)] + #[diagnostic(transparent)] + OverflowError(#[from] LoadValueOverflowError), +} + +#[derive(Error, Debug, Diagnostic, Clone, PartialEq)] +#[error("Value Load Overflow")] +#[diagnostic( + code(lexer::parse_value), + help( + "- The value should be under 0x8000 in hexadecimal\n- The value should be under 32768 in decimal\n- The value should fit in 15 bits\n\nnote: future note on how to quickfix this problem" + ) +)] +pub struct LoadValueOverflowError { + #[source_code] + pub src: NamedSource, + + #[label("This value should be under {}", constants::MAX_LOAD_VALUE)] + pub src_span: SourceSpan, +} + +#[derive(Error, Debug, Diagnostic)] +pub enum AppError { + #[error(transparent)] + #[diagnostic(transparent)] + A(LexingError), + #[error("Io error")] + IoError, +} diff --git a/src/lexer/token_definition.rs b/src/lexer/token_definition.rs new file mode 100644 index 0000000..be8bf68 --- /dev/null +++ b/src/lexer/token_definition.rs @@ -0,0 +1,89 @@ +mod lexer_error; + +use logos::Logos; +use lexer_error::LexingError; + +#[derive(Logos, Debug, PartialEq, Clone)] +#[logos(skip r"[ \t\n\f]+")] +#[logos(error = LexingError)] +#[logos(extras = String)] +// #[logos(extras = (filename, contents))] +// see: https://docs.rs/logos/latest/logos/trait.Logos.html +pub enum Token { + // Operations + #[token("+")] + #[token("ADD")] + Add, + + #[token("-")] + #[token("SUB")] + Sub, + + #[token("&")] + #[token("AND")] + And, + + #[token("|")] + #[token("OR")] + Or, + + #[token("^")] + #[token("XOR")] + Xor, + + #[token("~")] + #[token("NOT")] + Not, + + #[token("<-")] + Assignment, + + // Branch + #[token("JMP")] + Jmp, + + #[token(">")] + Gt, + + #[token("<")] + Lt, + + #[token("==")] + Eq, + + #[token("!=")] + Neq, + + #[token(">=")] + Gtoeq, + + #[token("<=")] + Ltoeq, + + // Registers + #[token("A")] + A, + + #[token("*A")] + StarA, + + #[token("V")] + V, + + #[token("*V")] + StarV, + + #[token("C")] + C, + + // 
+    #[regex("[0-9]+", |lex| parse_value("", 10, lex))]
+    #[regex("(0x|0X){1}[a-fA-F0-9]+", |lex| parse_value("0x", 16, lex))]
+    #[regex("(0b|0B){1}(0|1)+", |lex| parse_value("0b", 2, lex))]
+    Number(u16),
+
+    // Labels
+    // #[regex("[a-zA-Z_]+:", |lex| lex.slice().replace(":", ""))]
+    #[regex("[a-zA-Z_]+:", parse_label)]
+    Label(String),
+}
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index 00ba6b3..573000f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,43 +1,10 @@
 mod constants;
-mod error_handling;
-mod tokenizer;
-
-use error_handling::{TokenizerError, UnrecognizedToken};
-use logos::Logos;
-use miette::NamedSource;
-use std::env;
-use std::fs::File;
-use std::io::prelude::*;
-use tokenizer::Token;
+#[path = "lexer/lexer.rs"]
+mod lexer;
 
 use miette::Result;
-fn tokenizer_app(args: std::env::Args) -> Result<(), TokenizerError> {
-    let args: Vec<String> = args.collect();
-
-    let filename: &str = &args[1];
-
-    let mut file = File::open(filename)?;
-    let mut contents = String::new();
-
-    file.read_to_string(&mut contents)?;
-
-    let mut lex = Token::lexer(contents.as_str());
-
-    while let Some(result) = lex.next() {
-        match result {
-            Ok(token) => println!("{:#?}", token),
-            Err(_) => Err(UnrecognizedToken{
-                src: NamedSource::new(filename, contents.clone()),
-                src_span: lex.span().into(),
-            })?,
-        }
-    }
-
-    Ok(())
-}
-
 fn main() -> Result<()> {
-    let _ = tokenizer_app(env::args())?;
+    let _ = lexer::lex_from_file("tests/test.asm")?;
 
     Ok(())
 }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
deleted file mode 100644
index 7a2a7d5..0000000
--- a/src/tokenizer.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-use crate::constants::MAX_LOAD_VALUE;
-use logos::{Lexer, Logos};
-
-fn parse_values(prefix: &str, base: u32, lex: &mut Lexer<Token>) -> Option<u16> {
-    let slice = lex.slice();
-    let raw_bits = slice.trim_start_matches(prefix);
-    let n: u16 = u16::from_str_radix(raw_bits, base).ok()?;
-    assert!(
-        n <= MAX_LOAD_VALUE,
-        "Can't load data exceeding {} from ram",
-        MAX_LOAD_VALUE
-    );
-    Some(n)
-}
-
-#[derive(Logos, Debug, PartialEq)]
-#[logos(skip r"[ \t\n\f]+")]
-pub enum Token {
-    // Operations
-    #[token("+")]
-    #[token("ADD")]
-    Add,
-
-    #[token("-")]
-    #[token("SUB")]
-    Sub,
-
-    #[token("&")]
-    #[token("AND")]
-    And,
-
-    #[token("|")]
-    #[token("OR")]
-    Or,
-
-    #[token("^")]
-    #[token("XOR")]
-    Xor,
-
-    #[token("~")]
-    #[token("NOT")]
-    Not,
-
-    #[token("<-")]
-    Assignment,
-
-    // Branch
-    #[token("JMP")]
-    Jmp,
-
-    #[token(">")]
-    Gt,
-
-    #[token("<")]
-    Lt,
-
-    #[token("==")]
-    Eq,
-
-    #[token("!=")]
-    Neq,
-
-    #[token(">=")]
-    Gtoeq,
-
-    #[token("<=")]
-    Ltoeq,
-
-    // Registers
-    #[token("A")]
-    A,
-
-    #[token("*A")]
-    StarA,
-
-    #[token("V")]
-    V,
-
-    #[token("*V")]
-    StarV,
-
-    #[token("C")]
-    C,
-
-    // Values
-    #[regex("[0-9]+", |lex| parse_values("", 10, lex))]
-    #[regex("(0x|0X){1}[a-fA-F0-9]+", |lex| parse_values("0x", 16, lex))]
-    #[regex("(0b|0B){1}(0|1)+", |lex| parse_values("0b", 2, lex))]
-    Number(u16),
-
-    // Labels
-    #[regex("[a-zA-Z_]+:", |lex| lex.slice().replace(":", ""))]
-    Label(String),
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_add_token() {
-        let mut lex = Token::lexer("+ ADD");
-        assert_eq!(lex.next(), Some(Ok(Token::Add)));
-        assert_eq!(lex.next(), Some(Ok(Token::Add)));
-    }
-
-    #[test]
-    fn test_labels_token() {
-        let mut lex = Token::lexer("some_label:");
-        assert_eq!(
-            lex.next(),
-            Some(Ok(Token::Label(String::from("some_label"))))
-        );
-
-        let mut lex = Token::lexer("SOME_LABEL:");
-        assert_eq!(
-            lex.next(),
-            Some(Ok(Token::Label(String::from("SOME_LABEL"))))
-        );
-    }
-
-    #[test]
-    fn test_not_a_label() {
-        let mut lex = Token::lexer("Centrale Lille");
-
-        assert_eq!(lex.next(), Some(Err(())));
-    }
-
-    #[test]
-    fn test_values() {
-        let inputs = ["554", "0x5fa4", "0b1000110"];
-        let expected_numbers = [554, 0x5fa4, 0b1000110];
-
-        for (l, r) in std::iter::zip(inputs, expected_numbers) {
-            let mut lex = Token::lexer(l);
-
-            assert_eq!(lex.next(), Some(Ok(Token::Number(r))))
-        }
-    }
-}
diff --git a/tests/test.asm b/tests/test.asm
index 078c405..7640c86 100644
--- a/tests/test.asm
+++ b/tests/test.asm
@@ -1,10 +1,9 @@
-A <-- 1 + C
-A <- 256
-*A <- C
-A >= 3
-A
-D
+0x8000
+A <- A majhkdf & 3
+V <- V + 1
 
-thisis_a_label:
-
-0x333
\ No newline at end of file
+main:
+0x033
+
+main:
\ No newline at end of file
From a1f61555da8997525a3901929934d4a47233ab77 Mon Sep 17 00:00:00 2001
From: benoitlx
Date: Sat, 1 Feb 2025 21:54:52 +0100
Subject: [PATCH 4/5] feat: support define and single line comments

---
 Cargo.toml               |   1 +
 README.md                |   8 ++
 src/constants.rs         |   2 +-
 src/lexer/lexer.rs       | 242 +++++++++++++++++++++++++++++++++++++--
 src/lexer/lexer_error.rs |  83 ++++++++++++--
 tests/test.asm           |  32 +++++-
 6 files changed, 340 insertions(+), 28 deletions(-)
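
Notes: DEFINE is matched as a single regex covering the whole line, so
parse_define re-splits the slice itself: operand one must match ^[a-zA-Z_]+$,
operand two is a decimal/hex/binary literal, and a missing or extra operand is
its own error. A standalone sketch of that operand handling (the duplicate-name
scan and the miette error types are simplified out, errors are plain strings):

fn parse_define(slice: &str) -> Result<(String, u16), String> {
    let mut parts = slice.trim_end().split_whitespace();
    let _keyword = parts.next(); // always "DEFINE", guaranteed by the regex

    let name = parts.next().ok_or("too few operands")?;
    if !name.chars().all(|c| c.is_ascii_alphabetic() || c == '_') {
        return Err("name should be of the form [a-zA-Z_]+".into());
    }

    let value = parts.next().ok_or("too few operands")?;
    // detect the base from an optional 0x/0b prefix, default to decimal
    let (raw, base) = match value.get(0..2) {
        Some("0x") => (&value[2..], 16),
        Some("0b") => (&value[2..], 2),
        _ => (value, 10),
    };
    let n = u16::from_str_radix(raw, base).map_err(|e| e.to_string())?;

    if parts.next().is_some() {
        return Err("too many operands".into());
    }
    Ok((name.to_owned(), n))
}

fn main() {
    assert_eq!(parse_define("DEFINE toto 0b11"), Ok(("toto".into(), 3)));
    assert!(parse_define("DEFINE t/est 0x0").is_err());
    assert!(parse_define("DEFINE many 0x0 argument").is_err());
}
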
diff --git a/Cargo.toml b/Cargo.toml
index 4d44534..0f8d0fc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
 [dependencies]
 logos = "0.15.0"
 miette = { version = "7.4.0", features = ["fancy"] }
+regex = "1.11.1"
 thiserror = "2.0.11"
 
 [dev-dependencies]
diff --git a/README.md b/README.md
index 3329925..102a995 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # simple-assembler
+
 Simple assembler for my custom cpu
+
+## TODO
+
+- [ ] improve test coverage
+- [ ] export graph from the lexer
+- [ ] refactor lexer_error (might introduce breaking changes as error names can change)
+- [ ] refactor parsing code
\ No newline at end of file
diff --git a/src/constants.rs b/src/constants.rs
index dc24172..b4adc80 100644
--- a/src/constants.rs
+++ b/src/constants.rs
@@ -1,2 +1,2 @@
 #[allow(dead_code)]
-pub const MAX_LOAD_VALUE: u16 = 2_u16.pow(15);
\ No newline at end of file
+pub const MAX_LOAD_VALUE: u16 = 2_u16.pow(15) - 1;
\ No newline at end of file
diff --git a/src/lexer/lexer.rs b/src/lexer/lexer.rs
index 621c7d1..4c9b270 100644
--- a/src/lexer/lexer.rs
+++ b/src/lexer/lexer.rs
@@ -115,18 +115,55 @@ enum Token {
     // Labels
     #[regex("[a-zA-Z_]+:", parse_label)]
     Label(String),
+
+    // Define
+    // #[regex("DEFINE [a-zA-Z_]+ [0-9]+", |lex| parse_define("", 10, lex))]
+    // #[regex("DEFINE [a-zA-Z_]+ (0x|0X){1}[a-fA-F0-9]+", |lex| parse_define("0x", 16, lex))]
+    // #[regex("DEFINE [a-zA-Z_]+ (0b|0B){1}(0|1)+", |lex| parse_define("0b", 2, lex))]
+    // #[regex(r"DEFINE\s*[^\s]*", define_too_few_arguments)]
+    #[regex(r"DEFINE [^[\n(//)]]*", parse_define)] // tofix: wrong error with "DEFINE t/est 0x0"
+    Define((String, u16)),
+
+    // Comments
+    // #[regex(r"\s*/*.**/")] // multiline comments
+    #[regex(r"\s*//.*")]
+    Comments,
 }
 
 fn parse_label(lex: &mut Lexer<Token>) -> Result<String, lexer_error::ParseLabelError> {
+    // check for regex [a-zA-Z_]+
+    // if it fails => NameError
+
     let slice = lex.slice().replace(":", "");
 
-    for maybe_token in lex.clone().spanned() {
+    parse_text_raw(
+        slice,
+        lex.span(),
+        lex.extras.clone(),
+        lex.source(),
+        lex.clone().spanned(),
+    )
+}
+
+fn parse_text_raw(
+    slice: String,
+    span: logos::Span,
+    f: String,
+    source: &str,
+    spanned: logos::SpannedIter<'_, Token>,
+) -> Result<String, lexer_error::ParseLabelError> {
+    for maybe_token in spanned {
         match maybe_token.0 {
             Ok(Token::Label(s)) if s == slice.clone() => Err(lexer_error::ParseLabelError {
-                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
-                src_span: lex.span().into(),
+                src: NamedSource::new(&f, source.to_owned()),
+                src_span: span.clone().into(),
                 previous_label_span: maybe_token.1.into(),
             })?,
+            Ok(Token::Define((s, _))) if s == slice.clone() => Err(lexer_error::ParseLabelError {
+                src: NamedSource::new(&f, source.to_owned()),
+                src_span: span.clone().into(),
+                previous_label_span: (maybe_token.1.start + 7, s.len()).into(),
+            })?,
             Ok(_) | Err(_) => (),
         }
     }
@@ -138,36 +175,151 @@ fn parse_value(
     prefix: &str,
     base: u32,
     lex: &mut Lexer<Token>,
+) -> Result<u16, lexer_error::ParseValueError> {
+    let slice = lex.slice();
+
+    parse_value_raw(
+        prefix,
+        base,
+        slice,
+        lex.span(),
+        lex.extras.clone(),
+        lex.source(),
+    )
+}
+
+fn parse_value_raw(
+    prefix: &str,
+    base: u32,
+    slice: &str,
+    span: logos::Span,
+    f: String,
+    s: &str,
 ) -> Result<u16, lexer_error::ParseValueError> {
     use constants::MAX_LOAD_VALUE;
+    use lexer_error::InvalidDigitError;
     use lexer_error::LoadValueOverflowError;
     use lexer_error::ParseValueError;
+    use std::num::IntErrorKind::InvalidDigit;
     use std::num::IntErrorKind::PosOverflow;
 
-    let slice = lex.slice();
     let raw_bits = slice.trim_start_matches(prefix);
 
     return match u16::from_str_radix(raw_bits, base) {
         Ok(n) if n > MAX_LOAD_VALUE => {
             Err(ParseValueError::OverflowError(LoadValueOverflowError {
-                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
-                src_span: lex.span().into(),
+                src: NamedSource::new(f, s.to_owned()),
+                src_span: span.into(),
             }))
         }
         Err(e) if *e.kind() == PosOverflow => {
-            println!("value should fit in 16 bits");
             Err(ParseValueError::OverflowError(LoadValueOverflowError {
-                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
-                src_span: lex.span().into(),
+                src: NamedSource::new(f, s.to_owned()),
+                src_span: span.into(),
+            }))
+        }
+        Err(e) if *e.kind() == InvalidDigit => {
+            Err(ParseValueError::WrongDigitError(InvalidDigitError {
+                src: NamedSource::new(f, s.to_owned()),
+                src_span: span.into(),
             }))
         }
         Ok(n) => Ok(n),
         Err(e) => Err(ParseValueError::ParseIntError(e)),
-    }
+    };
+}
+
+fn parse_define(lex: &mut Lexer<Token>) -> Result<(String, u16), lexer_error::ParseDefineError> {
+    use lexer_error::DefineFewOperandError;
+    use lexer_error::DefineManyOperandError;
+    use lexer_error::NameError;
+    use lexer_error::ParseDefineError;
+    use regex::Regex;
+
+    let mut slices = lex.slice().trim_end().split_whitespace();
+    let mut result: (String, u16) = (String::from(""), 0);
+
+    let _ = slices.next();
+
+    let mut arg_number = 0;
+    if let Some(label) = slices.next() {
+        arg_number += 1;
+
+        let label_length = label.len();
+        let span_start = lex.span().start + 7;
+        let span_range = span_start..(span_start + label_length);
+
+        let re = Regex::new(r"^[a-zA-Z_]+$").unwrap();
+
+        if !re.is_match(label) {
+            return Err(ParseDefineError::InvalidName(NameError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: span_range.into(),
+            }));
+        }
+
+        result.0 = parse_text_raw(
+            label.to_owned(),
+            span_range,
+            lex.extras.clone(),
+            lex.source(),
+            lex.clone().spanned(),
+        )?;
+
+        if let Some(value) = slices.next() {
+            let mut prefix = "";
+            let mut base = 10;
+
+            if value.len() >= 2 && &value[0..2] == "0x" {
+                prefix = "0x";
+                base = 16;
+            }
+            if value.len() >= 2 && &value[0..2] == "0b" {
+                prefix = "0b";
+                base = 2;
+            }
+
+            arg_number += 1;
+
+            let value_length = value.len();
+
+            result.1 = parse_value_raw(
+                prefix,
+                base,
+                value,
+                (lex.span().start + 8 + label_length)
+                    ..(lex.span().start + 8 + label_length + value_length),
+                lex.extras.clone(),
+                lex.source(),
+            )?;
+        }
+    }
+
+    if slices.next() != None {
+        return Err(ParseDefineError::TooManyOperandError(
+            DefineManyOperandError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: lex.span().into(),
+            },
+        ));
+    }
+
+    if arg_number != 2 {
+        return Err(ParseDefineError::TooFewOperandError(
+            DefineFewOperandError {
+                src: NamedSource::new(lex.extras.clone(), lex.source().to_owned()),
+                src_span: lex.span().into(),
+            },
+        ));
    }
+
+    Ok(result)
+}
 
 #[cfg(test)]
 mod tests {
+    use crate::lexer::lexer_error::ParseDefineError;
+
     use super::*;
 
     #[test]
     fn test_add_token() {
         let mut lex = Token::lexer("+ ADD");
         assert_eq!(lex.next(), Some(Ok(Token::Add)));
         assert_eq!(lex.next(), Some(Ok(Token::Add)));
     }
 
     #[test]
-    fn test_labels_token() {
+    fn test_labels() {
+        use LexingError::LabelError;
+
         let mut lex = Token::lexer("some_label:");
         assert_eq!(
             lex.next(),
             Some(Ok(Token::Label(String::from("some_label"))))
         );
 
         let mut lex = Token::lexer("SOME_LABEL:");
         assert_eq!(
             lex.next(),
             Some(Ok(Token::Label(String::from("SOME_LABEL"))))
         );
+
+        // wrong syntax
+
+        // Multiple use of the same name
+        let mut lex = Token::lexer("test:\nDEFINE test 0");
+        assert!(matches!(lex.next(), Some(Err(LabelError(_)))));
+
+        let mut lex = Token::lexer("test:\ntest:");
+        assert!(matches!(lex.next(), Some(Err(LabelError(_)))));
+
+        // wrong label name
+        /* let mut lex = Token::lexer("te/st:");
+        assert!(matches!(lex.next(), Some(Err(LabelError(_))))); // to fix: return Err(LexingError::Utoken) */
     }
 
     #[test]
     fn test_values() {
         let inputs = ["554", "0x5fa4", "0b1000110"];
         let expected_numbers = [554, 0x5fa4, 0b1000110];
 
         for (l, r) in std::iter::zip(inputs, expected_numbers) {
             let mut lex = Token::lexer(l);
 
             assert_eq!(lex.next(), Some(Ok(Token::Number(r))))
         }
     }
+
+    #[test]
+    fn test_defines() {
+        use LexingError::DefineError;
+        use Token::Define;
+        use ParseDefineError::LabelError;
+        use ParseDefineError::ValueError;
+
+        // good syntax
+        let mut lex = Token::lexer("DEFINE test 0");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 0)))));
+
+        let mut lex = Token::lexer("DEFINE test 1");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 1)))));
+
+        let mut lex = Token::lexer("DEFINE test 32767");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 32767)))));
+
+        let mut lex = Token::lexer("DEFINE test 0x0");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 0)))));
+
+        let mut lex = Token::lexer("DEFINE test 0xff");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 255)))));
+
+        let mut lex = Token::lexer("DEFINE test 0x7fff");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 32767)))));
+
+        let mut lex = Token::lexer("DEFINE test 0b0");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("test"), 0)))));
+
+        let mut lex = Token::lexer("DEFINE TOTO 0b11");
+        assert_eq!(lex.next(), Some(Ok(Define((String::from("TOTO"), 3)))));
+
+        let mut lex = Token::lexer("DEFINE titi_test 0b111111111111111");
+        assert_eq!(
+            lex.next(),
+            Some(Ok(Define((String::from("titi_test"), 32767))))
+        );
+
+        // wrong syntax
+
+        // Multiple use of the same name
+        let mut lex = Token::lexer("DEFINE test 0\ntest:");
+        assert!(matches!(lex.next(), Some(Err(DefineError(LabelError(_))))));
+
+        let mut lex = Token::lexer("DEFINE test 0\nDEFINE test 0");
+        assert!(matches!(lex.next(), Some(Err(DefineError(LabelError(_))))));
+
+        // Value Error
+        let mut lex = Token::lexer("DEFINE test 0feaj138");
+        assert!(matches!(lex.next(), Some(Err(DefineError(ValueError(_))))));
+
+        let mut lex = Token::lexer("DEFINE test 0x8000"); // load value overflow
+        assert!(matches!(lex.next(), Some(Err(DefineError(ValueError(_))))));
+    }
 }
diff --git a/src/lexer/lexer_error.rs b/src/lexer/lexer_error.rs
index dace221..3e5dd37 100644
--- a/src/lexer/lexer_error.rs
+++ b/src/lexer/lexer_error.rs
@@ -13,6 +13,8 @@ pub enum LexingError {
     LabelError(#[from] ParseLabelError),
 
     ValueError(#[from] ParseValueError),
+
+    DefineError(#[from] ParseDefineError),
 }
 
 impl Default for LexingError {
@@ -24,6 +26,51 @@ impl Default for LexingError {
     }
 }
 
+#[derive(Error, Diagnostic, Debug, Clone, PartialEq)]
+#[error(transparent)]
+#[diagnostic(transparent)]
+pub enum ParseDefineError {
+    LabelError(#[from] ParseLabelError),
+
+    ValueError(#[from] ParseValueError),
+
+    TooFewOperandError(#[from] DefineFewOperandError),
+
+    TooManyOperandError(#[from] DefineManyOperandError),
+
+    InvalidName(#[from] NameError),
+}
+
+#[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
+#[error("Name should be of the form [a-zA-Z_]+")]
+pub struct NameError {
+    #[source_code]
+    pub src: NamedSource<String>,
+
+    #[label("here")]
+    pub src_span: SourceSpan,
+}
+
+#[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
+#[error("Too few operands after DEFINE")]
+pub struct DefineFewOperandError {
+    #[source_code]
+    pub src: NamedSource<String>,
+
+    #[label("missing operand for define")]
+    pub src_span: SourceSpan,
+}
+
+#[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
+#[error("Too many operands after DEFINE")]
+pub struct DefineManyOperandError {
+    #[source_code]
+    pub src: NamedSource<String>,
+
+    #[label("additional operand here")]
+    pub src_span: SourceSpan,
+}
+
 #[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
 #[error("Unrecognized Token")]
 #[diagnostic(
@@ -39,26 +86,26 @@ pub struct UnrecognizedToken {
 }
 
 #[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
-#[error("Multiple Definitions of the same label")]
-#[diagnostic(code(lexer::parse_label))]
+#[error("Multiple Definitions/Labels with the same name")]
+#[diagnostic(code(lexer::parse_text_raw))]
 pub struct ParseLabelError {
     #[source_code]
     pub src: NamedSource<String>,
 
-    #[label("Can't declare this label")]
-    pub previous_label_span: SourceSpan,
-
-    #[label("This label is already defined here")]
+    #[label("Can't use this name")]
     pub src_span: SourceSpan,
+
+    #[label("the name is already declared here")]
+    pub previous_label_span: SourceSpan,
 }
 
 #[derive(Error, Diagnostic, Debug, PartialEq, Clone)]
 pub enum ParseValueError {
     #[error(transparent)]
-    #[diagnostic(
-        code(lexer::parse_value),
-        help("try finding clues in std::num::IntErrorKind")
-    )]
+    #[diagnostic(transparent)]
+    WrongDigitError(#[from] InvalidDigitError),
+
+    #[error(transparent)]
     ParseIntError(#[from] std::num::ParseIntError),
 
     #[error(transparent)]
@@ -66,6 +113,20 @@ pub enum ParseValueError {
     OverflowError(#[from] LoadValueOverflowError),
 }
 
+#[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
+#[error("Invalid digit found in string")]
+#[diagnostic(
+    code(lexer::parse_value),
+    help("Verify the base prefix and the digits")
+)]
+pub struct InvalidDigitError {
+    #[source_code]
+    pub src: NamedSource<String>,
+
+    #[label("Invalid digit here")]
+    pub src_span: SourceSpan,
+}
+
 #[derive(Error, Debug, Diagnostic, Clone, PartialEq)]
 #[error("Value Load Overflow")]
 #[diagnostic(
@@ -86,7 +147,7 @@ pub struct LoadValueOverflowError {
 pub enum AppError {
     #[error(transparent)]
     #[diagnostic(transparent)]
-    A(LexingError),
+    A(LexingError), // todo: change this name
     #[error("Io error")]
     IoError,
 }
diff --git a/tests/test.asm b/tests/test.asm
index 7640c86..23cfdc0 100644
--- a/tests/test.asm
+++ b/tests/test.asm
@@ -1,9 +1,29 @@
-0x8000
-A <- A majhkdf & 3
-V <- V + 1
+// testing decimal numbers
+DEFINE titi 0
+DEFINE tito 1
+DEFINE toto 32767
+// DEFINE tata 32768 // error here
+// DEFINE big_number 99999999999999 // error here
+
+// testing hexadecimal numbers
+DEFINE a 0xff
+DEFINE b 0x0
+DEFINE c 0x7fff
+// DEFINE d 0x8001 // error here
 
-main:
-0x033
+// testing binary numbers
+DEFINE d 0b0
+DEFINE e 0b11001
+DEFINE f 0b111111111111111
+// DEFINE g 0b1111111111111111 // error here
 
-main:
\ No newline at end of file
+// error below
+// DEFINE foo 0x0
+// DEFINE foo 0x0
+
+DEFINE test // error here
+// DEFINE many 0x0 argument // error here
+
+// DEFINE test test // error here
+// DEFINE t*est 0x0 // error here
+// DEFINE t/est 0x0 // error here
From 42d04ee9003d560983ed9873b93892b085e28dd5 Mon Sep 17 00:00:00 2001
From: benoitlx
Date: Sun, 2 Feb 2025 14:02:52 +0100
Subject: [PATCH 5/5] chore: not pursuing this branch

---
 .gitignore                 | 2 ++
 tests/test.asm             | 4 +++-
 tests/two_labels_error.asm | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 tests/two_labels_error.asm

diff --git a/.gitignore b/.gitignore
index efe3eb1..f594f86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,5 @@ Cargo.lock
 
 # Added by cargo
 /target
+
+.vscode/
diff --git a/tests/test.asm b/tests/test.asm
index 23cfdc0..5ca6970 100644
--- a/tests/test.asm
+++ b/tests/test.asm
@@ -21,9 +21,11 @@ DEFINE f 0b111111111111111
 // DEFINE foo 0x0
 // DEFINE foo 0x0
 
-DEFINE test // error here
+// DEFINE test // error here
 // DEFINE many 0x0 argument // error here
 
 // DEFINE test test // error here
 // DEFINE t*est 0x0 // error here
 // DEFINE t/est 0x0 // error here
+
+A:
diff --git a/tests/two_labels_error.asm b/tests/two_labels_error.asm
new file mode 100644
index 0000000..90b4924
--- /dev/null
+++ b/tests/two_labels_error.asm
@@ -0,0 +1,2 @@
+A_aaAZ:
+A_aaAZ:
\ No newline at end of file