Why does the tokenize function parse lists incorrectly when combined with strings?
I am making a Lisp interpreter in Rust. I am currently writing a tokenize function that takes a string and turns it into a vector of tokens:
pub fn tokenize(expr: &str, loc: &str) -> Vec<Token> {
    let mut tokens = vec![]; // output tokens
    let mut row = [1, 1]; // [start row, end row] of the current token
    let mut col = [1, 1]; // [start col, end col] of the current token
    let chars: Vec<char> = expr.chars().collect();
    let mut i = 0;
    let mut last_space = 0; // index of the last whitespace seen
    while i < chars.len() {
        match &chars[i] {
            '"' => { // a quote: the start of a string
                i += 1;
                let mut end = i; // index of the closing quote
                let mut endcol = col[0]; // ending column
                let mut slashes = 0; // number of consecutive backslashes
                let startrow = row[0]; // starting row
                let mut endrow = row[0]; // ending row
                let mut content = String::new(); // the string contents
                for j in &chars[i..] { // loop through the remaining chars
                    if *j == '\\' { // a backslash:
                        slashes += 1 // increment the counter
                    } else if *j == '"' && slashes % 2 == 0 { // an unescaped quote:
                        break // stop scanning the string
                    } else if *j == '\n' { // a newline:
                        endrow += 1; // next row
                        endcol = 1; // first column
                    } else {
                        slashes = 0; // reset the backslash counter
                        content.push(*j) // push the character
                    }
                    end += 1; // advance the end index
                    endcol += 1; // advance the end column
                    println!("{}, {}", i, j)
                }
                row = [startrow, endrow]; // set the row for the next expr
                col = [i, endcol]; // set the col for the next expr
                tokens.push(Token { // add the token
                    expr: Expr::String(parse_str(&content)), // expand escape chars
                    row,
                    col,
                    file: loc.to_string()
                });
                i = end + 1; // move i past the string
            }
            '(' => {
                let mut depth = 1; // layers of nested parentheses
                let start = i + 1; // index just after the opening parenthesis
                let mut end = start; // index of the closing parenthesis
                let mut instring = false; // ignore brackets that appear inside strings
                while end < chars.len() && depth > 0 { // scan until the match or EOF
                    if chars[end] == '(' && !instring { // a ( outside a string
                        depth += 1; // one level deeper
                    } else if chars[end] == ')' && !instring { // a ) outside a string
                        depth -= 1; // one level shallower
                    } else if chars[end] == '"' {
                        instring = !instring
                    }
                    end += 1; // end is at least one char ahead
                }
                if depth != 0 { // never returned to depth 0:
                    panic!("Unmatched '('"); // there is an unmatched parenthesis
                }
                let mut inside: String = chars[start..end - 1].iter().collect(); // the contents of the list
                inside.push(' '); // trailing space so the last atom gets terminated
                println!("About to parse: {:?}", inside);
                let inner_tokens = tokenize(&inside, loc); // recursively tokenize the contents
                println!("I parsed: {:?}", &inner_tokens);
                let inner_exprs = inner_tokens.into_iter().map(|t| t.expr).collect(); // drop row, col and file data
                i = end; // make i skip the closing bracket
                tokens.push(Token { // push the list token
                    expr: Expr::list(inner_exprs),
                    row,
                    col,
                    file: loc.to_string()
                })
            }
            ')' => {
                panic!("Unmatched ')'")
            }
            ' ' | '\t' | '\n' => { // a space, tab or newline
                let expression: String;
                if last_space != 0 { // not the first delimiter: skip the delimiter itself
                    expression = chars[last_space + 1..i].iter().collect();
                } else { // the first delimiter: start from the beginning of the input
                    expression = chars[last_space..i].iter().collect();
                }
                let isquoted = expression.starts_with('\''); // a leading single quote means quoted
                let isquasi = expression.starts_with('`'); // a leading backtick means quasiquoted
                tokens.push(parse(&expression, &row, &col, loc, isquoted, isquasi)); // parse the atom
                last_space = i; // this was the last space
            }
            _ => {}
        }
        i += 1;
        /*if i < chars.len() {
            println!(">> {}", chars[i]);
        }*/
    }
    tokens
}
/* parse: parses a string into an atom token */
fn parse(s: &str, row: &[usize; 2], col: &[usize; 2], file: &str, is_quoted: bool, is_quasi: bool) -> Token {
    let expr = if let Ok(n) = s.parse::<f64>() {
        Expr::Number(n)
    } else if s == "t" {
        Expr::Bool(true)
    } else if s == "nil" {
        Expr::Bool(false)
    } else if is_quoted {
        Expr::Function(vec![Expr::Symbol(String::from("quote")), Expr::Symbol(s.to_string())])
    } else if is_quasi {
        Expr::Function(vec![Expr::Symbol(String::from("quasiquote")), Expr::Symbol(s.to_string())])
    } else {
        Expr::Symbol(s.to_lowercase())
    };
    Token {
        expr,
        row: *row,
        col: *col,
        file: file.to_string(),
    }
}
use std::cell::RefCell;
use std::env;
use std::io::{self, Write};
use std::rc::Rc;

fn main() {
    println!("Type an s-expr or use C-c to quit.");
    let args: Vec<String> = env::args().collect();
    // Tracing for debug
    let trace = args.iter().any(|arg| arg == "--debug");
    let mut starterStack = vec![ // starter stack to pass to eval
        Rc::new(RefCell::new(Scope::new()))
    ];
    starterStack[0].clone().borrow_mut().set(String::from("*cwd*"), ScopeEntry {
        val: Expr::String(env::current_dir().unwrap().display().to_string()),
        constant: false,
    });
    /*for i in it::tokenize(
        &it::clean(
            &fs::read_to_string("src/init.lisp").expect("Cannot read init.lisp")
        ), "src/init.lisp"
    ) {
        it::eval(i, &mut starterStack, trace);
    }*/
    loop {
        let mut input = String::new();
        print!("eval> ");
        io::stdout().flush().unwrap();
        std::io::stdin().read_line(&mut input).expect("readline error");
        let data = it::tokenize(&it::clean(&input), "REPL Input");
        println!("{:?}", data);
    }
}
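For reference, here is a minimal sketch of the Token and Expr shapes the code above assumes, inferred from how they are used and from the debug output below; my real definitions (along with parse_str, Scope, ScopeEntry and it::clean, which are not shown) are longer, so treat this as an approximation:

#[derive(Debug, Clone)]
enum Expr {
    Number(f64),
    Bool(bool),
    String(String),
    Symbol(String),
    Function(Vec<Expr>),
    Cons(Box<Expr>, Box<Expr>), // what Expr::list builds; nil is Bool(false)
}

#[derive(Debug)]
struct Token {
    expr: Expr,
    row: [usize; 2], // [start row, end row]
    col: [usize; 2], // [start col, end col]
    file: String,
}

impl Expr {
    // Builds a cons chain terminated by nil, matching the
    // Cons(..., Bool(false)) shapes in the REPL output below.
    fn list(items: Vec<Expr>) -> Expr {
        items.into_iter().rev().fold(Expr::Bool(false), |acc, e| {
            Expr::Cons(Box::new(e), Box::new(acc))
        })
    }
}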
However, the tokenizer cannot handle strings that contain ( or ) inside lists:
Type an s-expr or use C-c to quit.
eval> 2
[Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }]
Returned 2
eval> (1 2 3)
About to parse: "1 2 3 "
I parsed: [Token { expr: Number(1.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [1, 1], file: "REPL Input" }]
[Token { expr: Cons(Number(1.0), Cons(Number(2.0), Cons(Number(3.0), Bool(false)))), row: [1, 1], col: [1, 1], file: "REPL Input" }]
Returned (1 2 3)
eval> (1 2 3 ")" 2 3)
About to parse: "1 2 3 \")\" 2 3 "
7, )
I parsed: [Token { expr: Number(1.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: String(")"), row: [1, 1], col: [7, 2], file: "REPL Input" }, Token { expr: Symbol("\")\" 2"), row: [1, 1], col: [7, 2], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [7, 2], file: "REPL Input" }]
[Token { expr: Cons(Number(1.0), Cons(Number(2.0), Cons(Number(3.0), Cons(String(")"), Cons(Symbol("\")\" 2"), Cons(Number(3.0), Bool(false))))))), row: [1, 1], col: [1, 1], file: "REPL Input" }]
Returned (1 2 3 ) ")" 2 3)
Why is that? EDIT: the highlighting that makes it look like part of the code above is inside a string is an editor bug; the code is not actually inside a string when viewed in another text editor.
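For anyone who wants to poke at this without the rest of the interpreter, the stripped-down sketch below isolates just the whitespace splitting and the string scanning. split_atoms is a hypothetical standalone helper (tokens reduced to plain strings, escapes ignored), not part of my actual code, but it reproduces the same duplicated ")" 2 chunk:

fn split_atoms(expr: &str) -> Vec<String> {
    let chars: Vec<char> = expr.chars().collect();
    let mut out = vec![];
    let mut i = 0;
    let mut last_space = 0;
    while i < chars.len() {
        match chars[i] {
            '"' => {
                // scan to the closing quote, as the full tokenizer does
                let mut end = i + 1;
                while end < chars.len() && chars[end] != '"' {
                    end += 1;
                }
                out.push(chars[i + 1..end].iter().collect());
                // note: last_space is not updated here, and the i += 1 at the
                // bottom of the loop advances past end + 1, skipping the
                // character right after the string
                i = end + 1;
            }
            ' ' => {
                let start = if last_space != 0 { last_space + 1 } else { last_space };
                out.push(chars[start..i].iter().collect());
                last_space = i;
            }
            _ => {}
        }
        i += 1;
    }
    out
}

fn main() {
    // prints ["1", "2", "3", ")", "\")\" 2", "3"]: the "\")\" 2" atom is a
    // stale re-read of characters the string arm already passed over,
    // matching the Symbol("\")\" 2") in the full output above
    println!("{:?}", split_atoms(r#"1 2 3 ")" 2 3 "#));
}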