Why does the tokenize function parse lists incorrectly when combined with strings?

⚓ Rust    📅 2025-09-10    👤 surdeus    👁️ 11      

I am making a Lisp interpreter in Rust. I am currently writing a tokenize function that takes a string and turns it into a vector of tokens:

pub fn tokenize(expr: &str, loc: &str) -> Vec<Token> {
    let mut tokens = vec![]; // collected tokens
    let mut row = [1, 1]; // start and end row
    let mut col = [1, 1]; // start and end column
    let chars: Vec<char> = expr.chars().collect();
    let mut i = 0;
    let mut last_space = 0; // index of the last whitespace seen
    while i < chars.len() {
	match &chars[i] {
	    '"' => { // a quote: the start of a string
		i += 1;
		let mut end = i; // index of the closing quote
		let mut endcol = col[0]; // ending column
		let mut slashes = 0; // number of consecutive backslashes
		let startrow = row[0]; // starting row
		let mut endrow = row[0]; // ending row
		let mut content = String::new(); // the string contents
		for j in &chars[i..] { // scan forward through the chars
		    if *j == '\\' { // a backslash
			slashes += 1 // increment the counter
		    } else if *j == '"' && slashes % 2 == 0 { // an unescaped quote
			break // stop scanning the string
		    } else if *j == '\n' { // a newline
			endrow += 1; // new row
			endcol = 1; // first column
		    } else {
			slashes = 0; // reset the slash counter
			content.push(*j) // keep the character
		    }
		    end += 1; // end must be later
		    endcol += 1; // new column
		    println!("{}, {}", i, j)
		}
		row = [startrow, endrow]; // set the row for the next expr
		col = [i, endcol]; // set the col for the next expr
		tokens.push(Token { // add the token
		    expr: Expr::String(parse_str(&content)), // resolve escape sequences
		    row,
		    col,
		    file: loc.to_string()
		});
		i = end + 1; // move i past the closing quote
	    }
	    '(' => {
		let mut depth = 1; // layers of nested parentheses
		let start = i + 1; // index just past the opening parenthesis
		let mut end = start; // scan position; stops one past the closing parenthesis
		let mut instring = false; // don't count brackets that appear inside a string
		while end < chars.len() && depth > 0 { // while not at EOF
		    if chars[end] == '(' && !instring { // if it is a (
			depth += 1; // one level deeper
		    } else if chars[end] == ')' && !instring { // if it is a )
			depth -= 1; // one level shallower
		    } else if chars[end] == '"' {
			instring = !instring
		    }
		    end += 1; // end is at least 1 char ahead
		}
		if depth != 0 { // a parenthesis was never closed
		    panic!("Unmatched '('");
		}
		let mut inside: String = chars[start..end - 1].iter().collect(); // contents between the parentheses
		inside.push(' '); // trailing space so the last atom is terminated
		println!("About to parse: {:?}", inside);
		let inner_tokens = tokenize(&inside, loc); // recursively tokenize the inner contents
		println!("I parsed: {:?}", &inner_tokens);
		let inner_exprs = inner_tokens.into_iter().map(|t| t.expr).collect(); // drop row, col and file data
		i = end; // make i skip the closing bracket
		tokens.push(Token { // push the list token
		    expr: Expr::list(inner_exprs),
		    row,
		    col,
		    file: loc.to_string()
		})
	    }
	    ')' => {
		panic!("Unmatched ')'")
	    }
	    ' ' | '\t' | '\n' => { // whitespace terminates an atom
		let expression: String; // the atom text
		if last_space != 0 { // not the first atom: slice from just after the previous space
		    expression = chars[last_space + 1..i].iter().collect();
		} else { // first atom: slice from the start of the expression
		    expression = chars[last_space..i].iter().collect();
		}
		let isquoted = expression.starts_with('\''); // a leading single quote means quoted
		let isquasi = expression.starts_with('`'); // a leading backtick means quasiquoted
		tokens.push(parse(&expression, &row, &col, loc, isquoted, isquasi)); // parse the atom
		last_space = i; // this was the last space
	    }
	    _ => {}
	}
	i += 1;
	/*if i < chars.len() {
	    println!(">> {}", chars[i]);
	}*/
    }
    tokens
}

/* parse: parses a string into an atom token */
fn parse(s: &str, row: &[usize; 2], col: &[usize; 2], file: &str, is_quoted: bool, is_quasi: bool) -> Token {
    let expr = if let Ok(n) = s.parse::<f64>() {
	Expr::Number(n)
    } else if s == "t" {
	Expr::Bool(true)
    } else if s == "nil" {
	Expr::Bool(false)
    } else if is_quoted {
	Expr::Function(vec![Expr::Symbol(String::from("quote")), Expr::Symbol(s.to_string())])
    } else if is_quasi {
	Expr::Function(vec![Expr::Symbol(String::from("quasiquote")), Expr::Symbol(s.to_string())])
    } else {
	Expr::Symbol(s.to_lowercase())
    };

    Token {
	expr,
	row: *row,
	col: *col,
	file: file.to_string(),
    }
}

use std::cell::RefCell;
use std::env;
use std::io::{self, Write};
use std::rc::Rc;
// plus the crate's own items: it, Scope, ScopeEntry, Expr (paths omitted in the post)

fn main() {
    println!("Type an s-expr or use C-c to quit.");
    let args: Vec<String> = env::args().collect();
    // Tracing for debug
    let trace = args.iter().any(|arg| arg == "--debug");
    let mut starterStack = vec![ // starter scope stack to pass to eval
	Rc::new(RefCell::new(Scope::new()))
    ];
    starterStack[0].clone().borrow_mut().set(String::from("*cwd*"), ScopeEntry {
	val: Expr::String(env::current_dir().unwrap().display().to_string()),
	constant: false,
    });
    /*for i in it::tokenize(
	    &it::clean(
		&fs::read_to_string("src/init.lisp").expect("Cannot read init.lisp")
	    ), "src/init.lisp"
	) {
	it::eval(i, &mut starterStack, trace);
    }*/

    loop {
	let mut input = String::new();
	print!("eval> ");
	io::stdout().flush().unwrap();
	std::io::stdin().read_line(&mut input).expect("readline error");
	let data = it::tokenize(&it::clean(&input), "REPL Input");
	println!("{:?}", data);
    }
}
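
For reference, here is a minimal sketch of the Token and Expr definitions the snippets appear to assume. The post does not include them, so these are hypothetical reconstructions; the Cons/Bool(false) list encoding and the Expr::list helper are inferred from the debug output shown below:

// Hypothetical reconstruction; the original post does not show these types.
#[derive(Debug, Clone)]
pub enum Expr {
    Number(f64),
    Bool(bool),
    String(String),
    Symbol(String),
    Function(Vec<Expr>),
    Cons(Box<Expr>, Box<Expr>),
}

impl Expr {
    // Expr::list presumably folds a vector into a cons chain terminated by
    // nil, here represented as Bool(false), matching the printed tokens.
    pub fn list(items: Vec<Expr>) -> Expr {
	items.into_iter().rev().fold(Expr::Bool(false), |acc, e| {
	    Expr::Cons(Box::new(e), Box::new(acc))
	})
    }
}

#[derive(Debug)]
pub struct Token {
    pub expr: Expr,
    pub row: [usize; 2],
    pub col: [usize; 2],
    pub file: String,
}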

However, the tokenizer cannot handle strings containing ( or ) inside lists properly:

Type an s-expr or use C-c to quit.
eval> 2
[Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }]

Returned 2
eval> (1 2 3)
About to parse: "1 2 3 "
I parsed: [Token { expr: Number(1.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [1, 1], file: "REPL Input" }]
[Token { expr: Cons(Number(1.0), Cons(Number(2.0), Cons(Number(3.0), Bool(false)))), row: [1, 1], col: [1, 1], file: "REPL Input" }]

Returned (1 2 3)
eval> (1 2 3 ")" 2 3)
About to parse: "1 2 3 \")\" 2 3 "
7, )
I parsed: [Token { expr: Number(1.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(2.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [1, 1], file: "REPL Input" }, Token { expr: String(")"), row: [1, 1], col: [7, 2], file: "REPL Input" }, Token { expr: Symbol("\")\" 2"), row: [1, 1], col: [7, 2], file: "REPL Input" }, Token { expr: Number(3.0), row: [1, 1], col: [7, 2], file: "REPL Input" }]
[Token { expr: Cons(Number(1.0), Cons(Number(2.0), Cons(Number(3.0), Cons(String(")"), Cons(Symbol("\")\" 2"), Cons(Number(3.0), Bool(false))))))), row: [1, 1], col: [1, 1], file: "REPL Input" }]

Returned (1 2 3 ) ")" 2 3)

Why is that? EDIT: The highlighting that makes part of the code look like it is inside a string is a display bug; it is not inside a string when viewed in another text editor.
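
One possible lead, traced by hand from the code above rather than verified: the '"' arm finishes with i = end + 1 (the space right after the closing quote), and the outer loop then executes its unconditional i += 1, so that space is never matched by the whitespace arm and last_space is never moved past the string. When the next space is reached, the whitespace arm slices chars[last_space + 1..i], which re-includes the quoted text plus the following atom; that would explain the stray Symbol("\")\" 2") above. A sketch of that index arithmetic on the inner input "1 2 3 \")\" 2 3 " (the index values are hand-traced assumptions):

fn main() {
    let chars: Vec<char> = r#"1 2 3 ")" 2 3 "#.chars().collect();
    let last_space = 5; // the space before the opening quote
    // After the string token: i = end + 1 = 9 (the space after the closing
    // quote), and the outer loop's i += 1 then moves i to 10, so the space
    // at index 9 is never seen and last_space stays at 5.
    let i = 11; // the next space the whitespace arm actually reaches
    let expression: String = chars[last_space + 1..i].iter().collect();
    assert_eq!(expression, r#"")" 2"#); // the stray Symbol("\")\" 2")
    println!("{:?}", expression);
}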
