Skip to main content

cmake_tidy_lexer/
lexer.rs

1use cmake_tidy_ast::TextRange;
2
3use crate::cursor::Cursor;
4use crate::token::{Token, TokenKind};
5
6#[must_use]
7pub fn tokenize(source: &str) -> Vec<Token> {
8    let mut lexer = Lexer::new(source);
9    lexer.tokenize()
10}
11
/// Stateful tokenizer over a borrowed source string; all lexing methods
/// advance the wrapped cursor.
struct Lexer<'a> {
    // Current read position / remaining-input view into the source text.
    cursor: Cursor<'a>,
}
15
16impl<'a> Lexer<'a> {
17    const fn new(source: &'a str) -> Self {
18        Self {
19            cursor: Cursor::new(source),
20        }
21    }
22
23    fn tokenize(&mut self) -> Vec<Token> {
24        let mut tokens = Vec::new();
25
26        while !self.cursor.is_eof() {
27            let start = self.cursor.offset();
28            let token = match self.cursor.peek_char() {
29                Some('(') => {
30                    self.cursor.bump_char();
31                    Token::new(
32                        TokenKind::LeftParen,
33                        TextRange::new(start, self.cursor.offset()),
34                    )
35                }
36                Some(')') => {
37                    self.cursor.bump_char();
38                    Token::new(
39                        TokenKind::RightParen,
40                        TextRange::new(start, self.cursor.offset()),
41                    )
42                }
43                Some('\n') => {
44                    self.cursor.bump_char();
45                    Token::new(
46                        TokenKind::Newline,
47                        TextRange::new(start, self.cursor.offset()),
48                    )
49                }
50                Some('\r') => {
51                    if self.cursor.starts_with("\r\n") {
52                        self.cursor.advance_bytes(2);
53                    } else {
54                        self.cursor.bump_char();
55                    }
56                    Token::new(
57                        TokenKind::Newline,
58                        TextRange::new(start, self.cursor.offset()),
59                    )
60                }
61                Some(' ' | '\t' | '\u{0C}') => self.lex_whitespace(start),
62                Some('#') => self.lex_comment(start),
63                Some('"') => self.lex_quoted_argument(start),
64                Some('[') => self.lex_bracket_or_unquoted(start),
65                Some(_) => self.lex_bare(start),
66                None => break,
67            };
68
69            tokens.push(token);
70        }
71
72        tokens
73    }
74
75    fn lex_whitespace(&mut self, start: usize) -> Token {
76        while let Some(character) = self.cursor.peek_char() {
77            if matches!(character, ' ' | '\t' | '\u{0C}') {
78                self.cursor.bump_char();
79            } else {
80                break;
81            }
82        }
83
84        let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
85        Token::new(
86            TokenKind::Whitespace(text),
87            TextRange::new(start, self.cursor.offset()),
88        )
89    }
90
91    fn lex_comment(&mut self, start: usize) -> Token {
92        while let Some(character) = self.cursor.peek_char() {
93            if matches!(character, '\n' | '\r') {
94                break;
95            }
96            self.cursor.bump_char();
97        }
98
99        let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
100        Token::new(
101            TokenKind::Comment(text),
102            TextRange::new(start, self.cursor.offset()),
103        )
104    }
105
106    fn lex_quoted_argument(&mut self, start: usize) -> Token {
107        self.cursor.bump_char();
108
109        while let Some(character) = self.cursor.peek_char() {
110            self.cursor.bump_char();
111
112            if character == '\\' {
113                let _ = self.cursor.bump_char();
114                continue;
115            }
116
117            if character == '"' {
118                break;
119            }
120        }
121
122        let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
123        Token::new(
124            TokenKind::QuotedArgument(text),
125            TextRange::new(start, self.cursor.offset()),
126        )
127    }
128
129    fn lex_bracket_or_unquoted(&mut self, start: usize) -> Token {
130        if let Some(open_len) = bracket_open_len(self.cursor.remaining()) {
131            let eq_count = open_len.saturating_sub(2);
132            let closing = format!("]{}]", "=".repeat(eq_count));
133
134            self.cursor.advance_bytes(open_len);
135
136            if let Some(relative_end) = self.cursor.remaining().find(&closing) {
137                self.cursor.advance_bytes(relative_end + closing.len());
138            } else {
139                self.cursor.advance_bytes(self.cursor.remaining().len());
140            }
141
142            let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
143            return Token::new(
144                TokenKind::BracketArgument(text),
145                TextRange::new(start, self.cursor.offset()),
146            );
147        }
148
149        self.lex_bare(start)
150    }
151
152    fn lex_bare(&mut self, start: usize) -> Token {
153        while let Some(character) = self.cursor.peek_char() {
154            if is_bare_terminator(character) {
155                break;
156            }
157            self.cursor.bump_char();
158        }
159
160        let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
161        let kind = if is_identifier(&text) {
162            TokenKind::Identifier(text)
163        } else {
164            TokenKind::UnquotedArgument(text)
165        };
166
167        Token::new(kind, TextRange::new(start, self.cursor.offset()))
168    }
169}
170
/// Returns the byte length of a bracket opener (`[`, zero or more `=`, `[`)
/// at the start of `text`, or `None` when `text` does not begin with one.
///
/// All opener characters are ASCII, so byte-level matching is UTF-8 safe.
fn bracket_open_len(text: &str) -> Option<usize> {
    let rest = text.strip_prefix('[')?;
    let equals = rest.bytes().take_while(|&byte| byte == b'=').count();

    if rest.as_bytes().get(equals) == Some(&b'[') {
        // Opener = leading `[` + `equals` signs + trailing `[`.
        Some(equals + 2)
    } else {
        None
    }
}
188
/// True when `character` ends a bare (unquoted) lexeme: a parenthesis, the
/// comment marker, or any whitespace form the lexer recognizes.
const fn is_bare_terminator(character: char) -> bool {
    match character {
        '(' | ')' | '#' => true,
        ' ' | '\t' | '\n' | '\r' | '\u{0C}' => true,
        _ => false,
    }
}
195
/// True when `text` matches the identifier shape `[A-Za-z_][A-Za-z0-9_]*`.
/// The empty string is not an identifier.
fn is_identifier(text: &str) -> bool {
    let mut chars = text.chars();
    match chars.next() {
        Some(first) if first == '_' || first.is_ascii_alphabetic() => {
            chars.all(|rest| rest == '_' || rest.is_ascii_alphanumeric())
        }
        _ => false,
    }
}