cmake_tidy_lexer/
lexer.rs1use cmake_tidy_ast::TextRange;
2
3use crate::cursor::Cursor;
4use crate::token::{Token, TokenKind};
5
6#[must_use]
7pub fn tokenize(source: &str) -> Vec<Token> {
8 let mut lexer = Lexer::new(source);
9 lexer.tokenize()
10}
11
12struct Lexer<'a> {
13 cursor: Cursor<'a>,
14}
15
16impl<'a> Lexer<'a> {
17 const fn new(source: &'a str) -> Self {
18 Self {
19 cursor: Cursor::new(source),
20 }
21 }
22
23 fn tokenize(&mut self) -> Vec<Token> {
24 let mut tokens = Vec::new();
25
26 while !self.cursor.is_eof() {
27 let start = self.cursor.offset();
28 let token = match self.cursor.peek_char() {
29 Some('(') => {
30 self.cursor.bump_char();
31 Token::new(
32 TokenKind::LeftParen,
33 TextRange::new(start, self.cursor.offset()),
34 )
35 }
36 Some(')') => {
37 self.cursor.bump_char();
38 Token::new(
39 TokenKind::RightParen,
40 TextRange::new(start, self.cursor.offset()),
41 )
42 }
43 Some('\n') => {
44 self.cursor.bump_char();
45 Token::new(
46 TokenKind::Newline,
47 TextRange::new(start, self.cursor.offset()),
48 )
49 }
50 Some('\r') => {
51 if self.cursor.starts_with("\r\n") {
52 self.cursor.advance_bytes(2);
53 } else {
54 self.cursor.bump_char();
55 }
56 Token::new(
57 TokenKind::Newline,
58 TextRange::new(start, self.cursor.offset()),
59 )
60 }
61 Some(' ' | '\t' | '\u{0C}') => self.lex_whitespace(start),
62 Some('#') => self.lex_comment(start),
63 Some('"') => self.lex_quoted_argument(start),
64 Some('[') => self.lex_bracket_or_unquoted(start),
65 Some(_) => self.lex_bare(start),
66 None => break,
67 };
68
69 tokens.push(token);
70 }
71
72 tokens
73 }
74
75 fn lex_whitespace(&mut self, start: usize) -> Token {
76 while let Some(character) = self.cursor.peek_char() {
77 if matches!(character, ' ' | '\t' | '\u{0C}') {
78 self.cursor.bump_char();
79 } else {
80 break;
81 }
82 }
83
84 let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
85 Token::new(
86 TokenKind::Whitespace(text),
87 TextRange::new(start, self.cursor.offset()),
88 )
89 }
90
91 fn lex_comment(&mut self, start: usize) -> Token {
92 while let Some(character) = self.cursor.peek_char() {
93 if matches!(character, '\n' | '\r') {
94 break;
95 }
96 self.cursor.bump_char();
97 }
98
99 let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
100 Token::new(
101 TokenKind::Comment(text),
102 TextRange::new(start, self.cursor.offset()),
103 )
104 }
105
106 fn lex_quoted_argument(&mut self, start: usize) -> Token {
107 self.cursor.bump_char();
108
109 while let Some(character) = self.cursor.peek_char() {
110 self.cursor.bump_char();
111
112 if character == '\\' {
113 let _ = self.cursor.bump_char();
114 continue;
115 }
116
117 if character == '"' {
118 break;
119 }
120 }
121
122 let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
123 Token::new(
124 TokenKind::QuotedArgument(text),
125 TextRange::new(start, self.cursor.offset()),
126 )
127 }
128
129 fn lex_bracket_or_unquoted(&mut self, start: usize) -> Token {
130 if let Some(open_len) = bracket_open_len(self.cursor.remaining()) {
131 let eq_count = open_len.saturating_sub(2);
132 let closing = format!("]{}]", "=".repeat(eq_count));
133
134 self.cursor.advance_bytes(open_len);
135
136 if let Some(relative_end) = self.cursor.remaining().find(&closing) {
137 self.cursor.advance_bytes(relative_end + closing.len());
138 } else {
139 self.cursor.advance_bytes(self.cursor.remaining().len());
140 }
141
142 let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
143 return Token::new(
144 TokenKind::BracketArgument(text),
145 TextRange::new(start, self.cursor.offset()),
146 );
147 }
148
149 self.lex_bare(start)
150 }
151
152 fn lex_bare(&mut self, start: usize) -> Token {
153 while let Some(character) = self.cursor.peek_char() {
154 if is_bare_terminator(character) {
155 break;
156 }
157 self.cursor.bump_char();
158 }
159
160 let text = self.cursor.slice(start, self.cursor.offset()).to_owned();
161 let kind = if is_identifier(&text) {
162 TokenKind::Identifier(text)
163 } else {
164 TokenKind::UnquotedArgument(text)
165 };
166
167 Token::new(kind, TextRange::new(start, self.cursor.offset()))
168 }
169}
170
/// Returns the byte length of the bracket opener at the start of `text`.
///
/// An opener is `[`, zero or more `=`, then `[`. Returns `None` when `text`
/// does not begin with that exact shape.
fn bracket_open_len(text: &str) -> Option<usize> {
    let after_first = text.strip_prefix('[')?;
    let equals = after_first
        .bytes()
        .take_while(|&byte| byte == b'=')
        .count();

    match after_first.as_bytes().get(equals) {
        // Two brackets plus the `=` signs between them.
        Some(b'[') => Some(equals + 2),
        _ => None,
    }
}
188
/// Reports whether `character` ends a bare word.
///
/// Terminators are the parentheses, the comment marker `#`, and the blank
/// and line-ending characters: space, tab, form feed, `\n` and `\r`.
const fn is_bare_terminator(character: char) -> bool {
    let is_delimiter = matches!(character, '(' | ')' | '#');
    let is_blank = matches!(character, ' ' | '\t' | '\u{0C}' | '\n' | '\r');
    is_delimiter || is_blank
}
195
/// Reports whether `text` has identifier shape.
///
/// The first character must be `_` or an ASCII letter; every later character
/// must be `_`, an ASCII letter, or an ASCII digit. The empty string is not
/// an identifier.
fn is_identifier(text: &str) -> bool {
    let mut rest = text.chars();

    match rest.next() {
        Some(head) if head == '_' || head.is_ascii_alphabetic() => {
            rest.all(|tail| tail == '_' || tail.is_ascii_alphanumeric())
        }
        _ => false,
    }
}