css_parse/
parser.rs

1use crate::{
2	Cursor, Diagnostic, Feature, Kind, KindSet, ParserCheckpoint, ParserReturn, Result, SourceOffset, ToCursors,
3	traits::{Parse, Peek},
4};
5use bitmask_enum::bitmask;
6use bumpalo::{Bump, collections::Vec};
7use css_lexer::{AtomSet, DynAtomSet, SourceCursor};
8use std::mem;
9
// This is chosen rather arbitrarily, but:
// - It needs to be larger than the largest `peek_n` distance we currently peek (5), because BUFFER_REFILL_INDEX is
//   derived from it as `BUFFER_LEN - 5`.
// - It would be nice to keep Parser aligned to 64. It's not moved/copied... ever, so struct size doesn't really matter
//   but making it, say, 1000, doesn't really improve performance. Always benchmark when changing!
14const BUFFER_LEN: usize = 12;
15// This number is chosen specifically because we peek_n(5) at most. Ensuring the buffer is always full enough that
16// peeks only use the buffer and don't end up cloning the lexer. While cloning the lexer is quite cheap, it's definitely
17// cheaper to simply look into the buffer. If we ever peek more than 5 tokens, we should change this number.
18const BUFFER_REFILL_INDEX: usize = BUFFER_LEN - 5;
19
/// A parser that consumes a stream of `Cursor`s through a small fixed-size lookahead buffer.
///
/// Cursors are pulled from `cursor_iter` into `buffer`; peeking reads from the buffer first and only clones the
/// iterator when the buffer does not reach far enough.
20#[derive(Debug)]
21pub struct Parser<'a, I: Iterator<Item = Cursor> + Clone> {
	/// The source text; used to slice out the text of a cursor and to position the synthetic EOF cursor.
22	pub(crate) source_text: &'a str,
23
	/// Supplies the cursors that come after those already held in `buffer`.
24	pub(crate) cursor_iter: I,
25
	/// Feature flags, set via `with_features` and queried via `enabled`.
26	#[allow(dead_code)]
27	pub(crate) features: Feature,
28
	/// Diagnostics collected so far; `parse_entirely` hands them to the `ParserReturn` and resets this list.
29	pub(crate) errors: Vec<'a, Diagnostic>,
30
	/// Skipped cursors, stored as `(skipped cursors, the cursor they were found in front of)`.
31	pub(crate) trivia: Vec<'a, (Vec<'a, Cursor>, Cursor)>,
32
	/// Current `State` flags; replaced wholesale by `set_state` and restored by `rewind`.
33	pub(crate) state: State,
34
	/// Arena used for every `Vec` this parser allocates.
35	pub(crate) bump: &'a Bump,
36
	/// Kinds that `next` and `peek_n` step over (initialised to `KindSet::TRIVIA`).
37	skip: KindSet,
38
	/// Kinds that `next_is_stop` tests the upcoming cursor against (initialised to `KindSet::NONE`).
39	stop: KindSet,
40
	/// Lookahead buffer, plus the index of the next unread cursor inside it.
41	buffer: [Cursor; BUFFER_LEN],
42	buffer_index: usize,
43
	/// Debug builds only: the last cursor `next` fetched directly from `cursor_iter`, used to detect a cursor being
	/// returned twice in a row.
44	#[cfg(debug_assertions)]
45	pub(crate) last_cursor: Option<Cursor>,
46}
47
/// Bit flags describing the context the parser is currently in.
///
/// Queried with `Parser::is` and replaced with `Parser::set_state`; the default is the empty set.
48#[bitmask(u8)]
49#[bitmask_config(vec_debug)]
50#[derive(Default)]
51pub enum State {
	/// NOTE(review): presumably set while parsing inside a nested rule — confirm against the callers.
52	Nested = 0b0000_0001,
53	/// Disallow relative selectors (:has). Set when inside :has() since nested :has() is invalid.
54	DisallowRelativeSelector = 0b0000_0010,
55}
56
57#[inline]
58fn eof_cursor(len: usize) -> Cursor {
59	let eof_offset = css_lexer::SourceOffset(len as u32);
60	Cursor::new(eof_offset, css_lexer::Token::EOF)
61}
62
63impl<'a, I> Parser<'a, I>
64where
65	I: Iterator<Item = Cursor> + Clone,
66{
67	/// Create a new parser with an iterator over cursors
68	pub fn new(bump: &'a Bump, source_text: &'a str, mut cursor_iter: I) -> Self {
69		let eof_cursor = eof_cursor(source_text.len());
70		let mut buffer = [eof_cursor; BUFFER_LEN];
71		buffer.fill_with(|| cursor_iter.next().unwrap_or(eof_cursor));
72
73		Self {
74			source_text,
75			cursor_iter,
76			features: Feature::none(),
77			errors: Vec::new_in(bump),
78			trivia: Vec::new_in(bump),
79			state: State::none(),
80			skip: KindSet::TRIVIA,
81			stop: KindSet::NONE,
82			buffer,
83			buffer_index: 0,
84			bump,
85			#[cfg(debug_assertions)]
86			last_cursor: None,
87		}
88	}
89
90	pub fn with_features(mut self, features: Feature) -> Self {
91		self.features = features;
92		self
93	}
94
95	fn fill_buffer(&mut self, from: usize) {
96		// Shift remaining buffer cursors left to the start of the slice.
97		self.buffer.copy_within(from..BUFFER_LEN, 0);
98		// Re-fill the buffer with new cursors.
99		let eof = eof_cursor(self.source_text.len());
100		for i in BUFFER_LEN - from..BUFFER_LEN {
101			self.buffer[i] = self.cursor_iter.next().unwrap_or(eof);
102		}
103		self.buffer_index = 0;
104	}
105
106	#[inline]
107	pub fn bump(&self) -> &'a Bump {
108		self.bump
109	}
110
111	#[inline]
112	pub fn enabled(&self, other: Feature) -> bool {
113		self.features.contains(other)
114	}
115
116	#[inline]
117	pub fn is(&self, state: State) -> bool {
118		self.state.contains(state)
119	}
120
121	#[inline]
122	pub fn set_state(&mut self, state: State) -> State {
123		let old = self.state;
124		self.state = state;
125		old
126	}
127
128	#[inline]
129	pub fn set_skip(&mut self, skip: KindSet) -> KindSet {
130		let old = self.skip;
131		self.skip = skip;
132		old
133	}
134
135	#[inline]
136	pub fn set_stop(&mut self, stop: KindSet) -> KindSet {
137		let old = self.stop;
138		self.stop = stop;
139		old
140	}
141
142	pub fn parse_entirely<T: Parse<'a> + ToCursors>(&mut self) -> ParserReturn<'a, T> {
143		let output = match T::parse(self) {
144			Ok(output) => Some(output),
145			Err(error) => {
146				self.errors.push(error);
147				None
148			}
149		};
150		let remaining_non_trivia = !self.at_end() && self.peek_n(1) != Kind::Eof;
151		let at_end = self.peek_n_with_skip(1, KindSet::NONE) == Kind::Eof;
152
153		if !at_end {
154			let start = self.peek_n_with_skip(1, KindSet::NONE);
155			let mut end;
156			loop {
157				end = self.next();
158				if end == Kind::Eof {
159					break;
160				}
161			}
162			if remaining_non_trivia {
163				self.errors.push(Diagnostic::new(start, Diagnostic::expected_end).with_end_cursor(end));
164			}
165		}
166		let errors = mem::replace(&mut self.errors, Vec::new_in(self.bump));
167		let trivia = mem::replace(&mut self.trivia, Vec::new_in(self.bump));
168		ParserReturn::new(output, self.source_text, errors, trivia)
169	}
170
171	pub fn parse<T: Parse<'a>>(&mut self) -> Result<T> {
172		T::parse(self)
173	}
174
175	pub fn peek<T: Peek<'a>>(&self) -> bool {
176		T::peek(self, self.peek_n(1))
177	}
178
179	pub fn parse_if_peek<T: Peek<'a> + Parse<'a>>(&mut self) -> Result<Option<T>> {
180		if T::peek(self, self.peek_n(1)) { T::parse(self).map(Some) } else { Ok(None) }
181	}
182
183	pub fn try_parse<T: Parse<'a>>(&mut self) -> Result<T> {
184		T::try_parse(self)
185	}
186
187	pub fn try_parse_if_peek<T: Peek<'a> + Parse<'a>>(&mut self) -> Result<Option<T>> {
188		if T::peek(self, self.peek_n(1)) { T::try_parse(self).map(Some) } else { Ok(None) }
189	}
190
191	pub fn equals_atom(&self, c: Cursor, atom: &'static dyn DynAtomSet) -> bool {
192		let mut cursor_bits = c.atom_bits();
193		if cursor_bits == 0 {
194			let source_cursor = self.to_source_cursor(c);
195			cursor_bits = atom.str_to_bits(&source_cursor.parse(self.bump));
196		}
197		cursor_bits == atom.bits()
198	}
199
200	pub fn to_atom<A: AtomSet + PartialEq>(&self, c: Cursor) -> A {
201		let bits = c.atom_bits();
202		if bits == 0 {
203			let source_cursor = self.to_source_cursor(c);
204			return A::from_str(&source_cursor.parse(self.bump));
205		}
206		#[cfg(debug_assertions)]
207		{
208			let source_cursor = self.to_source_cursor(c);
209			if !(c == Kind::Ident && c.token().is_dashed_ident()) {
210				debug_assert!(
211					A::from_bits(bits) == A::from_str(&source_cursor.parse(self.bump)),
212					"{:?} -> {:?} != {:?} ({:?})",
213					c,
214					A::from_bits(bits),
215					A::from_str(&source_cursor.parse(self.bump)),
216					source_cursor.parse(self.bump)
217				);
218			}
219		}
220		A::from_bits(bits)
221	}
222
223	#[inline(always)]
224	pub fn offset(&self) -> SourceOffset {
225		self.buffer[self.buffer_index].offset()
226	}
227
228	#[inline(always)]
229	pub fn at_end(&self) -> bool {
230		self.buffer[self.buffer_index] == Kind::Eof
231	}
232
233	pub fn rewind(&mut self, checkpoint: ParserCheckpoint<I>) {
234		let ParserCheckpoint { iter, errors_pos, trivia_pos, buffer, buffer_index, skip, stop, state, .. } = checkpoint;
235
236		self.cursor_iter = iter;
237
238		self.errors.truncate(errors_pos as usize);
239		self.trivia.truncate(trivia_pos as usize);
240
241		self.buffer = buffer;
242		self.buffer_index = buffer_index;
243
244		self.skip = skip;
245		self.stop = stop;
246		self.state = state;
247
248		#[cfg(debug_assertions)]
249		{
250			self.last_cursor = None;
251		}
252	}
253
254	#[inline]
255	pub fn checkpoint(&self) -> ParserCheckpoint<I> {
256		ParserCheckpoint {
257			cursor: self.buffer[self.buffer_index],
258			errors_pos: self.errors.len() as u8,
259			trivia_pos: self.trivia.len() as u16,
260			iter: self.cursor_iter.clone(),
261			buffer: self.buffer,
262			buffer_index: self.buffer_index,
263			skip: self.skip,
264			stop: self.stop,
265			state: self.state,
266		}
267	}
268
269	#[inline]
270	pub fn next_is_stop(&self) -> bool {
271		for c in &self.buffer[self.buffer_index..BUFFER_LEN] {
272			if c != self.skip {
273				return c == self.stop;
274			}
275		}
276
277		let mut iter = self.cursor_iter.clone();
278		loop {
279			let Some(cursor) = iter.next() else {
280				return false;
281			};
282			if cursor != self.skip {
283				return cursor == self.stop;
284			}
285		}
286	}
287
288	#[inline]
289	pub(crate) fn peek_n_with_skip(&self, n: u8, skip: KindSet) -> Cursor {
290		let mut remaining = n;
291
292		for c in &self.buffer[self.buffer_index..BUFFER_LEN] {
293			if c == Kind::Eof {
294				return *c;
295			}
296			if c != skip {
297				remaining -= 1;
298				if remaining == 0 {
299					return *c;
300				}
301			}
302		}
303
304		let mut iter = self.cursor_iter.clone();
305		loop {
306			let Some(cursor) = iter.next() else {
307				return eof_cursor(self.source_text.len());
308			};
309			if cursor == Kind::Eof {
310				return cursor;
311			}
312			if cursor != skip {
313				remaining -= 1;
314				if remaining == 0 {
315					return cursor;
316				}
317			}
318		}
319	}
320
321	#[inline]
322	pub fn peek_n(&self, n: u8) -> Cursor {
323		self.peek_n_with_skip(n, self.skip)
324	}
325
326	pub fn to_source_cursor(&self, cursor: Cursor) -> SourceCursor<'a> {
327		SourceCursor::from(cursor, cursor.str_slice(self.source_text))
328	}
329
330	pub fn consume_trivia(&mut self) -> Vec<'a, Cursor> {
331		let mut trivia = Vec::new_in(self.bump);
332		for i in self.buffer_index..BUFFER_LEN {
333			let c = self.buffer[i];
334			if c == Kind::Eof {
335				return trivia;
336			} else if c == self.skip {
337				trivia.push(c)
338			} else {
339				self.fill_buffer(i);
340				return trivia;
341			}
342		}
343
344		loop {
345			let Some(c) = self.cursor_iter.next() else {
346				return trivia;
347			};
348			if c == Kind::Eof {
349				return trivia;
350			} else if c == self.skip {
351				trivia.push(c)
352			} else {
353				let eof = eof_cursor(self.source_text.len());
354				self.buffer[0] = c;
355				for i in 1..BUFFER_LEN {
356					self.buffer[i] = self.cursor_iter.next().unwrap_or(eof);
357				}
358				self.buffer_index = 0;
359				return trivia;
360			}
361		}
362	}
363
364	/// Consume trivia and attach it to the next content token for output preservation.
365	/// This should be called when you want to consume whitespace/comments but preserve
366	/// them for round-trip output fidelity.
367	pub fn consume_trivia_as_leading(&mut self) {
368		let trivia = self.consume_trivia();
369		if !trivia.is_empty() {
370			// Peek the next content token to attach trivia to it
371			let next = self.peek_n(1);
372			self.trivia.push((trivia, next));
373		}
374	}
375
376	#[allow(clippy::should_implement_trait)]
377	pub fn next(&mut self) -> Cursor {
378		// Collect trivia that should be associated with the next content token
379		let mut pending_trivia = Vec::new_in(self.bump);
380
381		if self.buffer_index >= BUFFER_REFILL_INDEX {
382			self.fill_buffer(self.buffer_index);
383		}
384
385		for i in self.buffer_index..BUFFER_LEN {
386			let c = self.buffer[i];
387			if c == Kind::Eof {
388				self.buffer_index = i + 1;
389				// Associate pending trivia with EOF if any
390				if !pending_trivia.is_empty() {
391					self.trivia.push((pending_trivia.clone(), c));
392				}
393				return c;
394			} else if c == self.skip {
395				pending_trivia.push(c);
396				self.buffer_index = i + 1;
397			} else {
398				self.buffer_index = i + 1;
399				// Associate all pending trivia with this content token
400				if !pending_trivia.is_empty() {
401					self.trivia.push((pending_trivia.clone(), c));
402				}
403				return c;
404			}
405		}
406
407		let c;
408		loop {
409			let Some(cursor) = self.cursor_iter.next() else {
410				let eof_cursor = eof_cursor(self.source_text.len());
411				if !pending_trivia.is_empty() {
412					self.trivia.push((pending_trivia.clone(), eof_cursor));
413				}
414				return eof_cursor;
415			};
416			if cursor == Kind::Eof || cursor != self.skip {
417				c = cursor;
418				break;
419			}
420			pending_trivia.push(cursor);
421		}
422
423		// Associate pending trivia with the content token we found
424		if !pending_trivia.is_empty() {
425			self.trivia.push((pending_trivia.clone(), c));
426		}
427
428		#[cfg(debug_assertions)]
429		if let Some(last_cursor) = self.last_cursor {
430			debug_assert!(last_cursor != c, "Detected a next loop, {c:?} was fetched twice");
431		}
432		#[cfg(debug_assertions)]
433		if c == Kind::Eof {
434			self.last_cursor = None;
435		} else {
436			self.last_cursor = Some(c);
437		}
438
439		c
440	}
441}
442
/// Walks a stream of 22 numbers with the default skip set, checking that `peek_n` looks past whitespace, that
/// `next` consumes one number at a time, and that `rewind` lets the same walk be repeated from a checkpoint.
#[test]
fn peek_and_next() {
	let source = "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21";
	let bump = bumpalo::Bump::default();
	let lexer = css_lexer::Lexer::new(&css_lexer::EmptyAtomSet::ATOMS, &source);
	let mut p = Parser::new(&bump, &source, lexer);
	assert!(!p.at_end());
	assert_eq!(p.offset(), 0);

	// A peeked cursor must be a number token carrying the expected value.
	let expect_number = |cursor: Cursor, value: i32| {
		assert_eq!(cursor.token(), Kind::Number);
		assert_eq!(cursor.token().value(), value as f32);
	};

	for pass in 0..2 {
		let start = p.checkpoint();
		for i in 0..20 {
			expect_number(p.peek_n(1), i);
			expect_number(p.peek_n(2), i + 1);
			expect_number(p.peek_n(3), i + 2);
			let consumed = p.next();
			assert_eq!(consumed.token().value(), i as f32);
			expect_number(p.peek_n(1), i + 1);
		}
		// The first pass is undone so the second pass repeats it from the same position.
		if pass == 0 {
			p.rewind(start)
		}
	}

	expect_number(p.next(), 20);
	expect_number(p.next(), 21);
	let last = p.next();
	assert_eq!(last.token(), Kind::Eof);
}
482
/// Same walk as `peek_and_next`, but with only comments in the skip set, so whitespace cursors show up between the
/// numbers in both `peek_n` and `next`.
#[test]
fn peek_and_next_with_whitsespace() {
	let source = "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21";
	let bump = bumpalo::Bump::default();
	let lexer = css_lexer::Lexer::new(&css_lexer::EmptyAtomSet::ATOMS, &source);
	let mut p = Parser::new(&bump, &source, lexer);
	p.set_skip(KindSet::COMMENTS);
	assert!(!p.at_end());
	assert_eq!(p.offset(), 0);

	// A peeked cursor must be a number token carrying the expected value.
	let expect_number = |cursor: Cursor, value: i32| {
		assert_eq!(cursor.token(), Kind::Number);
		assert_eq!(cursor.token().value(), value as f32);
	};
	// A peeked cursor must be a whitespace token.
	let expect_whitespace = |cursor: Cursor| {
		assert_eq!(cursor.token(), Kind::Whitespace);
	};

	for pass in 0..2 {
		let start = p.checkpoint();
		for i in 0..20 {
			expect_number(p.peek_n(1), i);
			expect_whitespace(p.peek_n(2));
			expect_number(p.peek_n(3), i + 1);
			expect_whitespace(p.peek_n(4));
			expect_number(p.peek_n(5), i + 2);
			let consumed = p.next();
			assert_eq!(consumed.token().value(), i as f32);
			expect_whitespace(p.peek_n(1));
			expect_number(p.peek_n(2), i + 1);
			// Step over the whitespace separating this number from the next.
			p.next();
		}
		// The first pass is undone so the second pass repeats it from the same position.
		if pass == 0 {
			p.rewind(start);
		}
	}

	expect_number(p.next(), 20);
	expect_whitespace(p.next());
	expect_number(p.next(), 21);
	let last = p.next();
	assert_eq!(last.token(), Kind::Eof);
}