css_parse/
parser.rs

1use crate::{
2	Cursor, Diagnostic, Feature, Kind, KindSet, ParserCheckpoint, ParserReturn, Result, SourceOffset, ToCursors,
3	traits::{Parse, Peek},
4};
5use bitmask_enum::bitmask;
6use bumpalo::{Bump, collections::Vec};
7use css_lexer::{AtomSet, DynAtomSet, SourceCursor};
8use std::mem;
9
10// This is chosen rather arbitrarily, but:
11// - It needs to be a number larger than BUFFER_REFILL_INDEX (the largest `peek_n` distance we currently peek).
12// - It would be nice to keep Parser aligned to 64. It's not moved/copied... ever, so struct size doesn't really matter
13//   but making it, say, 1000, doesn't really improve performance. Always benchmark when changing!
14const BUFFER_LEN: usize = 12;
15// This number is chosen specifically because we peek_n(5) at most. Ensuring the buffer is always full enough that
16// peeks only use the buffer and don't end up cloning the lexer. While cloning the lexer is quite cheap, it's definitely
17// cheaper to simply look into the buffer. If we ever peek more than 5 tokens, we should change this number.
18const BUFFER_REFILL_INDEX: usize = BUFFER_LEN - 5;
19
20#[derive(Debug)]
21pub struct Parser<'a, I: Iterator<Item = Cursor> + Clone> {
22	pub(crate) source_text: &'a str,
23
24	pub(crate) cursor_iter: I,
25
26	#[allow(dead_code)]
27	pub(crate) features: Feature,
28
29	pub(crate) errors: Vec<'a, Diagnostic>,
30
31	pub(crate) trivia: Vec<'a, (Vec<'a, Cursor>, Cursor)>,
32
33	pub(crate) state: State,
34
35	pub(crate) bump: &'a Bump,
36
37	skip: KindSet,
38
39	stop: KindSet,
40
41	buffer: [Cursor; BUFFER_LEN],
42	buffer_index: usize,
43
44	#[cfg(debug_assertions)]
45	pub(crate) last_cursor: Option<Cursor>,
46}
47
48#[bitmask(u8)]
49#[bitmask_config(vec_debug)]
50#[derive(Default)]
51pub enum State {
52	Nested = 0b0000_0001,
53}
54
55#[inline]
56fn eof_cursor(len: usize) -> Cursor {
57	let eof_offset = css_lexer::SourceOffset(len as u32);
58	Cursor::new(eof_offset, css_lexer::Token::EOF)
59}
60
61impl<'a, I> Parser<'a, I>
62where
63	I: Iterator<Item = Cursor> + Clone,
64{
65	/// Create a new parser with an iterator over cursors
66	pub fn new(bump: &'a Bump, source_text: &'a str, mut cursor_iter: I) -> Self {
67		let eof_cursor = eof_cursor(source_text.len());
68		let mut buffer = [eof_cursor; BUFFER_LEN];
69		buffer.fill_with(|| cursor_iter.next().unwrap_or(eof_cursor));
70
71		Self {
72			source_text,
73			cursor_iter,
74			features: Feature::none(),
75			errors: Vec::new_in(bump),
76			trivia: Vec::new_in(bump),
77			state: State::none(),
78			skip: KindSet::TRIVIA,
79			stop: KindSet::NONE,
80			buffer,
81			buffer_index: 0,
82			bump,
83			#[cfg(debug_assertions)]
84			last_cursor: None,
85		}
86	}
87
88	pub fn with_features(mut self, features: Feature) -> Self {
89		self.features = features;
90		self
91	}
92
93	fn fill_buffer(&mut self, from: usize) {
94		// Shift remaining buffer cursors left to the start of the slice.
95		self.buffer.copy_within(from..BUFFER_LEN, 0);
96		// Re-fill the buffer with new cursors.
97		let eof = eof_cursor(self.source_text.len());
98		for i in BUFFER_LEN - from..BUFFER_LEN {
99			self.buffer[i] = self.cursor_iter.next().unwrap_or(eof);
100		}
101		self.buffer_index = 0;
102	}
103
104	#[inline]
105	pub fn bump(&self) -> &'a Bump {
106		self.bump
107	}
108
109	#[inline]
110	pub fn enabled(&self, other: Feature) -> bool {
111		self.features.contains(other)
112	}
113
114	#[inline]
115	pub fn is(&self, state: State) -> bool {
116		self.state.contains(state)
117	}
118
119	#[inline]
120	pub fn set_state(&mut self, state: State) -> State {
121		let old = self.state;
122		self.state = state;
123		old
124	}
125
126	#[inline]
127	pub fn set_skip(&mut self, skip: KindSet) -> KindSet {
128		let old = self.skip;
129		self.skip = skip;
130		old
131	}
132
133	#[inline]
134	pub fn set_stop(&mut self, stop: KindSet) -> KindSet {
135		let old = self.stop;
136		self.stop = stop;
137		old
138	}
139
140	pub fn parse_entirely<T: Parse<'a> + ToCursors>(&mut self) -> ParserReturn<'a, T> {
141		let output = match T::parse(self) {
142			Ok(output) => Some(output),
143			Err(error) => {
144				self.errors.push(error);
145				None
146			}
147		};
148		let remaining_non_trivia = !self.at_end() && self.peek_n(1) != Kind::Eof;
149		let at_end = self.peek_n_with_skip(1, KindSet::NONE) == Kind::Eof;
150
151		if !at_end {
152			let start = self.peek_n_with_skip(1, KindSet::NONE);
153			let mut end;
154			loop {
155				end = self.next();
156				if end == Kind::Eof {
157					break;
158				}
159			}
160			if remaining_non_trivia {
161				self.errors.push(Diagnostic::new(start, Diagnostic::expected_end).with_end_cursor(end));
162			}
163		}
164		let errors = mem::replace(&mut self.errors, Vec::new_in(self.bump));
165		let trivia = mem::replace(&mut self.trivia, Vec::new_in(self.bump));
166		ParserReturn::new(output, self.source_text, errors, trivia)
167	}
168
169	pub fn parse<T: Parse<'a>>(&mut self) -> Result<T> {
170		T::parse(self)
171	}
172
173	pub fn peek<T: Peek<'a>>(&self) -> bool {
174		T::peek(self, self.peek_n(1))
175	}
176
177	pub fn parse_if_peek<T: Peek<'a> + Parse<'a>>(&mut self) -> Result<Option<T>> {
178		if T::peek(self, self.peek_n(1)) { T::parse(self).map(Some) } else { Ok(None) }
179	}
180
181	pub fn try_parse<T: Parse<'a>>(&mut self) -> Result<T> {
182		T::try_parse(self)
183	}
184
185	pub fn try_parse_if_peek<T: Peek<'a> + Parse<'a>>(&mut self) -> Result<Option<T>> {
186		if T::peek(self, self.peek_n(1)) { T::try_parse(self).map(Some) } else { Ok(None) }
187	}
188
189	pub fn equals_atom(&self, c: Cursor, atom: &'static dyn DynAtomSet) -> bool {
190		let mut cursor_bits = c.atom_bits();
191		if cursor_bits == 0 {
192			let source_cursor = self.to_source_cursor(c);
193			cursor_bits = atom.str_to_bits(&source_cursor.parse(self.bump));
194		}
195		cursor_bits == atom.bits()
196	}
197
198	pub fn to_atom<A: AtomSet + PartialEq>(&self, c: Cursor) -> A {
199		let bits = c.atom_bits();
200		if bits == 0 {
201			let source_cursor = self.to_source_cursor(c);
202			return A::from_str(&source_cursor.parse(self.bump));
203		}
204		#[cfg(debug_assertions)]
205		{
206			let source_cursor = self.to_source_cursor(c);
207			debug_assert!(
208				A::from_bits(bits) == A::from_str(&source_cursor.parse(self.bump)),
209				"{:?} -> {:?} != {:?} ({:?})",
210				c,
211				A::from_bits(bits),
212				A::from_str(&source_cursor.parse(self.bump)),
213				source_cursor.parse(self.bump)
214			);
215		}
216		A::from_bits(bits)
217	}
218
219	#[inline(always)]
220	pub fn offset(&self) -> SourceOffset {
221		self.buffer[self.buffer_index].offset()
222	}
223
224	#[inline(always)]
225	pub fn at_end(&self) -> bool {
226		self.buffer[self.buffer_index] == Kind::Eof
227	}
228
229	pub fn rewind(&mut self, checkpoint: ParserCheckpoint<I>) {
230		let ParserCheckpoint { iter, errors_pos, trivia_pos, buffer, buffer_index, .. } = checkpoint;
231
232		self.cursor_iter = iter;
233
234		self.errors.truncate(errors_pos as usize);
235		self.trivia.truncate(trivia_pos as usize);
236
237		self.buffer = buffer;
238		self.buffer_index = buffer_index;
239
240		#[cfg(debug_assertions)]
241		{
242			self.last_cursor = None;
243		}
244	}
245
246	#[inline]
247	pub fn checkpoint(&self) -> ParserCheckpoint<I> {
248		ParserCheckpoint {
249			cursor: self.buffer[self.buffer_index],
250			errors_pos: self.errors.len() as u8,
251			trivia_pos: self.trivia.len() as u16,
252			iter: self.cursor_iter.clone(),
253			buffer: self.buffer,
254			buffer_index: self.buffer_index,
255		}
256	}
257
258	#[inline]
259	pub fn next_is_stop(&self) -> bool {
260		for c in &self.buffer[self.buffer_index..BUFFER_LEN] {
261			if c != self.skip {
262				return c == self.stop;
263			}
264		}
265
266		let mut iter = self.cursor_iter.clone();
267		loop {
268			let Some(cursor) = iter.next() else {
269				return false;
270			};
271			if cursor != self.skip {
272				return cursor == self.stop;
273			}
274		}
275	}
276
277	#[inline]
278	pub(crate) fn peek_n_with_skip(&self, n: u8, skip: KindSet) -> Cursor {
279		let mut remaining = n;
280
281		for c in &self.buffer[self.buffer_index..BUFFER_LEN] {
282			if c == Kind::Eof {
283				return *c;
284			}
285			if c != skip {
286				remaining -= 1;
287				if remaining == 0 {
288					return *c;
289				}
290			}
291		}
292
293		let mut iter = self.cursor_iter.clone();
294		loop {
295			let Some(cursor) = iter.next() else {
296				return eof_cursor(self.source_text.len());
297			};
298			if cursor == Kind::Eof {
299				return cursor;
300			}
301			if cursor != skip {
302				remaining -= 1;
303				if remaining == 0 {
304					return cursor;
305				}
306			}
307		}
308	}
309
310	#[inline]
311	pub fn peek_n(&self, n: u8) -> Cursor {
312		self.peek_n_with_skip(n, self.skip)
313	}
314
315	pub fn to_source_cursor(&self, cursor: Cursor) -> SourceCursor<'a> {
316		SourceCursor::from(cursor, cursor.str_slice(self.source_text))
317	}
318
319	pub fn consume_trivia(&mut self) -> Vec<'a, Cursor> {
320		let mut trivia = Vec::new_in(self.bump);
321		for i in self.buffer_index..BUFFER_LEN {
322			let c = self.buffer[i];
323			if c == Kind::Eof {
324				return trivia;
325			} else if c == self.skip {
326				trivia.push(c)
327			} else {
328				self.fill_buffer(i);
329				return trivia;
330			}
331		}
332
333		loop {
334			let Some(c) = self.cursor_iter.next() else {
335				return trivia;
336			};
337			if c == Kind::Eof {
338				return trivia;
339			} else if c == self.skip {
340				trivia.push(c)
341			} else {
342				let eof = eof_cursor(self.source_text.len());
343				self.buffer[0] = c;
344				for i in 1..BUFFER_LEN {
345					self.buffer[i] = self.cursor_iter.next().unwrap_or(eof);
346				}
347				self.buffer_index = 0;
348				return trivia;
349			}
350		}
351	}
352
353	#[allow(clippy::should_implement_trait)]
354	pub fn next(&mut self) -> Cursor {
355		// Collect trivia that should be associated with the next content token
356		let mut pending_trivia = Vec::new_in(self.bump);
357
358		if self.buffer_index >= BUFFER_REFILL_INDEX {
359			self.fill_buffer(self.buffer_index);
360		}
361
362		for i in self.buffer_index..BUFFER_LEN {
363			let c = self.buffer[i];
364			if c == Kind::Eof {
365				self.buffer_index = i + 1;
366				// Associate pending trivia with EOF if any
367				if !pending_trivia.is_empty() {
368					self.trivia.push((pending_trivia.clone(), c));
369				}
370				return c;
371			} else if c == self.skip {
372				pending_trivia.push(c);
373				self.buffer_index = i + 1;
374			} else {
375				self.buffer_index = i + 1;
376				// Associate all pending trivia with this content token
377				if !pending_trivia.is_empty() {
378					self.trivia.push((pending_trivia.clone(), c));
379				}
380				return c;
381			}
382		}
383
384		let c;
385		loop {
386			let Some(cursor) = self.cursor_iter.next() else {
387				let eof_cursor = eof_cursor(self.source_text.len());
388				if !pending_trivia.is_empty() {
389					self.trivia.push((pending_trivia.clone(), eof_cursor));
390				}
391				return eof_cursor;
392			};
393			if cursor == Kind::Eof || cursor != self.skip {
394				c = cursor;
395				break;
396			}
397			pending_trivia.push(cursor);
398		}
399
400		// Associate pending trivia with the content token we found
401		if !pending_trivia.is_empty() {
402			self.trivia.push((pending_trivia.clone(), c));
403		}
404
405		#[cfg(debug_assertions)]
406		if let Some(last_cursor) = self.last_cursor {
407			debug_assert!(last_cursor != c, "Detected a next loop, {c:?} was fetched twice");
408		}
409		#[cfg(debug_assertions)]
410		if c == Kind::Eof {
411			self.last_cursor = None;
412		} else {
413			self.last_cursor = Some(c);
414		}
415
416		c
417	}
418}
419
420#[test]
421fn peek_and_next() {
422	let str = "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21";
423	let bump = bumpalo::Bump::default();
424	let lexer = css_lexer::Lexer::new(&css_lexer::EmptyAtomSet::ATOMS, &str);
425	let mut p = Parser::new(&bump, &str, lexer);
426	assert_eq!(p.at_end(), false);
427	assert_eq!(p.offset(), 0);
428	for n in 0..=1 {
429		let c = p.checkpoint();
430		for i in 0..=19 {
431			let c = p.peek_n(1);
432			assert_eq!(c.token(), Kind::Number);
433			assert_eq!(c.token().value(), i as f32);
434			let c = p.peek_n(2);
435			assert_eq!(c.token(), Kind::Number);
436			assert_eq!(c.token().value(), (i + 1) as f32);
437			let c = p.peek_n(3);
438			assert_eq!(c.token(), Kind::Number);
439			assert_eq!(c.token().value(), (i + 2) as f32);
440			let c = p.next();
441			assert_eq!(c.token().value(), i as f32);
442			let c = p.peek_n(1);
443			assert_eq!(c.token(), Kind::Number);
444			assert_eq!(c.token().value(), (i + 1) as f32);
445		}
446		if n == 0 {
447			p.rewind(c)
448		}
449	}
450	let c = p.next();
451	assert_eq!(c.token(), Kind::Number);
452	assert_eq!(c.token().value(), 20.0);
453	let c = p.next();
454	assert_eq!(c.token(), Kind::Number);
455	assert_eq!(c.token().value(), 21.0);
456	let c = p.next();
457	assert_eq!(c.token(), Kind::Eof);
458}
459
460#[test]
461fn peek_and_next_with_whitsespace() {
462	let str = "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21";
463	let bump = bumpalo::Bump::default();
464	let lexer = css_lexer::Lexer::new(&css_lexer::EmptyAtomSet::ATOMS, &str);
465	let mut p = Parser::new(&bump, &str, lexer);
466	p.set_skip(KindSet::COMMENTS);
467	assert_eq!(p.at_end(), false);
468	assert_eq!(p.offset(), 0);
469	for n in 0..=1 {
470		let c = p.checkpoint();
471		for i in 0..=19 {
472			let c = p.peek_n(1);
473			assert_eq!(c.token(), Kind::Number);
474			assert_eq!(c.token().value(), i as f32);
475			let c = p.peek_n(2);
476			assert_eq!(c.token(), Kind::Whitespace);
477			let c = p.peek_n(3);
478			assert_eq!(c.token(), Kind::Number);
479			assert_eq!(c.token().value(), (i + 1) as f32);
480			let c = p.peek_n(4);
481			assert_eq!(c.token(), Kind::Whitespace);
482			let c = p.peek_n(5);
483			assert_eq!(c.token(), Kind::Number);
484			assert_eq!(c.token().value(), (i + 2) as f32);
485			let c = p.next();
486			assert_eq!(c.token().value(), i as f32);
487			let c = p.peek_n(1);
488			assert_eq!(c.token(), Kind::Whitespace);
489			let c = p.peek_n(2);
490			assert_eq!(c.token(), Kind::Number);
491			assert_eq!(c.token().value(), (i + 1) as f32);
492			p.next();
493		}
494		if n == 0 {
495			p.rewind(c);
496		}
497	}
498	let c = p.next();
499	assert_eq!(c.token(), Kind::Number);
500	assert_eq!(c.token().value(), 20.0);
501	let c = p.next();
502	assert_eq!(c.token(), Kind::Whitespace);
503	let c = p.next();
504	assert_eq!(c.token(), Kind::Number);
505	assert_eq!(c.token().value(), 21.0);
506	let c = p.next();
507	assert_eq!(c.token(), Kind::Eof);
508}