css_lexer/
source_cursor.rs

1use crate::{
2	AssociatedWhitespaceRules, CommentStyle, CowStr, Cursor, Kind, KindSet, QuoteStyle, SourceOffset, Span, ToSpan,
3	Token,
4	syntax::{ParseEscape, is_newline},
5};
6use allocator_api2::{alloc::Allocator, boxed::Box, vec::Vec};
7use std::char::REPLACEMENT_CHARACTER;
8use std::fmt::{Display, Formatter, Result};
9
10/// Wraps [Cursor] with a [str] that represents the underlying character data for this cursor.
11#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
12pub struct SourceCursor<'a> {
13	cursor: Cursor,
14	source: &'a str,
15}
16
17impl<'a> ToSpan for SourceCursor<'a> {
18	fn to_span(&self) -> Span {
19		self.cursor.to_span()
20	}
21}
22
23impl<'a> Display for SourceCursor<'a> {
24	fn fmt(&self, f: &mut Formatter<'_>) -> Result {
25		match self.token().kind() {
26			Kind::Eof => Ok(()),
27			// It is important to manually write out quotes for 2 reasons:
28			//  1. The quote style can be mutated from the source string (such as the case of normalising/switching quotes.
29			//  2. Some strings may not have the closing quote, which should be corrected.
30			Kind::String => match self.token().quote_style() {
31				QuoteStyle::Single => {
32					let inner =
33						&self.source[1..(self.token().len() as usize) - self.token().has_close_quote() as usize];
34					write!(f, "'{inner}'")
35				}
36				QuoteStyle::Double => {
37					let inner =
38						&self.source[1..(self.token().len() as usize) - self.token().has_close_quote() as usize];
39					write!(f, "\"{inner}\"")
40				}
41				// Strings must always be quoted!
42				QuoteStyle::None => unreachable!(),
43			},
44			Kind::Delim
45			| Kind::Colon
46			| Kind::Semicolon
47			| Kind::Comma
48			| Kind::LeftSquare
49			| Kind::LeftParen
50			| Kind::RightSquare
51			| Kind::RightParen
52			| Kind::LeftCurly
53			| Kind::RightCurly => self.token().char().unwrap().fmt(f),
54			_ => f.write_str(self.source),
55		}
56	}
57}
58
59impl<'a> SourceCursor<'a> {
60	pub const SPACE: SourceCursor<'static> = SourceCursor::from(Cursor::new(SourceOffset(0), Token::SPACE), " ");
61	pub const TAB: SourceCursor<'static> = SourceCursor::from(Cursor::new(SourceOffset(0), Token::TAB), "\t");
62	pub const NEWLINE: SourceCursor<'static> = SourceCursor::from(Cursor::new(SourceOffset(0), Token::NEWLINE), "\n");
63
64	#[inline(always)]
65	pub const fn from(cursor: Cursor, source: &'a str) -> Self {
66		debug_assert!(
67			(cursor.len() as usize) == source.len(),
68			"A SourceCursor should be constructed with a source that matches the length of the cursor!"
69		);
70		Self { cursor, source }
71	}
72
73	#[inline(always)]
74	pub const fn cursor(&self) -> Cursor {
75		self.cursor
76	}
77
78	#[inline(always)]
79	pub const fn token(&self) -> Token {
80		self.cursor.token()
81	}
82
83	#[inline(always)]
84	pub const fn source(&self) -> &'a str {
85		self.source
86	}
87
88	pub fn with_quotes(&self, quote_style: QuoteStyle) -> Self {
89		Self::from(self.cursor.with_quotes(quote_style), self.source)
90	}
91
92	pub fn with_associated_whitespace(&self, rules: AssociatedWhitespaceRules) -> Self {
93		Self::from(self.cursor.with_associated_whitespace(rules), self.source)
94	}
95
96	pub fn eq_ignore_ascii_case(&self, other: &str) -> bool {
97		debug_assert!(self.token() != Kind::Delim && self.token() != Kind::Url);
98		debug_assert!(other.to_ascii_lowercase() == other);
99		let start = self.token().leading_len() as usize;
100		let end = self.source.len() - self.token().trailing_len() as usize;
101		if !self.token().contains_escape_chars() {
102			if end - start != other.len() {
103				return false;
104			}
105			if self.token().is_lower_case() {
106				debug_assert!(self.source[start..end].to_ascii_lowercase() == self.source[start..end]);
107				return &self.source[start..end] == other;
108			}
109			return self.source[start..end].eq_ignore_ascii_case(other);
110		}
111		let mut chars = self.source[start..end].chars().peekable();
112		let mut other_chars = other.chars();
113		let mut i = 0;
114		while let Some(c) = chars.next() {
115			let o = other_chars.next();
116			if o.is_none() {
117				return false;
118			}
119			let o = o.unwrap();
120			if c == '\0' {
121				if REPLACEMENT_CHARACTER != o {
122					return false;
123				}
124				i += 1;
125			} else if c == '\\' {
126				// String has special rules
127				// https://drafts.csswg.org/css-syntax-3/#consume-string-token
128				if self.token().kind_bits() == Kind::String as u8 {
129					// When the token is a string, escaped EOF points are not consumed
130					// U+005C REVERSE SOLIDUS (\)
131					//   If the next input code point is EOF, do nothing.
132					//   Otherwise, if the next input code point is a newline, consume it.
133					let c = chars.peek();
134					if let Some(c) = c {
135						if is_newline(*c) {
136							chars.next();
137							if chars.peek() == Some(&'\n') {
138								i += 1;
139							}
140							i += 2;
141							chars = self.source[(start + i)..end].chars().peekable();
142							continue;
143						}
144					} else {
145						break;
146					}
147				}
148				i += 1;
149				let (ch, n) = self.source[(start + i)..].chars().parse_escape_sequence();
150				i += n as usize;
151				chars = self.source[(start + i)..end].chars().peekable();
152				if (ch == '\0' && REPLACEMENT_CHARACTER != o) || ch != o {
153					return false;
154				}
155			} else if c != o {
156				return false;
157			} else {
158				i += c.len_utf8();
159			}
160		}
161		other_chars.next().is_none()
162	}
163
164	/// Parse the cursor's content using any allocator that implements the Allocator trait.
165	pub fn parse<A: Allocator + Clone + 'a>(&self, allocator: A) -> CowStr<'a, A> {
166		debug_assert!(self.token() != Kind::Delim);
167		let start = self.token().leading_len() as usize;
168		let end = self.source.len() - self.token().trailing_len() as usize;
169		if !self.token().contains_escape_chars() {
170			return CowStr::<A>::Borrowed(&self.source[start..end]);
171		}
172		let mut chars = self.source[start..end].chars().peekable();
173		let mut i = 0;
174		let mut vec: Option<Vec<u8, A>> = None;
175		while let Some(c) = chars.next() {
176			if c == '\0' {
177				if vec.is_none() {
178					vec = if i == 0 {
179						Some(Vec::new_in(allocator.clone()))
180					} else {
181						Some({
182							let mut v = Vec::new_in(allocator.clone());
183							v.extend(self.source[start..(start + i)].bytes());
184							v
185						})
186					}
187				}
188				let mut buf = [0; 4];
189				let bytes = REPLACEMENT_CHARACTER.encode_utf8(&mut buf).as_bytes();
190				vec.as_mut().unwrap().extend_from_slice(bytes);
191				i += 1;
192			} else if c == '\\' {
193				if vec.is_none() {
194					vec = if i == 0 {
195						Some(Vec::new_in(allocator.clone()))
196					} else {
197						Some({
198							let mut v = Vec::new_in(allocator.clone());
199							v.extend(self.source[start..(start + i)].bytes());
200							v
201						})
202					}
203				}
204				// String has special rules
205				// https://drafts.csswg.org/css-syntax-3/#consume-string-cursor
206				if self.token().kind_bits() == Kind::String as u8 {
207					// When the token is a string, escaped EOF points are not consumed
208					// U+005C REVERSE SOLIDUS (\)
209					//   If the next input code point is EOF, do nothing.
210					//   Otherwise, if the next input code point is a newline, consume it.
211					let c = chars.peek();
212					if let Some(c) = c {
213						if is_newline(*c) {
214							chars.next();
215							if chars.peek() == Some(&'\n') {
216								i += 1;
217							}
218							i += 2;
219							chars = self.source[(start + i)..end].chars().peekable();
220							continue;
221						}
222					} else {
223						break;
224					}
225				}
226				i += 1;
227				let (ch, n) = self.source[(start + i)..].chars().parse_escape_sequence();
228				let char_to_push = if ch == '\0' { REPLACEMENT_CHARACTER } else { ch };
229				let mut buf = [0; 4];
230				let bytes = char_to_push.encode_utf8(&mut buf).as_bytes();
231				vec.as_mut().unwrap().extend_from_slice(bytes);
232				i += n as usize;
233				chars = self.source[(start + i)..end].chars().peekable();
234			} else {
235				if let Some(bytes) = &mut vec {
236					let mut buf = [0; 4];
237					let char_bytes = c.encode_utf8(&mut buf).as_bytes();
238					bytes.extend_from_slice(char_bytes);
239				}
240				i += c.len_utf8();
241			}
242		}
243		match vec {
244			Some(vec) => {
245				let boxed_slice = vec.into_boxed_slice();
246				// SAFETY: The source is valid UTF-8, so the slice is valid UTF-8
247				unsafe { CowStr::Owned(Box::from_raw_in(Box::into_raw(boxed_slice) as *mut str, allocator)) }
248			}
249			None => CowStr::Borrowed(&self.source[start..start + i]),
250		}
251	}
252
253	/// Parse the cursor's content to ASCII lowercase using any allocator that implements the Allocator trait.
254	pub fn parse_ascii_lower<A: Allocator + Clone + 'a>(&self, allocator: A) -> CowStr<'a, A> {
255		debug_assert!(self.token() != Kind::Delim);
256		let start = self.token().leading_len() as usize;
257		let end = self.source.len() - self.token().trailing_len() as usize;
258		if !self.token().contains_escape_chars() && self.token().is_lower_case() {
259			return CowStr::Borrowed(&self.source[start..end]);
260		}
261		let mut chars = self.source[start..end].chars().peekable();
262		let mut i = 0;
263		let mut vec: Vec<u8, A> = Vec::new_in(allocator.clone());
264		while let Some(c) = chars.next() {
265			if c == '\0' {
266				let mut buf = [0; 4];
267				let bytes = REPLACEMENT_CHARACTER.encode_utf8(&mut buf).as_bytes();
268				vec.extend_from_slice(bytes);
269				i += 1;
270			} else if c == '\\' {
271				// String has special rules
272				// https://drafts.csswg.org/css-syntax-3/#consume-string-cursor
273				if self.token().kind_bits() == Kind::String as u8 {
274					// When the token is a string, escaped EOF points are not consumed
275					// U+005C REVERSE SOLIDUS (\)
276					//   If the next input code point is EOF, do nothing.
277					//   Otherwise, if the next input code point is a newline, consume it.
278					let c = chars.peek();
279					if let Some(c) = c {
280						if is_newline(*c) {
281							chars.next();
282							if chars.peek() == Some(&'\n') {
283								i += 1;
284							}
285							i += 2;
286							chars = self.source[(start + i)..end].chars().peekable();
287							continue;
288						}
289					} else {
290						break;
291					}
292				}
293				i += 1;
294				let (ch, n) = self.source[(start + i)..].chars().parse_escape_sequence();
295				let char_to_push = if ch == '\0' { REPLACEMENT_CHARACTER } else { ch.to_ascii_lowercase() };
296				let mut buf = [0; 4];
297				let bytes = char_to_push.encode_utf8(&mut buf).as_bytes();
298				vec.extend_from_slice(bytes);
299				i += n as usize;
300				chars = self.source[(start + i)..end].chars().peekable();
301			} else {
302				let mut buf = [0; 4];
303				let bytes = c.to_ascii_lowercase().encode_utf8(&mut buf).as_bytes();
304				vec.extend_from_slice(bytes);
305				i += c.len_utf8();
306			}
307		}
308		let boxed_slice = vec.into_boxed_slice();
309		// SAFETY: The source is valid UTF-8, so the slice is valid UTF-8
310		unsafe { CowStr::Owned(Box::from_raw_in(Box::into_raw(boxed_slice) as *mut str, allocator)) }
311	}
312}
313
314impl PartialEq<Kind> for SourceCursor<'_> {
315	fn eq(&self, other: &Kind) -> bool {
316		self.token() == *other
317	}
318}
319
320impl PartialEq<CommentStyle> for SourceCursor<'_> {
321	fn eq(&self, other: &CommentStyle) -> bool {
322		self.token() == *other
323	}
324}
325
326impl From<SourceCursor<'_>> for KindSet {
327	fn from(cursor: SourceCursor<'_>) -> Self {
328		cursor.token().into()
329	}
330}
331
332impl PartialEq<KindSet> for SourceCursor<'_> {
333	fn eq(&self, other: &KindSet) -> bool {
334		self.token() == *other
335	}
336}
337
#[cfg(test)]
mod test {
	use crate::{Cursor, QuoteStyle, SourceCursor, SourceOffset, Token};
	use allocator_api2::alloc::Global;
	use std::fmt::Write;

	/// `parse_ascii_lower` lowercases idents, strings (with or without a closing quote) and url contents.
	#[test]
	fn parse_str_lower() {
		let ident = Cursor::new(SourceOffset(0), Token::new_ident(true, false, false, 0, 3));
		for source in ["FoO", "FOO", "foo"] {
			assert_eq!(SourceCursor::from(ident, source).parse_ascii_lower(Global), "foo");
		}

		let closed_string = Cursor::new(SourceOffset(0), Token::new_string(QuoteStyle::Single, true, false, 5));
		for source in ["'FoO'", "'FOO'"] {
			assert_eq!(SourceCursor::from(closed_string, source).parse_ascii_lower(Global), "foo");
		}

		let open_string = Cursor::new(SourceOffset(0), Token::new_string(QuoteStyle::Single, false, false, 4));
		for source in ["'FoO", "'FOO", "'foo"] {
			assert_eq!(SourceCursor::from(open_string, source).parse_ascii_lower(Global), "foo");
		}

		let plain_url = Cursor::new(SourceOffset(0), Token::new_url(true, false, false, 4, 1, 6));
		for (source, expected) in [("url(a)", "a"), ("url(b)", "b")] {
			assert_eq!(SourceCursor::from(plain_url, source).parse_ascii_lower(Global), expected);
		}

		// One escaped character in the `url(` prefix.
		let one_escape_url = Cursor::new(SourceOffset(0), Token::new_url(true, false, false, 6, 1, 8));
		for (source, expected) in [("\\75rl(A)", "a"), ("u\\52l(B)", "b"), ("ur\\6c(C)", "c")] {
			assert_eq!(SourceCursor::from(one_escape_url, source).parse_ascii_lower(Global), expected);
		}

		// Two escaped characters in the `url(` prefix.
		let two_escape_url = Cursor::new(SourceOffset(0), Token::new_url(true, false, false, 8, 1, 10));
		for (source, expected) in [("\\75\\52l(A)", "a"), ("u\\52\\6c(B)", "b"), ("\\75r\\6c(C)", "c")] {
			assert_eq!(SourceCursor::from(two_escape_url, source).parse_ascii_lower(Global), expected);
		}
	}

	/// `eq_ignore_ascii_case` on lowercase, mixed-case and escaped idents.
	#[test]
	fn eq_ignore_ascii_case() {
		let lower = Cursor::new(SourceOffset(0), Token::new_ident(false, false, false, 0, 3));
		assert!(SourceCursor::from(lower, "foo").eq_ignore_ascii_case("foo"));

		let mixed = Cursor::new(SourceOffset(0), Token::new_ident(true, false, false, 0, 3));
		for source in ["FoO", "FOO"] {
			assert!(SourceCursor::from(mixed, source).eq_ignore_ascii_case("foo"));
		}

		// The same mismatches must be rejected whichever way the token is flagged.
		for cursor in [lower, mixed] {
			for (source, other) in [("foo", "bar"), ("fo ", "foo"), ("foo", "fooo"), ("foo", "ғоо")] {
				assert!(!SourceCursor::from(cursor, source).eq_ignore_ascii_case(other));
			}
		}

		// (escape flag, token length, source) - every one of these spells "bar".
		for (has_escapes, len, source) in [(false, 3, "bar"), (true, 3, "bar"), (true, 5, "b\\61r"), (true, 7, "b\\61\\72")]
		{
			let cursor = Cursor::new(SourceOffset(3), Token::new_ident(false, false, has_escapes, 0, len));
			assert!(SourceCursor::from(cursor, source).eq_ignore_ascii_case("bar"));
		}
	}

	/// Displaying a string uses the token's quote style and always closes the string.
	#[test]
	fn write_str() {
		fn render(cursor: Cursor, source: &str) -> String {
			let mut out = String::new();
			write!(out, "{}", SourceCursor::from(cursor, source)).unwrap();
			out
		}

		let closed_double = Cursor::new(SourceOffset(0), Token::new_string(QuoteStyle::Double, true, false, 5));
		let rendered = render(closed_double, "'foo'");
		assert_eq!(closed_double.token().quote_style(), QuoteStyle::Double);
		assert_eq!(rendered, "\"foo\"");

		let open_double = Cursor::new(SourceOffset(0), Token::new_string(QuoteStyle::Double, false, false, 4));
		let rendered = render(open_double, "'foo");
		assert_eq!(open_double.token().quote_style(), QuoteStyle::Double);
		assert_eq!(rendered, "\"foo\"");

		let open_single = Cursor::new(SourceOffset(0), Token::new_string(QuoteStyle::Single, false, false, 4));
		let rendered = render(open_single, "\"foo");
		assert_eq!(open_single.token().quote_style(), QuoteStyle::Single);
		assert_eq!(rendered, "'foo'");
	}

	/// A `&Bump` can be passed wherever an allocator is expected.
	#[test]
	#[cfg(feature = "bumpalo")]
	fn test_bumpalo_compatibility() {
		use bumpalo::Bump;

		let bump = Bump::new();
		let plain = Cursor::new(SourceOffset(0), Token::new_ident(true, false, false, 0, 3));

		// Comparing the returned value directly against a `&str`.
		assert_eq!(SourceCursor::from(plain, "FoO").parse(&bump), "FoO");
		assert_eq!(SourceCursor::from(plain, "FoO").parse_ascii_lower(&bump), "foo");

		// Comparing through an explicit deref to `str`.
		assert_eq!(&*SourceCursor::from(plain, "FoO").parse(&bump), "FoO");
		assert_eq!(&*SourceCursor::from(plain, "FoO").parse_ascii_lower(&bump), "foo");

		// Escape sequences force an allocation inside the bump arena.
		let escaped = Cursor::new(SourceOffset(0), Token::new_ident(false, false, true, 0, 7));
		assert_eq!(SourceCursor::from(escaped, "b\\61\\72").parse(&bump), "bar");
		assert_eq!(&*SourceCursor::from(escaped, "b\\61\\72").parse(&bump), "bar");
	}
}