css_lexer/lib.rs
#![deny(warnings)]
//! An implementation of the [CSS Syntax Level 3 tokenization algorithm][1]. It is intended as a low-level building
//! block for building parsers for CSS or CSS-alike languages (for example SASS).
//!
//! This crate provides the [Lexer] struct, which borrows `&str` and can incrementally produce [Tokens][Token]. The
//! encoding of the `&str` is assumed to be UTF-8.
//!
//! The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
//! from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
//! spec compliant.
//!
//! [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`); but they have a [Kind] which can be used to
//! determine their type. Tokens store neither the underlying character data nor their offsets; they just provide
//! "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
//! [Cursor], which consults the original `&str` to get the character data. This design allows [Tokens][Token] to live
//! on the stack, avoiding heap allocation, as their `size_of` is always `8`. Likewise, a [Cursor's][Cursor] `size_of`
//! is always `12`.
//!
//! # Limitations
//!
//! The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
//! [Cursor] small. It's very unlikely the average document will run into these limitations, but they're listed here
//! for completeness:
//!
//! - Documents are limited to ~4GB in size. [SourceOffset] is a [u32] so cannot represent larger offsets. Attempting to
//!   lex larger documents is considered [undefined behaviour][2].
//!
//! - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32] so cannot represent larger
//!   lengths. If the lexer encounters a token with a larger length, this is considered [undefined behaviour][2].
//!
//! - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
//!   17 million `0`s is considered [undefined behaviour][2]. This limit applies to the character length, not the number
//!   value, which is an [f32]. (Please note that the CSS spec dictates that numbers are [f32]; CSS does not have larger
//!   numbers.)
//!
//! - Dimension [Tokens][Token] are limited to 4,096 numeric characters in length and 4,096 ident characters in length.
//!   For example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
//!
//! # General usage
//!
//! A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
//! [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
//! `&str` which it will reference to produce [Tokens][Token].
//!
//! Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
//! newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
//!
//! # Example
//!
//! ```
//! use css_lexer::*;
//! let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
//! assert_eq!(lexer.offset(), 0);
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Ident);
//!     let cursor = token.with_cursor(SourceOffset(0));
//!     assert_eq!(cursor.str_slice(lexer.source()), "width");
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Colon);
//!     assert_eq!(token, ':');
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Whitespace);
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Dimension);
//! }
//! ```
//!
//! [1]: https://drafts.csswg.org/css-syntax/#tokenization
//! [2]: https://en.wikipedia.org/wiki/Undefined_behavior

mod associated_whitespace_rules;
mod atom_set;
mod comment_style;
mod constants;
mod cow;
mod cursor;
mod empty_atom_set;
mod feature;
mod kind;
mod kindset;
mod pairwise;
mod private;
mod quote_style;
mod source_cursor;
mod source_offset;
mod span;
mod syntax;
mod token;
mod whitespace_style;

/// A convenience alias for the most common use case: a plain [Lexer].
pub type BasicLexer<'a> = Lexer<'a>;

pub use associated_whitespace_rules::AssociatedWhitespaceRules;
pub use atom_set::{AtomSet, DynAtomSet};
pub use comment_style::CommentStyle;
pub use cow::CowStr;
pub use cursor::Cursor;
pub use empty_atom_set::EmptyAtomSet;
pub use feature::Feature;
pub use kind::Kind;
pub use kindset::KindSet;
pub use pairwise::PairWise;
pub use quote_style::QuoteStyle;
pub use source_cursor::SourceCursor;
pub use source_offset::SourceOffset;
pub use span::{Span, ToSpan};
pub use token::Token;
pub use whitespace_style::Whitespace;

/// The [Lexer] struct, the core of the library, borrows `&str` and can incrementally produce [Tokens][Token].
///
/// The encoding of the `&str` is assumed to be UTF-8. Other sources should be re-encoded into UTF-8 prior to ingesting
/// into the [Lexer].
///
/// The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
/// from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
/// spec compliant.
///
/// [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`); but they have a [Kind] which can be used to
/// determine their type. Tokens store neither the underlying character data nor their offsets; they just provide
/// "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
/// [Cursor], which consults the original `&str` to get the character data. This design allows [Tokens][Token] to live
/// on the stack, avoiding heap allocation, as their `size_of` is always `8`. Likewise, a [Cursor's][Cursor] `size_of`
/// is always `12`.
///
/// # Limitations
///
/// The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
/// [Cursor] small.
///
/// - Documents are limited to ~4GB in size. [SourceOffset] is a [u32] so cannot represent larger offsets. Attempting to
///   lex larger documents is considered [undefined behaviour][2].
///
/// - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32] so cannot represent larger
///   lengths. If the lexer encounters a token with a larger length, this is considered [undefined behaviour][2].
///
/// - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
///   17 million `0`s is considered [undefined behaviour][2]. This limit applies to the character length, not the number
///   value, which is an [f32]. (Please note that the CSS spec dictates that numbers are [f32]; CSS does not have larger
///   numbers.)
///
/// - Dimension [Tokens][Token] are limited to 4,096 numeric characters in length and 4,096 ident characters in length.
///   For example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
///
/// # General usage
///
/// A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
/// [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
/// `&str` which it will reference to produce [Tokens][Token].
///
/// Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
/// newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
///
/// # Example
///
/// ```
/// use css_lexer::*;
/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
/// assert_eq!(lexer.offset(), 0);
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Ident);
///     let cursor = token.with_cursor(SourceOffset(0));
///     assert_eq!(cursor.str_slice(lexer.source()), "width");
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Colon);
///     assert_eq!(token, ':');
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Whitespace);
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Dimension);
/// }
/// ```
///
/// [1]: https://drafts.csswg.org/css-syntax/#tokenization
/// [2]: https://en.wikipedia.org/wiki/Undefined_behavior
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
	source: &'a str,
	offset: SourceOffset,
	token: Token,
	features: Feature,
	atoms: &'static dyn DynAtomSet,
}

impl<'a> Lexer<'a> {
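	/// Creates a new spec-compliant [Lexer] which borrows the given `source`.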
	#[inline]
	pub fn new(atoms: &'static dyn DynAtomSet, source: &'a str) -> Self {
		Self { source, offset: SourceOffset::default(), token: Token::default(), features: Feature::default(), atoms }
	}

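	/// Creates a new [Lexer] with the given [Features][Feature] enabled, which may lex tokens in ways that diverge
	/// from the CSS specification.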
	#[inline]
	pub fn new_with_features(atoms: &'static dyn DynAtomSet, source: &'a str, features: Feature) -> Self {
		Self { source, features, offset: SourceOffset::default(), token: Token::default(), atoms }
	}

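	/// Returns the original `&str` the [Lexer] was constructed with.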
	#[inline(always)]
	pub fn source(&self) -> &'a str {
		self.source
	}

	/// Returns `true` once the lexer has consumed the entire source.
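	///
	/// A minimal sketch of the expected behaviour:
	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width");
	/// assert!(!lexer.at_end());
	/// lexer.advance(); // consume the sole `width` ident
	/// assert!(lexer.at_end());
	/// ```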
	pub fn at_end(&self) -> bool {
		self.offset.0 as usize == self.source.len()
	}

	/// The lexer's current position in the source.
	#[inline(always)]
	pub const fn offset(&self) -> SourceOffset {
		self.offset
	}

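	/// Returns a [Cursor] capturing the lexer's current position and token, which can later be passed to
	/// [Lexer::rewind()].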
	#[inline(always)]
	pub fn checkpoint(&self) -> Cursor {
		Cursor::new(self.offset(), self.token)
	}

	/// Rewinds the lexer back to the given checkpoint.
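	///
	/// A minimal sketch, assuming the same setup as the crate-level example:
	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
	/// let checkpoint = lexer.checkpoint();
	/// lexer.advance(); // consume `width`
	/// lexer.rewind(checkpoint);
	/// assert_eq!(lexer.offset(), 0);
	/// ```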
	pub fn rewind(&mut self, cursor: Cursor) {
		debug_assert!(cursor.offset() <= self.offset());
		self.offset = cursor.offset();
		self.token = cursor.token();
	}

	/// Advances the lexer to the end of the given token.
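	///
	/// A minimal sketch, using a cloned [Lexer] as a lookahead to produce a [Cursor] ahead of the current position:
	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
	/// let mut lookahead = lexer.clone();
	/// lookahead.advance(); // `width`
	/// lookahead.advance(); // `:`
	/// lexer.hop(lookahead.checkpoint()); // skip straight past `width:`
	/// assert_eq!(lexer.advance(), Kind::Whitespace);
	/// ```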
	pub fn hop(&mut self, cursor: Cursor) {
		debug_assert!(cursor.offset().0 as usize >= (self.offset.0 + self.token.len()) as usize);
		self.offset = cursor.offset();
		self.token = cursor.token();
	}

	/// Moves the lexer one token forward, returning that token.
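	///
	/// Once the end of the source is reached, [Lexer::advance()] will repeatedly return [Token::EOF], so a sketch of
	/// a consume-everything loop looks like:
	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
	/// while lexer.advance() != Kind::Eof { /* handle each token */ }
	/// assert!(lexer.at_end());
	/// ```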
	pub fn advance(&mut self) -> Token {
		self.token = self.read_next_token(self.offset.0);
		self.offset.0 += self.token.len();
		self.token
	}
}

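// Iterating a `Lexer` yields `Cursor`s directly; iteration stops before the EOF token, so an empty source yields
// nothing (see `iterator_tests` below).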
impl<'a> Iterator for Lexer<'a> {
	type Item = Cursor;

	#[inline]
	fn next(&mut self) -> Option<Self::Item> {
		if self.offset.0 as usize >= self.source.len() {
			return None;
		}
		let offset = self.offset;
		let token = self.advance();
		if token.kind() == Kind::Eof { None } else { Some(token.with_cursor(offset)) }
	}
}

#[test]
fn size_test() {
	assert_eq!(::std::mem::size_of::<Lexer>(), 48);
}

#[cfg(test)]
mod iterator_tests {
	use super::*;

	#[test]
	fn test_lexer_iterator_basic() {
		let lexer = Lexer::new(&EmptyAtomSet::ATOMS, "foo bar");
		let cursors: Vec<_> = lexer.collect();
		assert_eq!(cursors.len(), 3); // ident, whitespace, ident
		assert_eq!(cursors[0], Kind::Ident);
		assert_eq!(cursors[1], Kind::Whitespace);
		assert_eq!(cursors[2], Kind::Ident);
	}

	#[test]
	fn test_lexer_iterator_empty() {
		let lexer = Lexer::new(&EmptyAtomSet::ATOMS, "");
		let cursors: Vec<_> = lexer.collect();
		assert_eq!(cursors.len(), 0);
	}

	#[test]
	fn test_lexer_iterator_equivalence() {
		let source = "width: 1px";

		let lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);
		let cursors: Vec<_> = lexer.collect();

		let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);
		let mut manual_cursors = Vec::new();
		while !lexer.at_end() {
			let offset = lexer.offset();
			let token = lexer.advance();
			if token.kind() != Kind::Eof {
				manual_cursors.push(token.with_cursor(offset));
			}
		}

		assert_eq!(cursors.len(), manual_cursors.len());
		for (c1, c2) in cursors.iter().zip(manual_cursors.iter()) {
			assert_eq!(c1.token().kind(), c2.token().kind());
			assert_eq!(c1.offset(), c2.offset());
		}
	}

	#[test]
	fn test_lexer_iterator_clone() {
		let source = "foo bar baz";
		let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);

		let first = lexer.next();
		assert!(first.is_some());
		assert_eq!(first.unwrap(), Kind::Ident);

		let lexer_clone = lexer.clone();

		let cursors1: Vec<_> = lexer.collect();
		let cursors2: Vec<_> = lexer_clone.collect();

		assert_eq!(cursors1.len(), cursors2.len());
		for (c1, c2) in cursors1.iter().zip(cursors2.iter()) {
			assert_eq!(c1.token().kind(), c2.token().kind());
			assert_eq!(c1.offset(), c2.offset());
		}
	}
}
334}