css_lexer/lib.rs
#![deny(warnings)]
//! An implementation of the [CSS Syntax Level 3 tokenization algorithm][1]. It is intended as a low-level building
//! block for parsers for CSS or CSS-alike languages (for example SASS).
//!
//! This crate provides the [Lexer] struct, which borrows a `&str` and can incrementally produce [Tokens][Token]. The
//! encoding of the `&str` is assumed to be UTF-8.
//!
//! The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
//! from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
//! spec compliant.
//!
//! [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`), but they have a [Kind] which can be used to
//! determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
//! provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
//! [Cursor], which consults the original `&str` to get the character data. This design allows Tokens to live on the
//! stack, avoiding heap allocation, as they are always a `size_of` `8`. Likewise [Cursors][Cursor] are always a
//! `size_of` `12`.
//!
//! # Limitations
//!
//! The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
//! [Cursor] small. It's very unlikely the average document will run into these limitations, but they're listed here
//! for completeness:
//!
//! - Documents are limited to ~4GB in size. [SourceOffset] is a [u32], so it cannot represent larger offsets.
//! Attempting to lex larger documents is considered [undefined behaviour][2].
//!
//! - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32], so it cannot represent
//! larger lengths. If the lexer encounters a token with a larger length this is considered [undefined behaviour][2].
//!
//! - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
//! 17 million `0`s is considered [undefined behaviour][2]. This is not the same as the number value, which is an
//! [f32]. (Note that the CSS spec dictates numbers are [f32]; CSS does not have larger numbers.)
//!
//! - Dimension [Tokens][Token] are limited to 4,096 numeric characters and 4,096 ident characters in length. For
//! example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
//!
//! # General usage
//!
//! A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
//! [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
//! `&str` which it will reference to produce Tokens.
//!
//! Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
//! newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
//!
//! # Example
//!
//! ```
//! use css_lexer::*;
//! let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
//! assert_eq!(lexer.offset(), 0);
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Ident);
//!     let cursor = token.with_cursor(SourceOffset(0));
//!     assert_eq!(cursor.str_slice(lexer.source()), "width");
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Colon);
//!     assert_eq!(token, ':');
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Whitespace);
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Dimension);
//! }
//! ```
//!
//! [1]: https://drafts.csswg.org/css-syntax/#tokenization
//! [2]: https://en.wikipedia.org/wiki/Undefined_behavior

mod associated_whitespace_rules;
mod atom_set;
mod comment_style;
mod constants;
mod cow;
mod cursor;
#[cfg(feature = "dynamic-atoms")]
mod dyn_atom_registry;
mod empty_atom_set;
mod feature;
mod kind;
mod kindset;
mod pairwise;
mod private;
mod quote_style;
mod small_str_buf;
mod source_cursor;
mod source_offset;
mod span;
mod syntax;
mod token;
mod whitespace_style;

/// A convenience alias for the most common use case: a plain [Lexer].
pub type BasicLexer<'a> = Lexer<'a>;

pub use associated_whitespace_rules::AssociatedWhitespaceRules;
pub use atom_set::{AtomSet, DynAtomSet};
pub use comment_style::CommentStyle;
pub use cow::CowStr;
pub use cursor::Cursor;
#[cfg(feature = "dynamic-atoms")]
pub use dyn_atom_registry::{Atom, DynAtomRegistry, RegisteredAtomSet};
pub use empty_atom_set::EmptyAtomSet;
pub use feature::Feature;
pub use kind::Kind;
pub use kindset::KindSet;
pub use pairwise::PairWise;
pub use quote_style::QuoteStyle;
pub use source_cursor::SourceCursor;
pub use source_offset::SourceOffset;
pub use span::{Span, ToSpan};
pub use token::Token;
pub use whitespace_style::Whitespace;

/// The [Lexer] struct - the core of the library - borrows a `&str` and can incrementally produce [Tokens][Token].
///
/// The encoding of the `&str` is assumed to be UTF-8. Other sources should be re-encoded into UTF-8 prior to ingesting
/// into the [Lexer].
///
/// The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
/// from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
/// spec compliant.
///
/// [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`), but they have a [Kind] which can be used to
/// determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
/// provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
/// [Cursor], which consults the original `&str` to get the character data. This design allows Tokens to live on the
/// stack, avoiding heap allocation, as they are always a `size_of` `8`. Likewise [Cursors][Cursor] are always a
/// `size_of` `12`.
///
/// # Limitations
///
/// The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
/// [Cursor] small.
///
/// - Documents are limited to ~4GB in size. [SourceOffset] is a [u32], so it cannot represent larger offsets.
/// Attempting to lex larger documents is considered [undefined behaviour][2].
///
/// - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32], so it cannot represent
/// larger lengths. If the lexer encounters a token with a larger length this is considered [undefined behaviour][2].
///
/// - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
/// 17 million `0`s is considered [undefined behaviour][2]. This is not the same as the number value, which is an
/// [f32]. (Note that the CSS spec dictates numbers are [f32]; CSS does not have larger numbers.)
///
/// - Dimension [Tokens][Token] are limited to 4,096 numeric characters and 4,096 ident characters in length. For
/// example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
///
/// # General usage
///
/// A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
/// [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
/// `&str` which it will reference to produce Tokens.
///
/// Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
/// newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
///
/// # Example
///
/// ```
/// use css_lexer::*;
/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
/// assert_eq!(lexer.offset(), 0);
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Ident);
///     let cursor = token.with_cursor(SourceOffset(0));
///     assert_eq!(cursor.str_slice(lexer.source()), "width");
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Colon);
///     assert_eq!(token, ':');
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Whitespace);
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Dimension);
/// }
/// ```
///
/// [1]: https://drafts.csswg.org/css-syntax/#tokenization
/// [2]: https://en.wikipedia.org/wiki/Undefined_behavior
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    source: &'a str,
    offset: SourceOffset,
    token: Token,
    features: Feature,
    atoms: &'static dyn DynAtomSet,
}

impl<'a> Lexer<'a> {
    /// Creates a new, fully spec-compliant [Lexer] which will lex the given `source`.
    #[inline]
    pub fn new(atoms: &'static dyn DynAtomSet, source: &'a str) -> Self {
        Self { source, offset: SourceOffset::default(), token: Token::default(), features: Feature::default(), atoms }
    }

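    /// Creates a [Lexer] with additional [Features][Feature] enabled.
    ///
    /// A minimal sketch; `Feature::default()` enables no extra features, so this behaves like [Lexer::new()]. Pass
    /// specific [Feature] flags to opt into non-spec-compliant lexing.
    ///
    /// ```
    /// use css_lexer::*;
    /// let lexer = Lexer::new_with_features(&EmptyAtomSet::ATOMS, "width: 1px", Feature::default());
    /// assert_eq!(lexer.offset(), 0);
    /// ```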
    #[inline]
    pub fn new_with_features(atoms: &'static dyn DynAtomSet, source: &'a str, features: Feature) -> Self {
        Self { source, features, offset: SourceOffset::default(), token: Token::default(), atoms }
    }

    /// Returns the original `&str` that this [Lexer] was constructed with.
    #[inline(always)]
    pub fn source(&self) -> &'a str {
        self.source
    }

    /// Returns `true` if the lexer has consumed the entire source, i.e. it is past the last token.
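    ///
    /// A small sketch of using this to detect the end of input:
    ///
    /// ```
    /// use css_lexer::*;
    /// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width");
    /// assert!(!lexer.at_end());
    /// lexer.advance(); // consume the single Ident token
    /// assert!(lexer.at_end());
    /// ```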
    pub fn at_end(&self) -> bool {
        self.offset.0 as usize == self.source.len()
    }

    /// Current position in the file.
    #[inline(always)]
    pub const fn offset(&self) -> SourceOffset {
        self.offset
    }

    /// Captures the current position and [Token] as a [Cursor], which can later be given to [Lexer::rewind()]
    /// or [Lexer::hop()].
    #[inline(always)]
    pub fn checkpoint(&self) -> Cursor {
        Cursor::new(self.offset(), self.token)
    }

    /// Rewinds the lexer back to the given checkpoint.
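    ///
    /// A minimal sketch of speculative parsing: record a [Cursor] with [Lexer::checkpoint()], advance, then
    /// backtrack by rewinding.
    ///
    /// ```
    /// use css_lexer::*;
    /// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
    /// let checkpoint = lexer.checkpoint();
    /// assert_eq!(lexer.advance(), Kind::Ident);
    /// lexer.rewind(checkpoint);
    /// assert_eq!(lexer.offset(), 0);
    /// assert_eq!(lexer.advance(), Kind::Ident);
    /// ```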
    pub fn rewind(&mut self, cursor: Cursor) {
        debug_assert!(cursor.offset() <= self.offset());
        self.offset = cursor.offset();
        self.token = cursor.token();
    }

    /// Advances the lexer to the end of the given token.
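    ///
    /// A sketch of skipping ahead, assuming the [Cursor] was produced by [Lexer::checkpoint()] on a lexer that had
    /// already advanced further through the same source:
    ///
    /// ```
    /// use css_lexer::*;
    /// let mut ahead = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
    /// ahead.advance(); // Ident
    /// ahead.advance(); // Colon
    /// let cursor = ahead.checkpoint();
    ///
    /// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "width: 1px");
    /// lexer.hop(cursor);
    /// assert_eq!(lexer.advance(), Kind::Whitespace);
    /// ```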
    pub fn hop(&mut self, cursor: Cursor) {
        debug_assert!(cursor.offset().0 as usize >= (self.offset.0 + self.token.len()) as usize);
        self.offset = cursor.offset();
        self.token = cursor.token();
    }

    /// Moves the lexer one token forward, returning that token.
    pub fn advance(&mut self) -> Token {
        self.token = self.read_next_token(self.offset.0);
        self.offset.0 += self.token.len();
        self.token
    }
}
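/// Iterating over a [Lexer] yields each [Token] wrapped in a [Cursor], ending once the source is exhausted.
///
/// A brief sketch, mirroring the iterator tests below:
///
/// ```
/// use css_lexer::*;
/// let lexer = Lexer::new(&EmptyAtomSet::ATOMS, "foo bar");
/// let cursors: Vec<Cursor> = lexer.collect();
/// assert_eq!(cursors.len(), 3); // ident, whitespace, ident
/// assert_eq!(cursors[0], Kind::Ident);
/// ```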
impl<'a> Iterator for Lexer<'a> {
    type Item = Cursor;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset.0 as usize >= self.source.len() {
            return None;
        }
        let offset = self.offset;
        let token = self.advance();
        if token.kind() == Kind::Eof { None } else { Some(token.with_cursor(offset)) }
    }
}

#[test]
fn size_test() {
    assert_eq!(::std::mem::size_of::<Lexer>(), 48);
}

#[cfg(test)]
mod iterator_tests {
    use super::*;

    #[test]
    fn test_lexer_iterator_basic() {
        let lexer = Lexer::new(&EmptyAtomSet::ATOMS, "foo bar");
        let cursors: Vec<_> = lexer.collect();
        assert_eq!(cursors.len(), 3); // ident, whitespace, ident
        assert_eq!(cursors[0], Kind::Ident);
        assert_eq!(cursors[1], Kind::Whitespace);
        assert_eq!(cursors[2], Kind::Ident);
    }

    #[test]
    fn test_lexer_iterator_empty() {
        let lexer = Lexer::new(&EmptyAtomSet::ATOMS, "");
        let cursors: Vec<_> = lexer.collect();
        assert_eq!(cursors.len(), 0);
    }

    #[test]
    fn test_lexer_iterator_equivalence() {
        let source = "width: 1px";

        let lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);
        let cursors: Vec<_> = lexer.collect();

        let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);
        let mut manual_cursors = Vec::new();
        while !lexer.at_end() {
            let offset = lexer.offset();
            let token = lexer.advance();
            if token.kind() != Kind::Eof {
                manual_cursors.push(token.with_cursor(offset));
            }
        }

        assert_eq!(cursors.len(), manual_cursors.len());
        for (c1, c2) in cursors.iter().zip(manual_cursors.iter()) {
            assert_eq!(c1.token().kind(), c2.token().kind());
            assert_eq!(c1.offset(), c2.offset());
        }
    }

    #[test]
    fn test_lexer_iterator_clone() {
        let source = "foo bar baz";
        let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, source);

        let first = lexer.next();
        assert!(first.is_some());
        assert_eq!(first.unwrap(), Kind::Ident);

        let lexer_clone = lexer.clone();

        let cursors1: Vec<_> = lexer.collect();
        let cursors2: Vec<_> = lexer_clone.collect();

        assert_eq!(cursors1.len(), cursors2.len());
        for (c1, c2) in cursors1.iter().zip(cursors2.iter()) {
            assert_eq!(c1.token().kind(), c2.token().kind());
            assert_eq!(c1.offset(), c2.offset());
        }
    }
}
339}