// css_lexer/lib.rs

#![deny(warnings)]
//! An implementation of the [CSS Syntax Level 3 tokenization algorithm][1]. It is intended as a low-level building
//! block for building parsers for CSS or CSS-alike languages (for example SASS).
//!
//! This crate provides the [Lexer] struct, which borrows `&str` and can incrementally produce [Tokens][Token]. The
//! encoding of the `&str` is assumed to be UTF-8.
//!
//! The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
//! from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
//! spec compliant.
//!
//! [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`), but they have a [Kind] which can be used to
//! determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
//! provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
//! [Cursor], which consults the original `&str` to get the character data. This design allows Tokens to live on the
//! stack, avoiding heap allocation, as they are always a `size_of` of `8`. Likewise [Cursors][Cursor] are always a
//! `size_of` of `12`.
//!
//! # Limitations
//!
//! The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
//! [Cursor] small. It's very unlikely the average document will run into these limitations, but they're listed here
//! for completeness:
//!
//! - Documents are limited to ~4GB in size. [SourceOffset] is a [u32], so it cannot represent larger offsets.
//!   Attempting to lex larger documents is considered [undefined behaviour][2].
//!
//! - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32], so it cannot represent
//!   larger lengths. If the lexer encounters a token with a larger length this is considered [undefined behaviour][2].
//!
//! - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
//!   17 million `0`s is considered [undefined behaviour][2]. This is a limit on the character length, not the number
//!   value, which is an [f32]. (Note that the CSS specification dictates numbers are [f32]; CSS does not have larger
//!   numbers.)
//!
//! - Dimension [Tokens][Token] are limited to 4,096 numeric characters and 4,096 ident characters in length. For
//!   example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
//!
//! # General usage
//!
//! A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
//! [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
//! `&str` which it will reference to produce Tokens.
//!
//! Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
//! newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
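//!
//! For example, the following sketch (using only the APIs shown in this crate) drives the [Lexer] over an entire
//! document, using [Lexer::at_end()] to stop once the source is exhausted:
//!
//! ```
//! use css_lexer::*;
//! let mut lexer = Lexer::new("body { width: 1px }");
//! while !lexer.at_end() {
//!     let _token = lexer.advance();
//!     // ... hand each token to a parser here ...
//! }
//! ```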
//!
//! # Example
//!
//! ```
//! use css_lexer::*;
//! let mut lexer = Lexer::new("width: 1px");
//! assert_eq!(lexer.offset(), 0);
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Ident);
//!     let cursor = token.with_cursor(SourceOffset(0));
//!     assert_eq!(cursor.str_slice(lexer.source()), "width");
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Colon);
//!     assert_eq!(token, ':');
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Whitespace);
//! }
//! {
//!     let token = lexer.advance();
//!     assert_eq!(token, Kind::Dimension);
//!     assert_eq!(token.dimension_unit(), DimensionUnit::Px);
//! }
//! ```
//!
//! [1]: https://drafts.csswg.org/css-syntax/#tokenization
//! [2]: https://en.wikipedia.org/wiki/Undefined_behavior

mod associated_whitespace_rules;
mod comment_style;
mod constants;
mod cursor;
mod dimension_unit;
mod feature;
mod kind;
mod kindset;
mod pairwise;
mod private;
mod quote_style;
mod source_cursor;
mod source_offset;
mod span;
mod syntax;
mod token;
mod whitespace_style;
pub use associated_whitespace_rules::AssociatedWhitespaceRules;
pub use comment_style::CommentStyle;
pub use cursor::Cursor;
pub use dimension_unit::DimensionUnit;
pub use feature::Feature;
pub use kind::Kind;
pub use kindset::KindSet;
pub use pairwise::PairWise;
pub use quote_style::QuoteStyle;
pub use source_cursor::SourceCursor;
pub use source_offset::SourceOffset;
pub use span::{Span, ToSpan};
pub use token::Token;
pub use whitespace_style::Whitespace;

/// The [Lexer] struct - the core of the library - borrows `&str` and can incrementally produce [Tokens][Token].
///
/// The encoding of the `&str` is assumed to be UTF-8. Other sources should be re-encoded into UTF-8 prior to being
/// ingested into the [Lexer].
///
/// The [Lexer] _may_ be configured with additional [Features][Feature] to allow for lexing tokens in ways which diverge
/// from the CSS specification (such as tokenizing comments using `//`). With no additional features this lexer is fully
/// spec compliant.
///
/// [Tokens][Token] are _untyped_ (there are no super-classes like `Ident`), but they have a [Kind] which can be used to
/// determine their type. Tokens do not store the underlying character data, nor do they store their offsets. They just
/// provide "facts" about the underlying data. In order to re-build a string, each [Token] will need to be wrapped in a
/// [Cursor], which consults the original `&str` to get the character data. This design allows Tokens to live on the
/// stack, avoiding heap allocation, as they are always a `size_of` of `8`. Likewise [Cursors][Cursor] are always a
/// `size_of` of `12`.
///
/// # Limitations
///
/// The [Lexer] has limitations around document sizes and token sizes, in order to keep [Token], [SourceOffset] and
/// [Cursor] small.
///
/// - Documents are limited to ~4GB in size. [SourceOffset] is a [u32], so it cannot represent larger offsets.
///   Attempting to lex larger documents is considered [undefined behaviour][2].
///
/// - [Tokens][Token] are limited to ~4GB in length. A [Token's][Token] length is a [u32], so it cannot represent
///   larger lengths. If the lexer encounters a token with a larger length this is considered [undefined behaviour][2].
///
/// - Number [Tokens][Token] are limited to 16,777,216 characters in length. For example, encountering a number with
///   17 million `0`s is considered [undefined behaviour][2]. This is a limit on the character length, not the number
///   value, which is an [f32]. (Note that the CSS specification dictates numbers are [f32]; CSS does not have larger
///   numbers.)
///
/// - Dimension [Tokens][Token] are limited to 4,096 numeric characters and 4,096 ident characters in length. For
///   example, encountering a dimension with 4,097 `0`s is considered [undefined behaviour][2].
///
/// # General usage
///
/// A parser can be implemented on top of the [Lexer] by instantiating a [Lexer] with [Lexer::new()], or with
/// [Lexer::new_with_features()] if you wish to opt into non-spec-compliant features. The [Lexer] needs to be given a
/// `&str` which it will reference to produce Tokens.
///
/// Repeatedly calling [Lexer::advance()] will move the Lexer's internal position one [Token] forward and return the
/// newly lexed [Token]. Once the end of the `&str` is reached, [Lexer::advance()] will repeatedly return [Token::EOF].
///
/// # Example
///
/// ```
/// use css_lexer::*;
/// let mut lexer = Lexer::new("width: 1px");
/// assert_eq!(lexer.offset(), 0);
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Ident);
///     let cursor = token.with_cursor(SourceOffset(0));
///     assert_eq!(cursor.str_slice(lexer.source()), "width");
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Colon);
///     assert_eq!(token, ':');
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Whitespace);
/// }
/// {
///     let token = lexer.advance();
///     assert_eq!(token, Kind::Dimension);
///     assert_eq!(token.dimension_unit(), DimensionUnit::Px);
/// }
/// ```
///
/// [1]: https://drafts.csswg.org/css-syntax/#tokenization
/// [2]: https://en.wikipedia.org/wiki/Undefined_behavior
#[derive(Default, Debug, Clone, PartialEq)]
pub struct Lexer<'a> {
	source: &'a str,
	offset: SourceOffset,
	token: Token,
	features: Feature,
}

impl<'a> Lexer<'a> {
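	/// Creates a new, fully spec-compliant [Lexer] over the given `&str` source.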
	#[inline]
	pub fn new(source: &'a str) -> Self {
		Self { source, ..Default::default() }
	}

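	/// Creates a [Lexer] over the given `&str` source with the given [Features][Feature] enabled.
	///
	/// A sketch of usage; the flag name `SingleLineComments` below is hypothetical - the real flags are defined on
	/// [Feature]:
	///
	/// ```ignore
	/// use css_lexer::*;
	/// // `SingleLineComments` is a hypothetical flag name, for illustration only.
	/// let lexer = Lexer::new_with_features("// not spec-compliant css", Feature::SingleLineComments);
	/// ```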
	#[inline]
	pub fn new_with_features(source: &'a str, features: Feature) -> Self {
		Self { source, features, ..Default::default() }
	}

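	/// Returns the `&str` source this [Lexer] was constructed with.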
	#[inline(always)]
	pub fn source(&self) -> &'a str {
		self.source
	}

	/// Returns `true` if the lexer has consumed the entire source, i.e. it has lexed the last token.
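	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new("width");
	/// assert!(!lexer.at_end());
	/// lexer.advance();
	/// assert!(lexer.at_end());
	/// ```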
	pub fn at_end(&self) -> bool {
		self.offset.0 as usize == self.source.len()
	}

	/// The lexer's current position in the source file, as a [SourceOffset].
	#[inline(always)]
	pub const fn offset(&self) -> SourceOffset {
		self.offset
	}

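	/// Captures the lexer's current position and [Token] as a [Cursor], which can later be passed to
	/// [Lexer::rewind()] to restore this state.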
	#[inline(always)]
	pub fn checkpoint(&self) -> Cursor {
		Cursor::new(self.offset(), self.token)
	}

	/// Rewinds the lexer back to the given checkpoint.
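	///
	/// For example, a parser can take a [checkpoint][Lexer::checkpoint()], speculatively advance, and rewind to
	/// re-lex from the saved position:
	///
	/// ```
	/// use css_lexer::*;
	/// let mut lexer = Lexer::new("width: 1px");
	/// let checkpoint = lexer.checkpoint();
	/// assert_eq!(lexer.advance(), Kind::Ident);
	/// lexer.rewind(checkpoint);
	/// // Back at the start; the same token lexes again.
	/// assert_eq!(lexer.offset(), 0);
	/// assert_eq!(lexer.advance(), Kind::Ident);
	/// ```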
	pub fn rewind(&mut self, cursor: Cursor) {
		debug_assert!(cursor.offset() <= self.offset());
		self.offset = cursor.offset();
		self.token = cursor.token();
	}

	/// Advances the lexer to the end of the given token.
	pub fn hop(&mut self, cursor: Cursor) {
		debug_assert!(cursor.offset().0 as usize >= (self.offset.0 + self.token.len()) as usize);
		self.offset = cursor.offset();
		self.token = cursor.token();
	}

	/// Moves the lexer one token forward, returning that token.
	pub fn advance(&mut self) -> Token {
		self.token = self.read_next_token(self.offset.0);
		self.offset.0 += self.token.len();
		self.token
	}
}

#[test]
fn size_test() {
	assert_eq!(::std::mem::size_of::<Lexer>(), 32);
}