css_lexer/token.rs
1use crate::{
2 AssociatedWhitespaceRules, CommentStyle, Cursor, Kind, KindSet, PairWise, QuoteStyle, SourceOffset, Whitespace,
3};
4use std::char::REPLACEMENT_CHARACTER;
5
6/// An abstract representation of the chunk of the source text, retaining certain "facts" about the source.
7///
8/// # Design
9///
10/// The [Token] type is an immutable packing of two [u32s][u32] that represents a unit in the source text, but without
11/// the associated offset data that points to its position in the source text. This is important because it means that
12/// equivalent [Tokens][Token] are equal even in different parts of the document. For the most part a [Token] doesn't
13/// represent data that can be put into a text file because it lacks the underlying character data. It is lossy. For
/// example a [Token] with [Kind::Ident] just represents _an_ ident, but it doesn't retain what the keyword is.
15/// Storing raw-character data would require either storing tokens on the heap (and therefore they couldn't be [Sized])
16/// or by keeping a reference to `&'a str` which means larger token sizes and lifetime tracking. By _not_ storing
17/// character data we can keep [Token] [Sized] and keep it to `size_of` `8`, avoiding the heap, avoiding
18/// references/lifetimes, and keeping [Token] entirely in the stack. For a lot of tokens this is _fine_ because the
19/// underlying character data isn't that useful past a certain point.
20///
21/// A [Token] retains certain "facts" about the underlying unit of text, though. For example it retains the [Kind], how
22/// many characters the token consumed, and various other pieces of information, depending on the [Kind]. In some
23/// cases, it's entirely possible to represent the full token, including character data, into the available bits (for
24/// example [Kind::Delim] stores its [char], [Kind::Number] stores its [f32]). Taking the time in the tokenizer to
25/// gather these facts and values can keep cache-lines hot, which speeds up subsequent checks in the parser.
26///
27/// If you're familiar with "red green" syntax trees such as [Swiftlang's libsyntax][1], or [Rust-Analyzer's Rowan][2]
28/// or [Roslyn][3] this might be a little familiar in some concepts. However [Token] does not represent a tree, and
29/// relies on resorting back to the string data to find out keyword values.
30///
31/// [1]: https://gh.io/AAtdqpg
32/// [2]: https://gh.io/AAtf8pt
33/// [3]: https://gh.io/AAtab90
34///
35/// This representation of facts, kind, length, or other metadata can be quite complex - so here's a
36/// full breakdown:
37///
38/// # Anatomy of Token
39///
40/// A [Token] is a struct of `(u32, u32)`. The second u32 is _usually_ the token length (hence keeping them separate).
41/// The first [u32], however, is split into 3 (sometimes 5) parts. The two u32s can be thought of like so:
42///
43/// ```md
44/// |-----|-------|--------------------------|---------------------------------|
45/// | TF | K | VD | Value |
46/// 0b| 000 | 00000 | 000000000000000000000000 | 0000000000000000000000000000000 |
47/// |-----|-------|--------------------------|---------------------------------|
48/// | 3-- | 5---- | 24---------------------- | 32----------------------------- |
49/// ```
50///
51/// ## TF = Type Flags (or "Token Facts")
52///
53/// This represents a bit-mask in the upper-most 3 bits. The flags are general purpose and change meaning depending on
/// the Token's [Kind]. Each flag generally maps to a method so it's not necessary to remember the contents of this
55/// table, but it can serve as a useful reference. Note that not all methods return a [bool], so footnotes have been
56/// added to explain these further.
57///
58/// | Kind:: | Flag | Description | Method |
59/// |--------------------|-------|-----------------------------|------------------------------------------|
60/// | [Kind::Number] | `001` | Floating Point | [Token::is_float()] |
61/// | | `010` | Has a "Sign" (-/+) | [Token::has_sign()] |
62/// | | `100` | Sign is required | [Token::sign_is_required()] |
63/// | [Kind::Dimension] | `001` | Floating Point | [Token::is_float()] |
64/// | | `010` | Has a "Sign" (-/+) | [Token::has_sign()] |
65/// | | `100` | Unit is a known dimension | [Token::atom_bits()][^dimension] |
66/// | [Kind::String] | `001` | Uses Double Quotes | [Token::quote_style()][^quotes] |
67/// | | `010` | Has a closing quote | [Token::has_close_quote()] |
68/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
69/// | [Kind::Ident] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
70/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
71/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
72/// | [Kind::Function] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
73/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
74/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
75/// | [Kind::AtKeyword] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
76/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
77/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
78/// | [Kind::Hash] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
79/// | | `010` | First character is ASCII | [Token::hash_is_id_like()] |
80/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
81/// | [Kind::Url] | `001` | Has a closing paren ) | [Token::url_has_closing_paren()] |
82/// | | `010` | Contains whitespace after ( | [Token::url_has_leading_space()] |
83/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
/// | [Kind::CdcOrCdo] | `001` | Is CDC (`000` would be CDO) | [Token::is_cdc()] |
85/// | | `010` | (Reserved) | -- |
86/// | | `100` | (Reserved) | -- |
87/// | [Kind::Whitespace] | `---` | Whitespace style | [Token::whitespace_style()][^whitespace] |
88/// | [Kind::Delim] | `---` | Associate whitespace rules | [Token::associated_whitespace()][^delim] |
89/// | [Kind::Comment] | `---` | (Special) | [Token::comment_style()][^comments] |
90///
/// [^quotes]: Strings do not have a [bool] returning method for whether or not the quote is using double or single
/// quotes, instead the [Token::quote_style()] method will return the [QuoteStyle] enum for better readability.
/// [^whitespace]: Whitespace tokens do not have a [bool] returning method, instead [Token::whitespace_style()] will
/// return the [Whitespace] enum for improved readability.
95/// [^comments]: Rather than using the 3 bits as a bit-mask, Comment tokens use the data to store the [CommentStyle]
96/// enum, which is capable of representing 8 discrete comment styles.
97/// [^delim]: Delims can be used in interesting ways inside of CSS syntax. At higher levels CSS is _sometimes_
98/// whitespace sensitive, for example the whitespace inside of a CSS selector _sometimes_ represents the descendant
99/// combinator, meanwhile delimiters inside calc() are sensitive to whitespace collapse (`calc(1px + 1px)` is valid
100/// while `calc(1px+1px)` is a parse error). Further to this, introducing whitespace (say through a formatter) might
101/// break in interesting ways due to some combinations of Delims & Idents - for example Pseudo Classes like `:hover`,
102/// or CSS like languages such as SASS using `$var` style syntax. While `:hover` and `$var` are comprised of two tokens
103/// they're considered one conceptual unit. Having a way to express these relationships at the token level can be useful
104/// for other low level machinery such as formatters/minifiers, rather than introducing complex state at higher levels.
105/// For these reasons, Delim tokens have the ability to express their whitespace association. The lexer will always
106/// produce a token with empty whitespace rules, but parsers can replace this token with a more complex set of rules.
107///
108/// ## K = Kind Bits
109///
/// The `K` value - upper-most bits 4-8 - stores the 5-bit [Kind].
111///
112/// ## VD = Value Data
113///
114/// The `VD` value - the lower-most 24-bits - stores data depending on the [Token] [Kind]. For most kinds this data is
115/// reserved (just 0s). The value data cannot be interrogated manually, but it packs in additional data about the
116/// underlying string to make the string easier to parse without doing the same lookups that the tokenizer already had
117/// to - such as determining lengths of the various parts of the token, or packing values so that consulting the string
118/// can be avoided (which keeps cache-lines hot).
119///
120/// Below describes the special kinds which use the Value Data to store yet more information about the token...
121///
122/// ### Value Data for [Kind::Ident], [Kind::Function], [Kind::AtKeyword]
123///
124/// If the [Kind] is [Kind::Ident], [Kind::Function], or [Kind::AtKeyword] then Value Data represents the Ident's "Atom
125/// Data". When lexing one of these tokens the Lexer will pass the string slice to [DynAtomSet][crate::DynAtomSet] and
/// set these bits accordingly. This allows implementations to provide a [DynAtomSet][crate::DynAtomSet] of interned
127/// strings to improve performance of string comparisons. The `ATOM_DYNAMIC_BIT` can be used to dynamically intern
128/// strings during runtime (this behaviour is abstracted by [DynAtomRegistry][crate::DynAtomRegistry]). This 24-bits
129/// allows for ~16MM unique strings, but with the `ATOM_DYNAMIC_BIT` this becomes ~8MM static atoms and ~8MM dynamic
130/// atoms (very unlikely CSS will ever reach even 10k predefined keywords, and most CSS files will have less than 1000
131/// unique strings).
132///
133/// ### Value Data for [Kind::Number]
134///
135/// If the [Kind] is [Kind::Number], Value Data represents the length of that number (this means the parser is
136/// restricted from representing numbers longer than 16,777,216 characters which is probably an acceptable limit). Note
137/// that this does not affect the _value_ of a number, just the characters in a string. Numbers in CSS are [f32]. The
138/// vast majority of [f32s][f32] can be represented in 16MM characters, but it's possible to author a document that
/// contains a set of numeric characters longer than 16MM code points. These scenarios are considered [undefined
/// behaviour][4].
141///
142/// [4]: https://en.wikipedia.org/wiki/Undefined_behavior
143///
144/// ### Value Data for [Kind::Hash]
145///
146/// If the [Kind] is [Kind::Hash], Value Data represents the length of that hash (this means the parser is restricted
147/// from representing IDs and hex codes longer than 16,777,216 characters which is probably an acceptable limit). Note
/// that this restriction means that ID selectors have a much tighter limit than other tokens, such as strings or
149/// idents, but it's very unlikely to see a 16million character ID in CSS (String, maybe).
150///
151/// ### Value Data for [Kind::Url]
152///
153/// If the [Kind] is [Kind::Url], Value Data represents the "leading length" and "trailing length" of the URL. This
154/// means the value data is split into two 12 bit numbers:
155///
156/// ```md
157/// |--------------|--------------|
158/// | LL | TL |
159/// | 000000000000 | 000000000000 |
160/// |--------------|--------------|
161/// | 12---------- | 12---------- |
162/// ```
163///
164/// The "leading" length represents the `url(` part of the token. Typically this will be `4`, however it's possible
165/// (for legacy compatibility reasons within CSS) to add whitespace between the opening parenthesis and the URL value.
166/// It's also possible to escape the `url` ident portion. This means `\75\52\6c( ` is also a valid leading section of
167/// a URL ident (which has a character length of 13), as is `\000075 \000052 \00006c ( ` (28 characters). 12 bits
168/// allows for a maximum character length of 4,096. It is not possible to represent a URL token's leading section using
169/// 4,096 characters so there is some headroom (wasted bytes) here.
170///
171/// The "trailing" length represents the `)` part of the token. Typically this will be `1`, however it's possible to
172/// add any number of whitespace characters between the end of the URL and the closing parenthesis. If a CSS document
173/// contains more than 4095 whitespace characters then this is considered [undefined behaviour][4].
174///
175/// ### Value Data for [Kind::Dimension]
176///
177/// If K is a Dimension, then this represents both the number of characters in the numeric portion of the dimension
178/// and the length of the ident portion of the dimension... or the dimension unit itself (more on that below). This
179/// means the value data is split into two 12 bit numbers:
180///
181/// ```md
182/// |--------------|--------------|
183/// | NL | DUL |
184/// | 000000000000 | 000000000000 |
185/// |--------------|--------------|
186/// | 12---------- | 12---------- |
187///
188/// |--------------|-------| --------|
189/// | NL | KDUL | KNOWN |
190/// | 000000000000 | 00000 | 0000000 |
191/// |--------------|-------| --------|
192/// | 12---------- | 5---- | 7------ |
193/// ```
194///
195/// The NL portion - the numeric length - represents the length of characters the number contains. This means the
196/// numeric portion of a dimension can only be 4,096 characters long. This is dramatically shorter than the 16MM
197/// allowed for numbers but it's still also incredibly generous such that it's highly unlikely to ever be hit unless
198/// someone is intentionally trying to break the parser. The [Lexer][super::Lexer] encountering a dimension with a
199/// numeric portion longer than 4,096 characters is considered [undefined behaviour][4].
200///
201/// The DUL portion (if `TF & 100 == 0`) will represent the length of characters the ident portion of the dimension
202/// (aka the dimension unit) contains. This means the ident portion of a dimension can only be 4,096 characters long.
203/// For practical purposes CSS has a fixed set of dimensions - the longest of which (at the time of writing) are 5
204/// characters long (e.g. `svmax`). Through the use of escaping shenanigans it's possible to create a valid CSS
205/// dimension longer than 5 characters though (every ident can be made 8 times longer by using escape characters, e.g.
206/// `1svmax` at 6 characters can be instead written as `1\000073 \000076 \00006d \000061 \000078` at 40 characters). In
207/// addition to these factors, it's worth pointing out that there is scope for further dimensions and some [proposals
208/// for "custom" dimensions][5], and lastly this library is designed for CSS _and CSS-alike_ languages, which may
209/// invent their own dimension units. In other words being too restrictive on dimension ident length could be costly
210/// in the future, therefore 4,096 characters seems like a reasonable, if generous, trade-off.
211///
212/// There's a giant caveat here though. If `TF & 100 != 0`, then the dimension is considered "known" and DUL will be
213/// encoded differently. Instead of just containing the dimension unit length, which requires consulting the underlying
214/// `&str` to get the actual dimension, it will be used to store an Atom - but only the first 7 bits (the KNOWN
/// portion), which for an Atom must be a Dimension atom (an assumption made on anything that implements
/// [AtomSet][crate::AtomSet] is that all dimension units should be stored in the byte values of 1-127, so that they
/// can be encoded in this space). Dimension units _can_ be escape encoded, and so the underlying character data may
/// differ from the unescaped unit length; as such the 5-bit KDUL portion represents the character data length. In
/// other words `KNOWN.len()` may not always equal `KDUL`.
220///
221/// [5]: https://github.com/w3c/csswg-drafts/issues/7379
222///
223/// ## Value
224///
225/// The `Value` portion of [Token] represents the length of the token for most token kinds. However, for some tokens
226/// their length is already packed into the first u32. So it would make more sense to use this u32 to store more
227/// interesting data.
228///
229/// ## Value for [Kind::Delim] and single character tokens
230///
231/// [Kind::Delim] and single-character tokens (i.e. [Kind::Colon]->[Kind::RightCurly]) typically have a length of `1`
232/// ([Kind::Delim] can have a varied length for surrogate pairs). Instead of storing the length and wasting a whole
233/// [u32], this region stores the [char]. Calling [Token::char()] will return an [Option] which will always be [Some]
234/// for [Kind::Delim] and single-character tokens.
235///
236/// ## Value for [Kind::Hash]
237///
238/// The length of a hash is stored in its `VD` portion, leaving 32bits to storing other data. It just so happens that
239/// a 8-character hex code (#ffaabbcc) fits nicely inside of 32-bits. During tokenization we can eagerly parse the hex
240/// code and stuff it here, so it can be more easily reasoned about in upstream code (rather than
241/// reading the character data).
242///
243/// ## Value for [Kind::Number] and [Kind::Dimension]
244///
245/// As these tokens store their length data in the `VD` portion, this [u32] instead stores the _value_ of the number,
246/// stored as [f32::to_bits()].
247///
248/// ## Value data for other tokens.
249///
/// In all other cases, this represents the length of the token as utf-8 bytes. This means the maximum token length
/// is 4,294,967,295 bytes, aka ~4GB. This sounds very long but also CSS can host very large image data and browsers
/// will accommodate very large URLs. [An mdn article on Data URLs][6] claims that Firefox supports 32mb Data URLs, Chrome
253/// supports over 512mb, and Safari over 2gb. The reality is that if someone has such a large data URL in their CSS
254/// they probably should split it out, but we have a whole 32 bits to store the length so we may as well use it...
255///
256/// [6]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs#common_problems
// Field `.0` packs the type flags, the kind bits and the value data; field `.1` holds the kind-specific value
// (for most kinds, the token length).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token(u32, u32);
259
260impl Default for Token {
261 fn default() -> Self {
262 Self((Kind::Whitespace as u32) << 24, 0)
263 }
264}
265
/// Selects the upper 8 bits of a token's first `u32`: the 3 type-flag bits followed by the 5 kind bits.
const KIND_MASK: u32 = 0xFF00_0000;
/// Selects the lower 24 bits of a token's first `u32`: the value data.
const LENGTH_MASK: u32 = 0x00FF_FFFF;
/// Clears the lowest 12 bits; its complement selects the lower half of the value data.
const HALF_LENGTH_MASK: u32 = 0xFFFF_F000;

/// Index of the bit, within the 24-bit atom space, that separates static atoms from dynamic atoms.
/// - Bit clear: static atom (values 0 to 8,388,607).
/// - Bit set: dynamic atom (values 8,388,608 to 16,777,215).
///
/// Splitting on the top bit lets atoms occupy the whole 24 bits of token value data.
#[allow(dead_code)] // Used in dyn_atom_registry module
pub(crate) const ATOM_DYNAMIC_BIT: u32 = 23;
277
278impl Token {
279 /// Represents an empty token.
280 pub const EMPTY: Token = Token::new_whitespace(Whitespace::none(), 0);
281
282 /// Represents an EOF token.
283 pub const EOF: Token = Token(0b0, 0);
284
285 /// Represents a CDO (`<!--`) token.
286 pub const CDO: Token = Token(((Kind::CdcOrCdo as u32) << 24) & KIND_MASK, 4);
287
288 /// Represents a CDC (`-->`) token.
289 pub const CDC: Token = Token((((Kind::CdcOrCdo as u32) | 0b001_00000) << 24) & KIND_MASK, 3);
290
291 /// Represents a single ' ' space token.
292 pub const SPACE: Token = Token::new_whitespace(Whitespace::Space, 1);
293
294 /// Represents a single Tab token.
295 pub const TAB: Token = Token::new_whitespace(Whitespace::Tab, 1);
296
297 /// Represents a single `\n` token.
298 pub const NEWLINE: Token = Token::new_whitespace(Whitespace::Newline, 1);
299
300 /// Represents the Number `0`. This is not equal to other representations of zero, such as `00`, `0e0`, `0.0` and so
301 /// on.
302 pub const NUMBER_ZERO: Token = Token((((Kind::Number as u32) | 0b100_00000) << 24) & KIND_MASK, 1);
303
304 /// Represents the `:` token.
305 pub const COLON: Token = Token::new_delim_kind(Kind::Colon, ':');
306
307 /// Represents the `;` token.
308 pub const SEMICOLON: Token = Token::new_delim_kind(Kind::Semicolon, ';');
309
310 /// Represents the `,` token.
311 pub const COMMA: Token = Token::new_delim_kind(Kind::Comma, ',');
312
313 /// Represents the `[` token.
314 pub const LEFT_SQUARE: Token = Token::new_delim_kind(Kind::LeftSquare, '[');
315
316 /// Represents the `]` token.
317 pub const RIGHT_SQUARE: Token = Token::new_delim_kind(Kind::RightSquare, ']');
318
319 /// Represents the `(` token.
320 pub const LEFT_PAREN: Token = Token::new_delim_kind(Kind::LeftParen, '(');
321
322 /// Represents the `)` token.
323 pub const RIGHT_PAREN: Token = Token::new_delim_kind(Kind::RightParen, ')');
324
325 /// Represents the `{` token.
326 pub const LEFT_CURLY: Token = Token::new_delim_kind(Kind::LeftCurly, '{');
327
328 /// Represents the `}` token.
329 pub const RIGHT_CURLY: Token = Token::new_delim_kind(Kind::RightCurly, '}');
330
331 /// Represents a `!` [Kind::Delim] token.
332 pub const BANG: Token = Token::new_delim('!');
333
334 /// Represents a `#` [Kind::Delim] token.
335 pub const HASH: Token = Token::new_delim('#');
336
337 /// Represents a `$` [Kind::Delim] token.
338 pub const DOLLAR: Token = Token::new_delim('$');
339
340 /// Represents a `%` [Kind::Delim] token - not to be confused with the `%` dimension.
341 pub const PERCENT: Token = Token::new_delim('%');
342
343 /// Represents a `&` [Kind::Delim] token.
344 pub const AMPERSAND: Token = Token::new_delim('&');
345
346 /// Represents a `*` [Kind::Delim] token.
347 pub const ASTERISK: Token = Token::new_delim('*');
348
349 /// Represents a `+` [Kind::Delim] token.
350 pub const PLUS: Token = Token::new_delim('+');
351
352 /// Represents a `-` [Kind::Delim] token.
353 pub const DASH: Token = Token::new_delim('-');
354
355 /// Represents a `.` [Kind::Delim] token.
356 pub const PERIOD: Token = Token::new_delim('.');
357
358 /// Represents a `/` [Kind::Delim] token.
359 pub const SLASH: Token = Token::new_delim('/');
360
361 /// Represents a `<` [Kind::Delim] token.
362 pub const LESS_THAN: Token = Token::new_delim('<');
363
364 /// Represents a `=` [Kind::Delim] token.
365 pub const EQUALS: Token = Token::new_delim('=');
366
367 /// Represents a `>` [Kind::Delim] token.
368 pub const GREATER_THAN: Token = Token::new_delim('>');
369
370 /// Represents a `?` [Kind::Delim] token.
371 pub const QUESTION: Token = Token::new_delim('?');
372
373 /// Represents a `@` [Kind::Delim] token. Not to be confused with the @keyword token.
374 pub const AT: Token = Token::new_delim('@');
375
376 /// Represents a `\\` [Kind::Delim] token.
377 pub const BACKSLASH: Token = Token::new_delim('\\');
378
379 /// Represents a `^` [Kind::Delim] token.
380 pub const CARET: Token = Token::new_delim('^');
381
382 /// Represents a `_` [Kind::Delim] token.
383 pub const UNDERSCORE: Token = Token::new_delim('_');
384
385 /// Represents a `\`` [Kind::Delim] token.
386 pub const BACKTICK: Token = Token::new_delim('\'');
387
388 /// Represents a `|` [Kind::Delim] token.
389 pub const PIPE: Token = Token::new_delim('|');
390
391 /// Represents a `~` [Kind::Delim] token.
392 pub const TILDE: Token = Token::new_delim('~');
393
394 /// Represents a replacement character [Kind::Delim] token.
395 pub const REPLACEMENT_CHARACTER: Token = Token::new_delim(REPLACEMENT_CHARACTER);
396
397 /// Creates a "Dummy" token with no additional data, just the [Kind].
398 #[inline]
399 pub const fn dummy(kind: Kind) -> Self {
400 Self((kind as u32) << 24, 0)
401 }
402
403 /// Creates a "Dummy" token with no additional data, just [Kind::Ident].
404 #[inline]
405 pub const fn dummy_ident() -> Self {
406 Self((Kind::Ident as u32) << 24, 0)
407 }
408
409 /// Creates a [Kind::Whitesapce] token.
410 #[inline]
411 pub(crate) const fn new_whitespace(style: Whitespace, len: u32) -> Self {
412 let flags: u32 = Kind::Whitespace as u32 | ((style.to_bits() as u32) << 5);
413 Self((flags << 24) & KIND_MASK, len)
414 }
415
416 /// Creates a [Kind::Comment] token.
417 #[inline]
418 pub(crate) fn new_comment(style: CommentStyle, len: u32) -> Self {
419 let flags: u32 = Kind::Comment as u32 | ((style as u32) << 5);
420 Self((flags << 24) & KIND_MASK, len)
421 }
422
423 /// Creates a [Kind::Number] token.
424 #[inline]
425 pub(crate) fn new_number(is_float: bool, has_sign: bool, len: u32, value: f32) -> Self {
426 let flags: u32 = Kind::Number as u32 | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
427 Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), value.to_bits())
428 }
429
430 /// Creates a new [Kind::Dimension] token.
431 #[inline]
432 pub(crate) fn new_dimension(
433 is_float: bool,
434 has_sign: bool,
435 num_len: u32,
436 unit_len: u32,
437 value: f32,
438 atom: u8,
439 ) -> Self {
440 debug_assert!(num_len <= 4097);
441 let num_len = (num_len << 12) & HALF_LENGTH_MASK;
442 let is_known_unit = if unit_len < 32 { ((atom != 0) as u32) << 7 } else { 0 };
443 let unit_len = if is_known_unit == 0 { unit_len } else { unit_len << 7 | (atom as u32 & 0b1111111) };
444 let flags: u32 = Kind::Dimension as u32 | is_known_unit | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
445 Self(((flags << 24) & KIND_MASK) | ((num_len | unit_len) & LENGTH_MASK), value.to_bits())
446 }
447
448 /// Creates a new [Kind::BadString] token. Bad Strings are like String tokens but during lexing they failed to fully tokenize
449 /// into a proper string token, usually due to containing newline characters.
450 #[inline]
451 pub(crate) fn new_bad_string(len: u32) -> Self {
452 Self(((Kind::BadString as u32) << 24) & KIND_MASK, len)
453 }
454
455 /// Creates a new [Kind::BadUrl] token. Bad URLs are like URL tokens but during lexing they failed to fully tokenize into a
456 /// proper URL token, usually due to containing newline characters.
457 #[inline]
458 pub(crate) fn new_bad_url(len: u32) -> Self {
459 Self(((Kind::BadUrl as u32) << 24) & KIND_MASK, len)
460 }
461
462 /// Creates a new [Kind::Ident] token.
463 #[inline]
464 pub(crate) fn new_ident(
465 contains_non_lower_ascii: bool,
466 dashed: bool,
467 contains_escape: bool,
468 atom: u32,
469 len: u32,
470 ) -> Self {
471 let flags: u32 = Kind::Ident as u32
472 | ((contains_non_lower_ascii as u32) << 5)
473 | ((dashed as u32) << 6)
474 | ((contains_escape as u32) << 7);
475 debug_assert!(atom & LENGTH_MASK == atom);
476 Self((flags << 24) & KIND_MASK | atom, len)
477 }
478
479 /// Creates a new [Kind::Function] token.
480 #[inline]
481 pub(crate) fn new_function(
482 contains_non_lower_ascii: bool,
483 dashed: bool,
484 contains_escape: bool,
485 atom: u32,
486 len: u32,
487 ) -> Self {
488 let flags: u32 = Kind::Function as u32
489 | ((contains_non_lower_ascii as u32) << 5)
490 | ((dashed as u32) << 6)
491 | ((contains_escape as u32) << 7);
492 debug_assert!(atom & LENGTH_MASK == atom);
493 Self((flags << 24) & KIND_MASK | atom, len)
494 }
495
496 /// Creates a new [Kind::AtKeyword] token.
497 #[inline]
498 pub(crate) fn new_atkeyword(
499 contains_non_lower_ascii: bool,
500 dashed: bool,
501 contains_escape: bool,
502 atom: u32,
503 len: u32,
504 ) -> Self {
505 let flags: u32 = Kind::AtKeyword as u32
506 | ((contains_non_lower_ascii as u32) << 5)
507 | ((dashed as u32) << 6)
508 | ((contains_escape as u32) << 7);
509 debug_assert!(atom & LENGTH_MASK == atom);
510 Self((flags << 24) & KIND_MASK | atom, len)
511 }
512
513 /// Creates a new [Kind::Hash] token.
514 #[inline]
515 pub(crate) fn new_hash(
516 contains_non_lower_ascii: bool,
517 first_is_ascii: bool,
518 contains_escape: bool,
519 len: u32,
520 hex_value: u32,
521 ) -> Self {
522 let flags: u32 = Kind::Hash as u32
523 | ((contains_non_lower_ascii as u32) << 5)
524 | ((first_is_ascii as u32) << 6)
525 | ((contains_escape as u32) << 7);
526 debug_assert!(len < (1 << 24));
527 Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), hex_value)
528 }
529
530 /// Creates a new [Kind::String] token.
531 #[inline]
532 pub(crate) fn new_string(quotes: QuoteStyle, has_close_quote: bool, contains_escape: bool, len: u32) -> Self {
533 debug_assert!(quotes != QuoteStyle::None);
534 let quotes = if quotes == QuoteStyle::Double { 0b001_00000 } else { 0b0 };
535 let flags: u32 =
536 Kind::String as u32 | quotes | ((has_close_quote as u32) << 6) | ((contains_escape as u32) << 7);
537 Self((flags << 24) & KIND_MASK, len)
538 }
539
540 /// Creates a new [Kind::Url] token.
541 #[inline]
542 pub(crate) fn new_url(
543 ends_with_paren: bool,
544 contains_whitespace_after_open_paren: bool,
545 contains_escape: bool,
546 leading_length: u32,
547 trailing_length: u32,
548 len: u32,
549 ) -> Self {
550 let leading_length = (leading_length << 12) & HALF_LENGTH_MASK;
551 let flags: u32 = Kind::Url as u32
552 | ((ends_with_paren as u32) << 5)
553 | ((contains_whitespace_after_open_paren as u32) << 6)
554 | ((contains_escape as u32) << 7);
555 Self((flags << 24) & KIND_MASK | ((leading_length | trailing_length) & LENGTH_MASK), len)
556 }
557
558 /// Creates a new [Kind::Delim] token.
559 #[inline]
560 pub(crate) const fn new_delim(char: char) -> Self {
561 let flags: u32 = Kind::Delim as u32;
562 Self((flags << 24) & KIND_MASK, char as u32)
563 }
564
565 /// Creates a new [Kind::Delim] token.
566 #[inline]
567 pub(crate) const fn new_delim_kind(kind: Kind, char: char) -> Self {
568 let flags: u32 = kind as u32;
569 Self((flags << 24) & KIND_MASK, char as u32)
570 }
571
572 /// Creates a new [Kind::Delim] token with associated whitespace.
573 #[inline]
574 pub(crate) const fn new_delim_with_associated_whitespace(char: char, rules: AssociatedWhitespaceRules) -> Self {
575 let flags: u32 = Kind::Delim as u32 | ((rules.to_bits() as u32) << 5);
576 Self((flags << 24) & KIND_MASK, char as u32)
577 }
578
579 /// \[private\]
580 /// Creates a new Token with an interned string.
581 #[inline]
582 pub fn new_interned(kind: Kind, bits: u32, len: u32) -> Token {
583 debug_assert!(kind == KindSet::IDENT_LIKE);
584 debug_assert!(bits & LENGTH_MASK == bits);
585 debug_assert!(len > 0);
586 Self(((kind as u32) << 24) & KIND_MASK | (bits & LENGTH_MASK), len + ((kind != Kind::Ident) as u32))
587 }
588
589 /// Returns the raw bits representing the [Kind].
590 #[inline(always)]
591 pub(crate) const fn kind_bits(&self) -> u8 {
592 (self.0 >> 24 & 0b11111) as u8
593 }
594
595 /// Returns the [Kind].
596 #[inline]
597 pub const fn kind(&self) -> Kind {
598 Kind::from_bits(self.kind_bits())
599 }
600
601 /// Check if the TF upper-most bit is set.
602 #[inline(always)]
603 const fn first_bit_is_set(&self) -> bool {
604 self.0 >> 31 == 1
605 }
606
607 /// Check if the TF second-upper-most bit is set.
608 #[inline(always)]
609 const fn second_bit_is_set(&self) -> bool {
610 self.0 >> 30 & 0b1 == 1
611 }
612
613 /// Check if the TF third-upper-most bit is set.
614 #[inline(always)]
615 const fn third_bit_is_set(&self) -> bool {
616 self.0 >> 29 & 0b1 == 1
617 }
618
619 /// Check if the [Kind] is "Ident Like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
620 #[inline(always)]
621 pub(crate) const fn is_ident_like(&self) -> bool {
622 self.kind_bits() & 0b11000 == 0b01000 && self.kind_bits() != Kind::String as u8
623 }
624
625 /// Check if the [Kind] is "Delim Like", i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
626 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
627 /// [Kind::RightCurly].
628 #[inline(always)]
629 pub(crate) const fn is_delim_like(&self) -> bool {
630 self.kind_bits() & 0b10000 == 0b10000
631 }
632
633 /// The only token with an empty length is EOF, but this method is available for symmetry with `len()`.
634 #[inline]
635 pub const fn is_empty(&self) -> bool {
636 self.kind_bits() == Kind::Eof as u8
637 }
638
639 /// Returns the amount of characters (utf-8 code points) this Token represents in the underlying source text.
640 #[inline]
641 pub const fn len(&self) -> u32 {
642 if self.kind_bits() == Kind::Eof as u8 {
643 0
644 } else if self.is_delim_like() {
645 debug_assert!(matches!(
646 self.kind(),
647 Kind::Delim
648 | Kind::Colon | Kind::Semicolon
649 | Kind::Comma | Kind::LeftSquare
650 | Kind::RightSquare
651 | Kind::LeftParen
652 | Kind::RightParen
653 | Kind::LeftCurly
654 | Kind::RightCurly
655 ));
656 self.char().unwrap().len_utf8() as u32
657 } else if self.kind_bits() == Kind::Number as u8 {
658 self.numeric_len()
659 } else if self.kind_bits() == Kind::Dimension as u8 {
660 if self.first_bit_is_set() {
661 self.numeric_len() + (self.0 >> 7 & 0b11111)
662 } else {
663 ((self.0 & LENGTH_MASK) >> 12) + (self.0 & !HALF_LENGTH_MASK)
664 }
665 } else if self.kind_bits() == Kind::Hash as u8 {
666 self.0 & LENGTH_MASK
667 } else {
668 self.1
669 }
670 }
671
672 /// If the [Kind] is "Delim Like" (i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
673 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
674 /// [Kind::RightCurly]) then this will return a [Some] with a [char] representing the value.
675 /// For non-delim-like tokens this will return [None].
676 pub const fn char(&self) -> Option<char> {
677 if self.is_delim_like() {
678 return char::from_u32(self.1);
679 }
680 None
681 }
682
683 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and is an integer - i.e. it has no `.`.
684 #[inline]
685 pub const fn is_int(&self) -> bool {
686 self.kind_bits() & 0b11100 == 0b00100 && !self.third_bit_is_set()
687 }
688
689 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and is a float - i.e. it has decimal places. This will be
690 /// `true` even if the decimal places are 0. e.g. `0.0`.
691 #[inline]
692 pub const fn is_float(&self) -> bool {
693 self.kind_bits() & 0b11100 == 0b00100 && self.third_bit_is_set()
694 }
695
696 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and the underlying character data included a `-` or `+`
697 /// character. Note that a positive number may not necessarily have a sign, e.g. `3` will return false, while `+3`
698 /// will return `true`.
699 #[inline]
700 pub const fn has_sign(&self) -> bool {
701 self.kind_bits() & 0b11100 == 0b00100 && self.second_bit_is_set()
702 }
703
704 /// The [Token] is a [Kind::Number] and the `+` sign is semantically required and should be preserved during
705 /// minification. This is used for numbers in `an+b` syntax (e.g., `:nth-child(+5)`) where the `+` sign
706 /// distinguishes the value from other syntactic forms.
707 ///
708 /// Asserts: the `kind()` is [Kind::Number].
709 #[inline]
710 pub const fn sign_is_required(&self) -> bool {
711 debug_assert!(self.kind_bits() == Kind::Number as u8);
712 self.first_bit_is_set()
713 }
714
715 /// Returns a new [Token] with the `sign_is_required` flag set. This indicates that the `+` sign
716 /// should be preserved during minification (e.g., for `an+b` syntax).
717 ///
718 /// Asserts: the `kind()` is [Kind::Number].
719 #[inline]
720 pub const fn with_sign_required(self) -> Token {
721 debug_assert!(self.kind_bits() == Kind::Number as u8);
722 Token(self.0 | (1 << 31), self.1)
723 }
724
725 /// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the amount of characters used to
726 /// represent this number in the underlying source text. Numbers may be inefficiently encoded in the source text,
727 /// e.g. `0.0000`.
728 ///
729 /// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
730 #[inline]
731 pub const fn numeric_len(&self) -> u32 {
732 debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
733 if self.kind_bits() == Kind::Dimension as u8 { (self.0 & LENGTH_MASK) >> 12 } else { self.0 & LENGTH_MASK }
734 }
735
736 /// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the [f32] representation of the number's
737 /// value.
738 ///
739 /// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
740 #[inline]
741 pub fn value(&self) -> f32 {
742 debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
743 f32::from_bits(self.1)
744 }
745
746 /// Returns the [Whitespace].
747 ///
748 /// If the [Token] is not a [Kind::Whitespace] this will return [Whitespace::none()].
749 #[inline]
750 pub fn whitespace_style(&self) -> Whitespace {
751 if self.kind_bits() == Kind::Whitespace as u8 {
752 Whitespace::from_bits((self.0 >> 29) as u8)
753 } else {
754 Whitespace::none()
755 }
756 }
757
758 /// Returns the [AssociatedWhitespaceRules].
759 ///
760 /// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
761 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
762 /// [Kind::RightCurly]) then this will always return `AssociatedWhitespaceRules::none()`.
763 #[inline]
764 pub fn associated_whitespace(&self) -> AssociatedWhitespaceRules {
765 if self.is_delim_like() {
766 AssociatedWhitespaceRules::from_bits((self.0 >> 29) as u8)
767 } else {
768 AssociatedWhitespaceRules::none()
769 }
770 }
771
772 /// Returns a new [Token] with the [AssociatedWhitespaceRules] set to the given [AssociatedWhitespaceRules],
773 /// if possible.
774 ///
775 /// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
776 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
777 /// [Kind::RightCurly]) then this will return the same [Token].
778 /// If the [AssociatedWhitespaceRules] is different it will return a new [Token].
779 #[inline]
780 pub fn with_associated_whitespace(&self, rules: AssociatedWhitespaceRules) -> Token {
781 if !self.is_delim_like() {
782 return *self;
783 }
784 Token::new_delim_with_associated_whitespace(self.char().unwrap(), rules)
785 }
786
787 /// Returns the [CommentStyle].
788 ///
789 /// If the [Token] is not a [Kind::Comment] this will return [None].
790 #[inline]
791 pub fn comment_style(&self) -> Option<CommentStyle> {
792 if self.kind_bits() == Kind::Comment as u8 { CommentStyle::from_bits((self.0 >> 29) as u8) } else { None }
793 }
794
795 /// Returns the [QuoteStyle].
796 ///
797 /// If the [Token] is not a [Kind::String] this will return [QuoteStyle::None].
798 #[inline]
799 pub fn quote_style(&self) -> QuoteStyle {
800 if self.kind_bits() == Kind::String as u8 {
801 if self.third_bit_is_set() {
802 return QuoteStyle::Double;
803 } else {
804 return QuoteStyle::Single;
805 }
806 }
807 QuoteStyle::None
808 }
809
810 /// Returns a new [Token] with the [QuoteStyle] set to the given [QuoteStyle], if possible.
811 ///
812 /// If the [Token] is not a [Kind::String], or the [QuoteStyle] is already the given [QuoteStyle] this will return the same [Token].
813 /// If the [QuoteStyle] is different it will return a new [Token].
814 /// [QuoteStyle] must not be [QuoteStyle::None]
815 #[inline]
816 pub fn with_quotes(&self, quote_style: QuoteStyle) -> Token {
817 debug_assert!(quote_style != QuoteStyle::None);
818 if self.kind_bits() != Kind::String as u8 || quote_style == self.quote_style() {
819 return *self;
820 }
821 Token::new_string(quote_style, self.has_close_quote(), self.contains_escape_chars(), self.len())
822 }
823
824 /// If the [Token] is a [Kind::String] this checks if the string ended in a close quote.
825 /// It is possible to have a valid String token that does not end in a close quote, by eliding the quote at the end of
826 /// a file.
827 ///
828 /// Asserts: The [Kind] is [Kind::String].
829 #[inline]
830 pub const fn has_close_quote(&self) -> bool {
831 debug_assert!(self.kind_bits() == Kind::String as u8);
832 self.second_bit_is_set()
833 }
834
835 /// Checks if it is possible for the [Token] to contain escape characters. Numbers, for example, cannot. Idents can.
836 #[inline]
837 pub const fn can_escape(&self) -> bool {
838 self.kind_bits() == Kind::String as u8 || self.kind_bits() == Kind::Dimension as u8 || self.is_ident_like()
839 }
840
841 /// If the [Token] can escape, checks if the underlying source text contained escape characters.
842 ///
843 /// Asserts: The token can escape ([Token::can_escape()]).
844 #[inline]
845 pub const fn contains_escape_chars(&self) -> bool {
846 if self.kind_bits() == Kind::Dimension as u8 {
847 // Always assume Dimension contains escape because we have other fast paths to handle dimension units
848 return true;
849 }
850 self.can_escape() && self.first_bit_is_set()
851 }
852
853 /// If the [Token] is Ident like, checks if the first two code points are HYPHEN-MINUS (`-`).
854 ///
855 /// Asserts: The token is "ident like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
856 #[inline]
857 pub const fn is_dashed_ident(&self) -> bool {
858 debug_assert!(self.is_ident_like());
859 self.second_bit_is_set()
860 }
861
862 /// Checks if the [Token] is Ident like and none of the characters are ASCII upper-case.
863 #[inline]
864 pub const fn is_lower_case(&self) -> bool {
865 self.is_ident_like() && !self.third_bit_is_set()
866 }
867
868 #[inline]
869 pub fn atom_bits(&self) -> u32 {
870 if self.kind_bits() == Kind::Dimension as u8 && self.first_bit_is_set() {
871 self.0 & 0b111_1111
872 } else if self.is_ident_like() && self.kind_bits() != Kind::Hash as u8 {
873 self.0 & LENGTH_MASK
874 } else {
875 0
876 }
877 }
878
879 /// Checks if the [Token] is Trivia-like, that is [Kind::Comment], [Kind::Whitespace], [Kind::Eof]
880 #[inline]
881 pub const fn is_trivia(&self) -> bool {
882 self.kind_bits() & 0b000011 == self.kind_bits()
883 }
884
885 /// If the [Token] is [Kind::Url], checks if there are leading Whitespace characters before the inner value.
886 ///
887 /// Asserts: The token is [Kind::Url].
888 #[inline]
889 pub const fn url_has_leading_space(&self) -> bool {
890 debug_assert!(self.kind_bits() == Kind::Url as u8);
891 self.second_bit_is_set()
892 }
893
894 /// If the [Token] is [Kind::Url], checks if the closing parenthesis is present.
895 ///
896 /// Asserts: The token is [Kind::Url].
897 #[inline]
898 pub const fn url_has_closing_paren(&self) -> bool {
899 debug_assert!(self.kind_bits() == Kind::Url as u8);
900 self.third_bit_is_set()
901 }
902
903 /// If the [Token] is [Kind::Hash], checks if the Hash is "ID-like" (i.e its first character is ASCII).
904 ///
905 /// Asserts: The token is [Kind::Hash].
906 #[inline]
907 pub const fn hash_is_id_like(&self) -> bool {
908 debug_assert!(self.kind_bits() == Kind::Hash as u8);
909 self.second_bit_is_set()
910 }
911
912 /// Checks if the [Token] is [Kind::BadString] or [Kind::BadUrl].
913 #[inline]
914 pub const fn is_bad(&self) -> bool {
915 (self.kind_bits() | 0b00001) & 0b11001 == 1
916 }
917
918 /// Checks if the [Token] is [Kind::CdcOrCdo] and is the CDC variant of that token.
919 #[inline]
920 pub const fn is_cdc(&self) -> bool {
921 self.kind_bits() == (Kind::CdcOrCdo as u8) && self.third_bit_is_set()
922 }
923
924 /// Some tokens may have a "leading" part:
925 /// - [Kind::AtKeyword] always starts with a `@`,
926 /// - [Kind::Hash] with a `#`.
927 /// - [Kind::String] with a `"` or `'`.
928 /// - [Kind::Comment] with a leading `/*` (or `//`).
929 /// - [Kind::Dimension] has a leading numeric portion.
930 /// - [Kind::Url] has the leading `url(` ident (which may vary in exact representation).
931 ///
932 /// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
933 /// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
934 /// data which is likely to be of greater use.
935 pub fn leading_len(&self) -> u32 {
936 match self.kind() {
937 Kind::AtKeyword | Kind::Hash | Kind::String => 1,
938 Kind::Dimension => self.numeric_len(),
939 Kind::Comment => 2,
940 Kind::Url => (self.0 & LENGTH_MASK) >> 12,
941 _ => 0,
942 }
943 }
944
945 /// Some tokens may have a "trailing" part:
946 /// - [Kind::Function] will always have an opening `(`.
947 /// - [Kind::String] may have a closing `"` or `'`.
948 /// - [Kind::Comment] may have a closing `*/`
949 /// - [Kind::Url] may have a clsoing `)`.
950 ///
951 /// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
952 /// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
953 /// data which is likely to be of greater use.
954 pub fn trailing_len(&self) -> u32 {
955 match self.kind() {
956 Kind::Function => 1,
957 Kind::String => self.has_close_quote() as u32,
958 Kind::Comment if self.comment_style().unwrap().is_block() => 2,
959 Kind::Url => self.0 & !HALF_LENGTH_MASK,
960 _ => 0,
961 }
962 }
963
964 /// Certain kinds have a [PairWise] equivalent:
965 /// - [Kind::LeftParen] has [Kind::RightParen]
966 /// - [Kind::LeftCurly] has [Kind::RightCurly]
967 /// - [Kind::LeftSquare] has [Kind::RightSquare]
968 ///
969 /// This function returns the [PairWise] enum, if the [Token] is one of the above listed [Kinds][Kind]. For any other
970 /// [Kind] this returns [None].
971 #[inline]
972 pub fn to_pairwise(&self) -> Option<PairWise> {
973 PairWise::from_token(self)
974 }
975
976 /// A convenience function for `Cursor::new(offset, token)`.
977 #[inline(always)]
978 pub fn with_cursor(self, offset: SourceOffset) -> Cursor {
979 Cursor::new(offset, self)
980 }
981
982 /// If the [Kind] is [Kind::Hash] then this token may have had the opportunity to be parsed as a `<hex-value>` (e.g.
983 /// `#fff`). When this happens the character data is parsed during tokenization into a u32 which stores the
984 /// RR,GG,BB,AA values.
985 #[inline(always)]
986 pub fn hex_value(self) -> u32 {
987 if self == Kind::Hash { self.1 } else { 0 }
988 }
989
990 /// If this [Token] is preceded by the [Token] `other` then a separating token (e.g. a comment) will need to be
991 /// inserted between these the two tokens during serialization, in order for them to be able to be re-tokenized as
992 /// the same tokens. For example an Ident ("a") adjacent to an Ident ("b"), if serialized without whitespace, would
993 /// create a single Ident ("ab"). The rules for estbalishing whether or not these tokens needs whitespace are quite
994 /// simple and are effectively [defined in the serialization section of the spec][1]. To reproduce the table:
995 ///
996 /// [1]: https://drafts.csswg.org/css-syntax/#serialization
997 ///
998 /// | | ident | function | url | bad url | - | number | percentage | dimension | CDC | ( | * | % |
999 /// |:-----------|:-----:|:--------:|:---:|:-------:|:-:|:------:|:----------:|:---------:|:---:|:-:|:-:|:-:|
1000 /// | ident | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | |
1001 /// | at-keyword | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
1002 /// | hash | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
1003 /// | dimension | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
1004 /// | # | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
1005 /// | \- | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
1006 /// | number | ✗ | ✗ | ✗ | ✗ | | ✗ | ✗ | ✗ | ✗ | | | ✗ |
1007 /// | @ | ✗ | ✗ | ✗ | ✗ | ✗ | | | | ✗ | | | |
1008 /// | . | | | | | | ✗ | ✗ | ✗ | | | | |
1009 /// | + | | | | | | ✗ | ✗ | ✗ | | | | |
1010 /// | / | | | | | | | | | | | ✗ | |
1011 ///
1012 /// The one exception not in this table is that two consecutive `/` characters should also be separated by spaces in
1013 /// order to avoid abmiguities with CSS-alike languages that treat two consecutive `/` characters as a single line
1014 /// comment.
1015 ///
1016 /// # Example
1017 ///
1018 /// ```
1019 /// use css_lexer::*;
1020 /// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "10 %");
1021 /// let first = lexer.advance();
1022 /// let _ = lexer.advance(); // Whitespace
1023 /// let second = lexer.advance();
1024 /// assert!(first.needs_separator_for(second));
1025 /// ```
1026 pub fn needs_separator_for(&self, second: Token) -> bool {
1027 if second == AssociatedWhitespaceRules::EnforceBefore && *self != Kind::Whitespace
1028 || *self == AssociatedWhitespaceRules::EnforceAfter && second != Kind::Whitespace
1029 {
1030 // We need whitespace after, unless the next token is actually whitespace.
1031 return true;
1032 }
1033 if *self == AssociatedWhitespaceRules::BanAfter {
1034 return false;
1035 }
1036 match self.kind() {
1037 Kind::Ident => {
1038 (matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1039 // numbers with a `-` need separating, but with `+` they do not.
1040 (!second.has_sign() || second.value() < 0.0))
1041 || matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1042 || matches!(second.char(), Some('(' | '-'))
1043 || second.is_cdc()
1044 }
1045 Kind::AtKeyword | Kind::Hash | Kind::Dimension => {
1046 (matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1047 // numbers with a `-` need separating, but with `+` they do not.
1048 (!second.has_sign() || second.value() < 0.0))
1049 || matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1050 || matches!(second.char(), Some('-'))
1051 || second.is_cdc()
1052 }
1053 Kind::Number => {
1054 matches!(
1055 second.kind(),
1056 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1057 ) || matches!(second.char(), Some('%'))
1058 || second.is_cdc()
1059 }
1060 _ => match self.char() {
1061 Some('#') => {
1062 matches!(
1063 second.kind(),
1064 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1065 ) || matches!(second.char(), Some('-'))
1066 || second.is_cdc()
1067 }
1068 Some('-') => {
1069 matches!(
1070 second.kind(),
1071 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1072 ) || matches!(second.char(), Some('-'))
1073 || second.is_cdc()
1074 }
1075 Some('@') => {
1076 matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1077 || matches!(second.char(), Some('-'))
1078 || second.is_cdc()
1079 }
1080 Some('.') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1081 Some('+') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1082 Some('/') => matches!(second.char(), Some('*' | '/')),
1083 _ => false,
1084 },
1085 }
1086 }
1087}
1088
1089impl core::fmt::Debug for Token {
1090 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1091 let mut d = f.debug_struct(format!("Token::{}", self.kind().as_str()).as_str());
1092 match self.kind() {
1093 Kind::Eof => &mut d,
1094 Kind::Number => d.field("value", &self.value()).field("len", &self.numeric_len()),
1095 Kind::Dimension => {
1096 d.field("value", &self.value()).field("len", &self.numeric_len()).field("dimension_len", &self.len())
1097 }
1098 _ if self.is_delim_like() => {
1099 d.field("char", &self.char().unwrap()).field("len", &self.len());
1100 if !self.associated_whitespace().is_none() {
1101 d.field("associated_whitespace", &self.associated_whitespace());
1102 }
1103 &mut d
1104 }
1105 Kind::String => d
1106 .field("quote_style", &if self.first_bit_is_set() { "Double" } else { "Single" })
1107 .field("has_close_quote", &self.second_bit_is_set())
1108 .field("contains_escape_chars", &self.third_bit_is_set())
1109 .field("len", &self.len()),
1110 Kind::Ident | Kind::Function | Kind::AtKeyword => d
1111 .field("is_lower_case", &self.first_bit_is_set())
1112 .field("is_dashed_ident", &self.second_bit_is_set())
1113 .field("contains_escape_chars", &self.third_bit_is_set())
1114 .field("len", &self.len()),
1115 Kind::Hash => d
1116 .field("is_lower_case", &self.first_bit_is_set())
1117 .field("hash_is_id_like", &self.second_bit_is_set())
1118 .field("contains_escape_chars", &self.third_bit_is_set())
1119 .field("len", &self.len()),
1120 Kind::Url => d
1121 .field("url_has_closing_paren", &self.first_bit_is_set())
1122 .field("url_has_leading_space", &self.second_bit_is_set())
1123 .field("contains_escape_chars", &self.third_bit_is_set())
1124 .field("len", &self.len()),
1125 Kind::CdcOrCdo => d.field("is_cdc", &self.first_bit_is_set()).field("len", &self.len()),
1126 Kind::Whitespace => d.field("contains", &self.whitespace_style()).field("len", &self.len()),
1127 _ => d
1128 .field("flag_0", &self.first_bit_is_set())
1129 .field("flag_1", &self.second_bit_is_set())
1130 .field("flag_2", &self.third_bit_is_set())
1131 .field("len", &self.len()),
1132 }
1133 .finish()
1134 }
1135}
1136
1137impl std::fmt::Display for Token {
1138 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1139 match self.kind() {
1140 Kind::Delim => write!(f, "Delim({})", self.char().unwrap()),
1141 k => write!(f, "{}", k.as_str()),
1142 }
1143 }
1144}
1145
/// Serializes a [Token] as a `Token` struct with two fields, `kind` (the kind's name) and `len`.
/// `Token::EMPTY` is serialized as `none`.
#[cfg(feature = "serde")]
impl serde::ser::Serialize for Token {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        use serde::ser::SerializeStruct;
        if *self == Self::EMPTY {
            return serializer.serialize_none();
        }
        // The declared field count must equal the number of `serialize_field` calls below; length-prefixed
        // formats rely on it.
        let mut state = serializer.serialize_struct("Token", 2)?;
        state.serialize_field("kind", self.kind().as_str())?;
        state.serialize_field("len", &self.len())?;
        state.end()
    }
}
1162
1163impl From<Token> for Kind {
1164 fn from(token: Token) -> Self {
1165 token.kind()
1166 }
1167}
1168
1169impl PartialEq<Kind> for Token {
1170 fn eq(&self, other: &Kind) -> bool {
1171 self.kind_bits() == *other as u8
1172 }
1173}
1174
1175impl From<Token> for KindSet {
1176 fn from(token: Token) -> Self {
1177 KindSet::new(&[token.kind()])
1178 }
1179}
1180
1181impl PartialEq<KindSet> for Token {
1182 fn eq(&self, other: &KindSet) -> bool {
1183 other.contains_bits(self.kind_bits())
1184 }
1185}
1186
1187impl From<Token> for QuoteStyle {
1188 fn from(token: Token) -> Self {
1189 token.quote_style()
1190 }
1191}
1192
1193impl PartialEq<QuoteStyle> for Token {
1194 fn eq(&self, other: &QuoteStyle) -> bool {
1195 &self.quote_style() == other
1196 }
1197}
1198
1199impl From<Token> for Whitespace {
1200 fn from(token: Token) -> Self {
1201 token.whitespace_style()
1202 }
1203}
1204
1205impl PartialEq<Whitespace> for Token {
1206 fn eq(&self, other: &Whitespace) -> bool {
1207 self.whitespace_style().intersects(*other)
1208 }
1209}
1210
1211impl PartialEq<AssociatedWhitespaceRules> for Token {
1212 fn eq(&self, other: &AssociatedWhitespaceRules) -> bool {
1213 self.associated_whitespace().intersects(*other)
1214 }
1215}
1216
1217impl PartialEq<CommentStyle> for Token {
1218 fn eq(&self, other: &CommentStyle) -> bool {
1219 self.comment_style().map(|style| &style == other).unwrap_or(false)
1220 }
1221}
1222
1223impl PartialEq<PairWise> for Token {
1224 fn eq(&self, other: &PairWise) -> bool {
1225 self.to_pairwise().map(|style| &style == other).unwrap_or(false)
1226 }
1227}
1228
1229impl PartialEq<char> for Token {
1230 fn eq(&self, other: &char) -> bool {
1231 self.char().map(|char| char == *other).unwrap_or(false)
1232 }
1233}
1234
// A Token must stay exactly two packed u32 words (8 bytes).
#[test]
fn size_test() {
    let bytes = ::std::mem::size_of::<Token>();
    assert_eq!(bytes, 8);
}
1239
// Whitespace tokens report their kind, their whitespace flags (by intersection) and their length.
#[test]
fn test_new_whitespace() {
    // The predefined single-character constants.
    for &(token, style) in
        &[(Token::SPACE, Whitespace::Space), (Token::TAB, Whitespace::Tab), (Token::NEWLINE, Whitespace::Newline)]
    {
        assert_eq!(token, Kind::Whitespace);
        assert_eq!(token, style);
    }

    let spaces = Token::new_whitespace(Whitespace::Space, 4);
    assert_eq!(spaces, Kind::Whitespace);
    assert_eq!(spaces.len(), 4);

    // Mixed flags compare equal to any flag they contain.
    assert_eq!(Token::new_whitespace(Whitespace::Space | Whitespace::Newline, 4), Whitespace::Space);
    assert_eq!(Token::new_whitespace(Whitespace::Tab | Whitespace::Space, 4), Whitespace::Tab);

    let newlines = Token::new_whitespace(Whitespace::Newline, 4);
    assert_eq!(newlines, Whitespace::Newline);
    assert_eq!(newlines.len(), 4);
}
1255
// Comment tokens report their kind and keep the comment style they were built with.
#[test]
fn test_new_comment() {
    let block = Token::new_comment(CommentStyle::Block, 4);
    assert_eq!(block, Kind::Comment);
    assert_eq!(block, CommentStyle::Block);

    let single = Token::new_comment(CommentStyle::Single, 4);
    assert_eq!(single, CommentStyle::Single);
}
1262
// Number tokens round-trip their value and length, and expose the sign / float flags.
#[test]
fn test_new_number() {
    let plain = Token::new_number(false, false, 3, 4.2);
    assert_eq!(plain, Kind::Number);
    assert_eq!(plain.value(), 4.2);
    assert_eq!(plain.len(), 3);
    assert!(!plain.has_sign());

    let signed_long = Token::new_number(false, true, 9, 4.2);
    assert_eq!(signed_long, Kind::Number);
    assert_eq!(signed_long.value(), 4.2);
    assert_eq!(signed_long.len(), 9);

    assert!(Token::new_number(false, true, 3, 4.2).has_sign());
    assert!(!Token::new_number(false, true, 3, 4.0).is_float());
    assert!(Token::new_number(true, false, 3, 4.2).is_float());
}
1276
// String tokens keep their quote style, close-quote flag, escape flag and length.
#[test]
fn test_new_string() {
    let single = Token::new_string(QuoteStyle::Single, false, false, 4);
    assert_eq!(single, Kind::String);
    assert_eq!(single, QuoteStyle::Single);
    assert!(!single.has_close_quote());
    assert!(!single.contains_escape_chars());
    assert_eq!(single.len(), 4);

    let double = Token::new_string(QuoteStyle::Double, false, false, 4);
    assert_eq!(double, Kind::String);
    assert_eq!(double, QuoteStyle::Double);

    let closed = Token::new_string(QuoteStyle::Double, true, false, 4);
    assert!(closed.has_close_quote());
    assert!(!closed.contains_escape_chars());
    assert_eq!(Token::new_string(QuoteStyle::Double, true, false, 5).len(), 5);

    // The escape flag is independent of the close-quote flag.
    assert!(Token::new_string(QuoteStyle::Double, true, true, 4).contains_escape_chars());
    assert!(Token::new_string(QuoteStyle::Double, false, true, 4).contains_escape_chars());
}
1292
// Hash tokens expose their escape / case flags, their length and the parsed hex value.
#[test]
fn test_new_hash() {
    let plain = Token::new_hash(false, false, false, 4, 0);
    assert_eq!(plain, Kind::Hash);
    assert!(!plain.contains_escape_chars());

    let escaped = Token::new_hash(false, false, true, 4, 0);
    assert!(escaped.contains_escape_chars());
    assert!(escaped.is_lower_case());

    let mixed_case = Token::new_hash(true, false, false, 4, 0);
    assert!(!mixed_case.is_lower_case());
    assert_eq!(mixed_case.len(), 4);
    assert_eq!(mixed_case.hex_value(), 0);

    assert_eq!(Token::new_hash(true, false, false, 4, 18).hex_value(), 18);
}
1304
// Building a string token with `QuoteStyle::None` is expected to panic.
#[test]
#[should_panic]
fn test_new_string_with_quotes_none() {
    let _ = Token::new_string(QuoteStyle::None, false, true, 4);
}
1310
// Delim tokens keep their character; `len()` is the character's UTF-8 encoded length.
#[test]
fn test_new_delim() {
    for &(ch, utf8_len) in &[('>', 1), ('.', 1), ('ℝ', 3), ('💣', 4)] {
        let token = Token::new_delim(ch);
        assert_eq!(token, Kind::Delim);
        assert_eq!(token, ch);
        assert_eq!(token.len(), utf8_len);
    }
}
1328
// Applying whitespace rules to a delim yields a token that compares equal (by intersection) to those rules.
#[test]
fn with_associated_whitespace() {
    let rules = AssociatedWhitespaceRules::EnforceBefore | AssociatedWhitespaceRules::EnforceAfter;
    assert_eq!(
        Token::new_delim('>').with_associated_whitespace(
            AssociatedWhitespaceRules::EnforceBefore | AssociatedWhitespaceRules::EnforceAfter
        ),
        rules
    );
}
1338
// Swapping quote style changes only the quote style; the other flags and the length are preserved.
#[test]
fn test_with_quotes() {
    let single_open = Token::new_string(QuoteStyle::Single, false, false, 4);
    let double_open = Token::new_string(QuoteStyle::Double, false, false, 4);
    assert_eq!(single_open.with_quotes(QuoteStyle::Double), double_open);

    let double_closed = Token::new_string(QuoteStyle::Double, true, true, 8);
    let single_closed = Token::new_string(QuoteStyle::Single, true, true, 8);
    assert_eq!(double_closed.with_quotes(QuoteStyle::Single), single_closed);
}
1350
// Requesting `QuoteStyle::None` via `with_quotes` is expected to panic.
#[test]
#[should_panic]
fn test_with_quotes_none() {
    let single = Token::new_string(QuoteStyle::Single, false, true, 4);
    single.with_quotes(QuoteStyle::None);
    let double = Token::new_string(QuoteStyle::Double, false, true, 4);
    double.with_quotes(QuoteStyle::None);
}
1357
// Dimension tokens round-trip their value, split their length into numeric and unit parts, and expose flags.
#[test]
fn test_new_dimension() {
    let three_plus_three = Token::new_dimension(false, false, 3, 3, 999.0, 0);
    assert_eq!(three_plus_three, Kind::Dimension);
    assert_eq!(three_plus_three.value(), 999.0);
    assert_eq!(three_plus_three.numeric_len(), 3);
    assert_eq!(three_plus_three.len(), 6);
    assert!(!three_plus_three.is_float());
    assert!(!three_plus_three.has_sign());

    let five_plus_two = Token::new_dimension(false, false, 5, 2, 8191.0, 0);
    assert_eq!(five_plus_two, Kind::Dimension);
    assert_eq!(five_plus_two.value(), 8191.0);
    assert_eq!(five_plus_two.numeric_len(), 5);
    assert_eq!(five_plus_two.len(), 7);
    assert!(!five_plus_two.is_float());
    assert!(!five_plus_two.has_sign());

    // Every integer value in the range must survive the round trip through the packed representation.
    for n in -8191..8191 {
        let expected = n as f32;
        let token = Token::new_dimension(false, false, 9, 3, expected, 0);
        assert_eq!(token.value(), expected);
    }
}