css_lexer/
token.rs

1use crate::{
2	AssociatedWhitespaceRules, CommentStyle, Cursor, Kind, KindSet, PairWise, QuoteStyle, SourceOffset, Whitespace,
3};
4use std::char::REPLACEMENT_CHARACTER;
5
6/// An abstract representation of the chunk of the source text, retaining certain "facts" about the source.
7///
8/// # Design
9///
10/// The [Token] type is an immutable packing of two [u32s][u32] that represents a unit in the source text, but without
11/// the associated offset data that points to its position in the source text. This is important because it means that
12/// equivalent [Tokens][Token] are equal even in different parts of the document. For the most part a [Token] doesn't
13/// represent data that can be put into a text file because it lacks the underlying character data. It is lossy. For
/// example a [Token] with [Kind::Ident] just represents _an_ ident, but it doesn't retain what the keyword is.
15/// Storing raw-character data would require either storing tokens on the heap (and therefore they couldn't be [Sized])
16/// or by keeping a reference to `&'a str` which means larger token sizes and lifetime tracking. By _not_ storing
17/// character data we can keep [Token] [Sized] and keep it to `size_of` `8`, avoiding the heap, avoiding
18/// references/lifetimes, and keeping [Token] entirely in the stack. For a lot of tokens this is _fine_ because the
19/// underlying character data isn't that useful past a certain point.
20///
21/// A [Token] retains certain "facts" about the underlying unit of text, though. For example it retains the [Kind], how
22/// many characters the token consumed, and various other pieces of information, depending on the [Kind]. In some
23/// cases, it's entirely possible to represent the full token, including character data, into the available bits (for
24/// example [Kind::Delim] stores its [char], [Kind::Number] stores its [f32]). Taking the time in the tokenizer to
25/// gather these facts and values can keep cache-lines hot, which speeds up subsequent checks in the parser.
26///
27/// If you're familiar with "red green" syntax trees such as [Swiftlang's libsyntax][1], or [Rust-Analyzer's Rowan][2]
28/// or [Roslyn][3] this might be a little familiar in some concepts. However [Token] does not represent a tree, and
29/// relies on resorting back to the string data to find out keyword values.
30///
31/// [1]: https://gh.io/AAtdqpg
32/// [2]: https://gh.io/AAtf8pt
33/// [3]: https://gh.io/AAtab90
34///
35/// This representation of facts, kind, length, or other metadata can be quite complex - so here's a
36/// full breakdown:
37///
38/// # Anatomy of Token
39///
40/// A [Token] is a struct of `(u32, u32)`. The second u32 is _usually_ the token length (hence keeping them separate).
41/// The first [u32], however, is split into 3 (sometimes 5) parts. The two u32s can be thought of like so:
42///
43/// ```md
44///   |-----|-------|--------------------------|---------------------------------|
45///   | TF  | K     | VD                       | Value                           |
46/// 0b| 000 | 00000 | 000000000000000000000000 | 0000000000000000000000000000000 |
47///   |-----|-------|--------------------------|---------------------------------|
48///   | 3-- | 5---- | 24---------------------- | 32----------------------------- |
49/// ```
50///
51/// ## TF = Type Flags (or "Token Facts")
52///
53/// This represents a bit-mask in the upper-most 3 bits. The flags are general purpose and change meaning depending on
/// the Token's [Kind]. Each flag generally maps to a method so it's not necessary to remember the contents of this
55/// table, but it can serve as a useful reference. Note that not all methods return a [bool], so footnotes have been
56/// added to explain these further.
57///
58/// | Kind::             | Flag  | Description                 | Method                                   |
59/// |--------------------|-------|-----------------------------|------------------------------------------|
60/// | [Kind::Number]     | `001` | Floating Point              | [Token::is_float()]                      |
61/// |                    | `010` | Has a "Sign" (-/+)          | [Token::has_sign()]                      |
62/// |                    | `100` | (Reserved)                  | --                                       |
63/// | [Kind::Dimension]  | `001` | Floating Point              | [Token::is_float()]                      |
64/// |                    | `010` | Has a "Sign" (-/+)          | [Token::has_sign()]                      |
65/// |                    | `100` | Unit is a known dimension   | [Token::atom_bits()][^dimension]         |
66/// | [Kind::String]     | `001` | Uses Double Quotes          | [Token::quote_style()][^quotes]          |
67/// |                    | `010` | Has a closing quote         | [Token::has_close_quote()]               |
68/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
69/// | [Kind::Ident]      | `001` | Contains non-lower-ASCII    | [Token::is_lower_case()]                 |
70/// |                    | `010` | Is a "Dashed Ident"         | [Token::is_dashed_ident()]               |
71/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
72/// | [Kind::Function]   | `001` | Contains non-lower-ASCII    | [Token::is_lower_case()]                 |
73/// |                    | `010` | Is a "Dashed Ident"         | [Token::is_dashed_ident()]               |
74/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
75/// | [Kind::AtKeyword]  | `001` | Contains non-lower-ASCII    | [Token::is_lower_case()]                 |
76/// |                    | `010` | Is a "Dashed Ident"         | [Token::is_dashed_ident()]               |
77/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
78/// | [Kind::Hash]       | `001` | Contains non-lower-ASCII    | [Token::is_lower_case()]                 |
79/// |                    | `010` | First character is ASCII    | [Token::hash_is_id_like()]               |
80/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
81/// | [Kind::Url]        | `001` | Has a closing paren )       | [Token::url_has_closing_paren()]         |
82/// |                    | `010` | Contains whitespace after ( | [Token::url_has_leading_space()]         |
83/// |                    | `100` | Contains escape characters  | [Token::contains_escape_chars()]         |
/// | [Kind::CdcOrCdo]   | `001` | Is CDC (`000` would be CDO) | [Token::is_cdc()]                        |
85/// |                    | `010` | (Reserved)                  | --                                       |
86/// |                    | `100` | (Reserved)                  | --                                       |
87/// | [Kind::Whitespace] | `---` | Whitespace style            | [Token::whitespace_style()][^whitespace] |
88/// | [Kind::Delim]      | `---` | Associate whitespace rules  | [Token::associated_whitespace()][^delim] |
89/// | [Kind::Comment]    | `---` | (Special)                   | [Token::comment_style()][^comments]      |
90///
/// [^quotes]: Strings do not have a [bool] returning method for whether or not the quote is using double or single
/// quotes, instead the [Token::quote_style()] method will return the [QuoteStyle] enum for better readability.
/// [^whitespace]: Whitespace tokens do not have a [bool] returning method, instead [Token::whitespace_style()] will
/// return the [Whitespace] enum for improved readability.
95/// [^comments]: Rather than using the 3 bits as a bit-mask, Comment tokens use the data to store the [CommentStyle]
96/// enum, which is capable of representing 8 discrete comment styles.
97/// [^delim]: Delims can be used in interesting ways inside of CSS syntax. At higher levels CSS is _sometimes_
98/// whitespace sensitive, for example the whitespace inside of a CSS selector _sometimes_ represents the descendant
99/// combinator, meanwhile delimiters inside calc() are sensitive to whitespace collapse (`calc(1px + 1px)` is valid
100/// while `calc(1px+1px)` is a parse error). Further to this, introducing whitespace (say through a formatter) might
101/// break in interesting ways due to some combinations of Delims & Idents - for example Pseudo Classes like `:hover`,
102/// or CSS like languages such as SASS using `$var` style syntax. While `:hover` and `$var` are comprised of two tokens
103/// they're considered one conceptual unit. Having a way to express these relationships at the token level can be useful
104/// for other low level machinery such as formatters/minifiers, rather than introducing complex state at higher levels.
105/// For these reasons, Delim tokens have the ability to express their whitespace association. The lexer will always
106/// produce a token with empty whitespace rules, but parsers can replace this token with a more complex set of rules.
107///
108/// ## K = Kind Bits
109///
/// The `K` value - upper-most bits 4-8 - stores the 5-bit [Kind].
111///
112/// ## VD = Value Data
113///
114/// The `VD` value - the lower-most 24-bits - stores data depending on the [Token] [Kind]. For most kinds this data is
115/// reserved (just 0s). The value data cannot be interrogated manually, but it packs in additional data about the
116/// underlying string to make the string easier to parse without doing the same lookups that the tokenizer already had
117/// to - such as determining lengths of the various parts of the token, or packing values so that consulting the string
118/// can be avoided (which keeps cache-lines hot).
119///
120/// Below describes the special kinds which use the Value Data to store yet more information about the token...
121///
122/// ### Value Data for [Kind::Number]
123///
124/// If the [Kind] is [Kind::Number], Value Data represents the length of that number (this means the parser is
125/// restricted from representing numbers longer than 16,777,216 characters which is probably an acceptable limit). Note
126/// that this does not affect the _value_ of a number, just the characters in a string. Numbers in CSS are [f32]. The
127/// vast majority of [f32s][f32] can be represented in 16MM characters, but it's possible to author a document that
/// contains a set of numeric characters longer than 16MM code points. These scenarios are considered [undefined
/// behaviour][4].
130///
131/// [4]: https://en.wikipedia.org/wiki/Undefined_behavior
132///
133/// ### Value Data for [Kind::Hash]
134///
135/// If the [Kind] is [Kind::Hash], Value Data represents the length of that hash (this means the parser is restricted
136/// from representing IDs and hex codes longer than 16,777,216 characters which is probably an acceptable limit). Note
/// that this restriction means that ID selectors have a much tighter limit than other tokens, such as strings or
138/// idents, but it's very unlikely to see a 16million character ID in CSS (String, maybe).
139///
140/// ### Value Data for [Kind::Url]
141///
142/// If the [Kind] is [Kind::Url], Value Data represents the "leading length" and "trailing length" of the URL. This
143/// means the value data is split into two 12 bit numbers:
144///
145/// ```md
146/// |--------------|--------------|
147/// | LL           | TL           |
148/// | 000000000000 | 000000000000 |
149/// |--------------|--------------|
150/// | 12---------- | 12---------- |
151/// ```
152///
153/// The "leading" length represents the `url(` part of the token. Typically this will be `4`, however it's possible
154/// (for legacy compatibility reasons within CSS) to add whitespace between the opening parenthesis and the URL value.
155/// It's also possible to escape the `url` ident portion. This means `\75\52\6c(   ` is also a valid leading section of
156/// a URL ident (which has a character length of 13), as is `\000075 \000052 \00006c (   ` (28 characters). 12 bits
157/// allows for a maximum character length of 4,096. It is not possible to represent a URL token's leading section using
158/// 4,096 characters so there is some headroom (wasted bytes) here.
159///
160/// The "trailing" length represents the `)` part of the token. Typically this will be `1`, however it's possible to
161/// add any number of whitespace characters between the end of the URL and the closing parenthesis. If a CSS document
162/// contains more than 4095 whitespace characters then this is considered [undefined behaviour][4].
163///
164/// ### Value Data for [Kind::Dimension]
165///
166/// If K is a Dimension, then this represents both the number of characters in the numeric portion of the dimension
167/// and the length of the ident portion of the dimension... or the dimension unit itself (more on that below). This
168/// means the value data is split into two 12 bit numbers:
169///
170/// ```md
171/// |--------------|--------------|
172/// | NL           | DUL          |
173/// | 000000000000 | 000000000000 |
174/// |--------------|--------------|
175/// | 12---------- | 12---------- |
176///
177/// |--------------|-------| --------|
178/// | NL           | KDUL  | KNOWN   |
179/// | 000000000000 | 00000 | 0000000 |
180/// |--------------|-------| --------|
181/// | 12---------- | 5---- | 7------ |
182/// ```
183///
184/// The NL portion - the numeric length - represents the length of characters the number contains. This means the
185/// numeric portion of a dimension can only be 4,096 characters long. This is dramatically shorter than the 16MM
186/// allowed for numbers but it's still also incredibly generous such that it's highly unlikely to ever be hit unless
187/// someone is intentionally trying to break the parser. The [Lexer][super::Lexer] encountering a dimension with a
188/// numeric portion longer than 4,096 characters is considered [undefined behaviour][4].
189///
190/// The DUL portion (if `TF & 100 == 0`) will represent the length of characters the ident portion of the dimension
191/// (aka the dimension unit) contains. This means the ident portion of a dimension can only be 4,096 characters long.
192/// For practical purposes CSS has a fixed set of dimensions - the longest of which (at the time of writing) are 5
193/// characters long (e.g. `svmax`). Through the use of escaping shenanigans it's possible to create a valid CSS
194/// dimension longer than 5 characters though (every ident can be made 8 times longer by using escape characters, e.g.
195/// `1svmax` at 6 characters can be instead written as `1\000073 \000076 \00006d \000061 \000078` at 40 characters). In
196/// addition to these factors, it's worth pointing out that there is scope for further dimensions and some [proposals
197/// for "custom" dimensions][5], and lastly this library is designed for CSS _and CSS-alike_ languages, which may
198/// invent their own dimension units. In other words being too restrictive on dimension ident length could be costly
199/// in the future, therefore 4,096 characters seems like a reasonable, if generous, trade-off.
200///
201/// There's a giant caveat here though. If `TF & 100 != 0`, then the dimension is considered "known" and DUL will be
202/// encoded differently. Instead of just containing the dimension unit length, which requires consulting the underlying
203/// `&str` to get the actual dimension, it will be used to store an Atom - but only the first 7 bits (the KNOWN
/// portion), which for an Atom must be a Dimension atom (an assumption made on anything that implements
205/// [AtomSet][crate::AtomSet] is that all dimension units should be stored in the byte values of 1-127, so that they
206/// can be encoded in this space). Dimension units _can_ be escape encoded, and so the underlying character data may
/// differ from the unescaped unit length, as such the 5-bit KDUL portion represents character data length, in other
/// words `KNOWN.len()` may not always equal `KDUL`.
209///
210/// [5]: https://github.com/w3c/csswg-drafts/issues/7379
211///
212/// ## Value
213///
214/// The `Value` portion of [Token] represents the length of the token for most token kinds. However, for some tokens
215/// their length is already packed into the first u32. So it would make more sense to use this u32 to store more
216/// interesting data.
217///
218/// ## Value for [Kind::Delim] and single character tokens
219///
220/// [Kind::Delim] and single-character tokens (i.e. [Kind::Colon]->[Kind::RightCurly]) typically have a length of `1`
221/// ([Kind::Delim] can have a varied length for surrogate pairs). Instead of storing the length and wasting a whole
222/// [u32], this region stores the [char]. Calling [Token::char()] will return an [Option] which will always be [Some]
223/// for [Kind::Delim] and single-character tokens.
224///
225/// ## Value for [Kind::Hash]
226///
227/// The length of a hash is stored in its `VD` portion, leaving 32bits to storing other data. It just so happens that
228/// a 8-character hex code (#ffaabbcc) fits nicely inside of 32-bits. During tokenization we can eagerly parse the hex
229/// code and stuff it here, so it can be more easily reasoned about in upstream code (rather than
230/// reading the character data).
231///
232/// ## Value for [Kind::Number] and [Kind::Dimension]
233///
234/// As these tokens store their length data in the `VD` portion, this [u32] instead stores the _value_ of the number,
235/// stored as [f32::to_bits()].
236///
237/// ## Value data for other tokens.
238///
/// In all other cases, this represents the length of the token as utf-8 bytes. This means the maximum token length is
/// 4,294,967,295 bytes, aka ~4GB. This sounds very long but also CSS can host very large image data and browsers will
/// accommodate very large URLs. [An mdn article on Data URLs][6] claims that Firefox supports 32mb Data URLs, Chrome
242/// supports over 512mb, and Safari over 2gb. The reality is that if someone has such a large data URL in their CSS
243/// they probably should split it out, but we have a whole 32 bits to store the length so we may as well use it...
244///
245/// [6]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs#common_problems
// Field `.0` packs the type flags, the kind bits and the value data; field `.1` is the "Value" word, which is
// usually the token length but is repurposed for some kinds (see the struct documentation).
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token(u32, u32);
248
249impl Default for Token {
250	fn default() -> Self {
251		Self((Kind::Whitespace as u32) << 24, 0)
252	}
253}
254
/// Selects the upper 8 bits of the first word: the 3 type-flag bits followed by the 5 kind bits.
const KIND_MASK: u32 = 0xFF00_0000;
/// Selects the lower 24 bits of the first word: the "Value Data" region.
const LENGTH_MASK: u32 = 0x00FF_FFFF;
/// Selects every bit *above* the lowest 12. Its negation (`!HALF_LENGTH_MASK`) selects the lowest 12 bits, i.e. the
/// lower half of the Value Data.
const HALF_LENGTH_MASK: u32 = 0xFFFF_F000;
258
259impl Token {
260	/// Represents an empty token.
261	pub const EMPTY: Token = Token::new_whitespace(Whitespace::none(), 0);
262
263	/// Represents an EOF token.
264	pub const EOF: Token = Token(0b0, 0);
265
266	/// Represents a CDO (`<!--`) token.
267	pub const CDO: Token = Token(((Kind::CdcOrCdo as u32) << 24) & KIND_MASK, 4);
268
269	/// Represents a CDC (`-->`) token.
270	pub const CDC: Token = Token((((Kind::CdcOrCdo as u32) | 0b001_00000) << 24) & KIND_MASK, 3);
271
272	/// Represents a single ' ' space token.
273	pub const SPACE: Token = Token::new_whitespace(Whitespace::Space, 1);
274
275	/// Represents a single Tab token.
276	pub const TAB: Token = Token::new_whitespace(Whitespace::Tab, 1);
277
278	/// Represents a single `\n` token.
279	pub const NEWLINE: Token = Token::new_whitespace(Whitespace::Newline, 1);
280
281	/// Represents the Number `0`. This is not equal to other representations of zero, such as `00`, `0e0`, `0.0` and so
282	/// on.
283	pub const NUMBER_ZERO: Token = Token((((Kind::Number as u32) | 0b100_00000) << 24) & KIND_MASK, 1);
284
285	/// Represents the `:` token.
286	pub const COLON: Token = Token::new_delim_kind(Kind::Colon, ':');
287
288	/// Represents the `;` token.
289	pub const SEMICOLON: Token = Token::new_delim_kind(Kind::Semicolon, ';');
290
291	/// Represents the `,` token.
292	pub const COMMA: Token = Token::new_delim_kind(Kind::Comma, ',');
293
294	/// Represents the `[` token.
295	pub const LEFT_SQUARE: Token = Token::new_delim_kind(Kind::LeftSquare, '[');
296
297	/// Represents the `]` token.
298	pub const RIGHT_SQUARE: Token = Token::new_delim_kind(Kind::RightSquare, ']');
299
300	/// Represents the `(` token.
301	pub const LEFT_PAREN: Token = Token::new_delim_kind(Kind::LeftParen, '(');
302
303	/// Represents the `)` token.
304	pub const RIGHT_PAREN: Token = Token::new_delim_kind(Kind::RightParen, ')');
305
306	/// Represents the `{` token.
307	pub const LEFT_CURLY: Token = Token::new_delim_kind(Kind::LeftCurly, '{');
308
309	/// Represents the `}` token.
310	pub const RIGHT_CURLY: Token = Token::new_delim_kind(Kind::RightCurly, '}');
311
312	/// Represents a `!` [Kind::Delim] token.
313	pub const BANG: Token = Token::new_delim('!');
314
315	/// Represents a `#` [Kind::Delim] token.
316	pub const HASH: Token = Token::new_delim('#');
317
318	/// Represents a `$` [Kind::Delim] token.
319	pub const DOLLAR: Token = Token::new_delim('$');
320
321	/// Represents a `%` [Kind::Delim] token - not to be confused with the `%` dimension.
322	pub const PERCENT: Token = Token::new_delim('%');
323
324	/// Represents a `&` [Kind::Delim] token.
325	pub const AMPERSAND: Token = Token::new_delim('&');
326
327	/// Represents a `*` [Kind::Delim] token.
328	pub const ASTERISK: Token = Token::new_delim('*');
329
330	/// Represents a `+` [Kind::Delim] token.
331	pub const PLUS: Token = Token::new_delim('+');
332
333	/// Represents a `-` [Kind::Delim] token.
334	pub const DASH: Token = Token::new_delim('-');
335
336	/// Represents a `.` [Kind::Delim] token.
337	pub const PERIOD: Token = Token::new_delim('.');
338
339	/// Represents a `/` [Kind::Delim] token.
340	pub const SLASH: Token = Token::new_delim('/');
341
342	/// Represents a `<` [Kind::Delim] token.
343	pub const LESS_THAN: Token = Token::new_delim('<');
344
345	/// Represents a `=` [Kind::Delim] token.
346	pub const EQUALS: Token = Token::new_delim('=');
347
348	/// Represents a `>` [Kind::Delim] token.
349	pub const GREATER_THAN: Token = Token::new_delim('>');
350
351	/// Represents a `?` [Kind::Delim] token.
352	pub const QUESTION: Token = Token::new_delim('?');
353
354	/// Represents a `@` [Kind::Delim] token. Not to be confused with the @keyword token.
355	pub const AT: Token = Token::new_delim('@');
356
357	/// Represents a `\\` [Kind::Delim] token.
358	pub const BACKSLASH: Token = Token::new_delim('\\');
359
360	/// Represents a `^` [Kind::Delim] token.
361	pub const CARET: Token = Token::new_delim('^');
362
363	/// Represents a `_` [Kind::Delim] token.
364	pub const UNDERSCORE: Token = Token::new_delim('_');
365
366	/// Represents a `\`` [Kind::Delim] token.
367	pub const BACKTICK: Token = Token::new_delim('\'');
368
369	/// Represents a `|` [Kind::Delim] token.
370	pub const PIPE: Token = Token::new_delim('|');
371
372	/// Represents a `~` [Kind::Delim] token.
373	pub const TILDE: Token = Token::new_delim('~');
374
375	/// Represents a replacement character [Kind::Delim] token.
376	pub const REPLACEMENT_CHARACTER: Token = Token::new_delim(REPLACEMENT_CHARACTER);
377
378	/// Creates a "Dummy" token with no additional data, just the [Kind].
379	#[inline]
380	pub const fn dummy(kind: Kind) -> Self {
381		Self((kind as u32) << 24, 0)
382	}
383
384	/// Creates a "Dummy" token with no additional data, just [Kind::Ident].
385	#[inline]
386	pub const fn dummy_ident() -> Self {
387		Self((Kind::Ident as u32) << 24, 0)
388	}
389
390	/// Creates a [Kind::Whitesapce] token.
391	#[inline]
392	pub(crate) const fn new_whitespace(style: Whitespace, len: u32) -> Self {
393		let flags: u32 = Kind::Whitespace as u32 | ((style.to_bits() as u32) << 5);
394		Self((flags << 24) & KIND_MASK, len)
395	}
396
397	/// Creates a [Kind::Comment] token.
398	#[inline]
399	pub(crate) fn new_comment(style: CommentStyle, len: u32) -> Self {
400		let flags: u32 = Kind::Comment as u32 | ((style as u32) << 5);
401		Self((flags << 24) & KIND_MASK, len)
402	}
403
404	/// Creates a [Kind::Number] token.
405	#[inline]
406	pub(crate) fn new_number(is_float: bool, has_sign: bool, len: u32, value: f32) -> Self {
407		let flags: u32 = Kind::Number as u32 | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
408		Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), value.to_bits())
409	}
410
411	/// Creates a new [Kind::Dimension] token.
412	#[inline]
413	pub(crate) fn new_dimension(
414		is_float: bool,
415		has_sign: bool,
416		num_len: u32,
417		unit_len: u32,
418		value: f32,
419		atom: u8,
420	) -> Self {
421		debug_assert!(num_len <= 4097);
422		let num_len = (num_len << 12) & HALF_LENGTH_MASK;
423		let is_known_unit = if unit_len < 32 { ((atom != 0) as u32) << 7 } else { 0 };
424		let unit_len = if is_known_unit == 0 { unit_len } else { unit_len << 7 | (atom as u32 & 0b1111111) };
425		let flags: u32 = Kind::Dimension as u32 | is_known_unit | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
426		Self(((flags << 24) & KIND_MASK) | ((num_len | unit_len) & LENGTH_MASK), value.to_bits())
427	}
428
429	/// Creates a new [Kind::BadString] token. Bad Strings are like String tokens but during lexing they failed to fully tokenize
430	/// into a proper string token, usually due to containing newline characters.
431	#[inline]
432	pub(crate) fn new_bad_string(len: u32) -> Self {
433		Self(((Kind::BadString as u32) << 24) & KIND_MASK, len)
434	}
435
436	/// Creates a new [Kind::BadUrl] token. Bad URLs are like URL tokens but during lexing they failed to fully tokenize into a
437	/// proper URL token, usually due to containing newline characters.
438	#[inline]
439	pub(crate) fn new_bad_url(len: u32) -> Self {
440		Self(((Kind::BadUrl as u32) << 24) & KIND_MASK, len)
441	}
442
443	/// Creates a new [Kind::Ident] token.
444	#[inline]
445	pub(crate) fn new_ident(
446		contains_non_lower_ascii: bool,
447		dashed: bool,
448		contains_escape: bool,
449		atom: u32,
450		len: u32,
451	) -> Self {
452		let flags: u32 = Kind::Ident as u32
453			| ((contains_non_lower_ascii as u32) << 5)
454			| ((dashed as u32) << 6)
455			| ((contains_escape as u32) << 7);
456		debug_assert!(atom & LENGTH_MASK == atom);
457		Self((flags << 24) & KIND_MASK | atom, len)
458	}
459
460	/// Creates a new [Kind::Function] token.
461	#[inline]
462	pub(crate) fn new_function(
463		contains_non_lower_ascii: bool,
464		dashed: bool,
465		contains_escape: bool,
466		atom: u32,
467		len: u32,
468	) -> Self {
469		let flags: u32 = Kind::Function as u32
470			| ((contains_non_lower_ascii as u32) << 5)
471			| ((dashed as u32) << 6)
472			| ((contains_escape as u32) << 7);
473		debug_assert!(atom & LENGTH_MASK == atom);
474		Self((flags << 24) & KIND_MASK | atom, len)
475	}
476
477	/// Creates a new [Kind::AtKeyword] token.
478	#[inline]
479	pub(crate) fn new_atkeyword(
480		contains_non_lower_ascii: bool,
481		dashed: bool,
482		contains_escape: bool,
483		atom: u32,
484		len: u32,
485	) -> Self {
486		let flags: u32 = Kind::AtKeyword as u32
487			| ((contains_non_lower_ascii as u32) << 5)
488			| ((dashed as u32) << 6)
489			| ((contains_escape as u32) << 7);
490		debug_assert!(atom & LENGTH_MASK == atom);
491		Self((flags << 24) & KIND_MASK | atom, len)
492	}
493
494	/// Creates a new [Kind::Hash] token.
495	#[inline]
496	pub(crate) fn new_hash(
497		contains_non_lower_ascii: bool,
498		first_is_ascii: bool,
499		contains_escape: bool,
500		len: u32,
501		hex_value: u32,
502	) -> Self {
503		let flags: u32 = Kind::Hash as u32
504			| ((contains_non_lower_ascii as u32) << 5)
505			| ((first_is_ascii as u32) << 6)
506			| ((contains_escape as u32) << 7);
507		debug_assert!(len < (1 << 24));
508		Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), hex_value)
509	}
510
511	/// Creates a new [Kind::String] token.
512	#[inline]
513	pub(crate) fn new_string(quotes: QuoteStyle, has_close_quote: bool, contains_escape: bool, len: u32) -> Self {
514		debug_assert!(quotes != QuoteStyle::None);
515		let quotes = if quotes == QuoteStyle::Double { 0b001_00000 } else { 0b0 };
516		let flags: u32 =
517			Kind::String as u32 | quotes | ((has_close_quote as u32) << 6) | ((contains_escape as u32) << 7);
518		Self((flags << 24) & KIND_MASK, len)
519	}
520
521	/// Creates a new [Kind::Url] token.
522	#[inline]
523	pub(crate) fn new_url(
524		ends_with_paren: bool,
525		contains_whitespace_after_open_paren: bool,
526		contains_escape: bool,
527		leading_length: u32,
528		trailing_length: u32,
529		len: u32,
530	) -> Self {
531		let leading_length = (leading_length << 12) & HALF_LENGTH_MASK;
532		let flags: u32 = Kind::Url as u32
533			| ((ends_with_paren as u32) << 5)
534			| ((contains_whitespace_after_open_paren as u32) << 6)
535			| ((contains_escape as u32) << 7);
536		Self((flags << 24) & KIND_MASK | ((leading_length | trailing_length) & LENGTH_MASK), len)
537	}
538
539	/// Creates a new [Kind::Delim] token.
540	#[inline]
541	pub(crate) const fn new_delim(char: char) -> Self {
542		let flags: u32 = Kind::Delim as u32;
543		Self((flags << 24) & KIND_MASK, char as u32)
544	}
545
546	/// Creates a new [Kind::Delim] token.
547	#[inline]
548	pub(crate) const fn new_delim_kind(kind: Kind, char: char) -> Self {
549		let flags: u32 = kind as u32;
550		Self((flags << 24) & KIND_MASK, char as u32)
551	}
552
553	/// Creates a new [Kind::Delim] token with associated whitespace.
554	#[inline]
555	pub(crate) const fn new_delim_with_associated_whitespace(char: char, rules: AssociatedWhitespaceRules) -> Self {
556		let flags: u32 = Kind::Delim as u32 | ((rules.to_bits() as u32) << 5);
557		Self((flags << 24) & KIND_MASK, char as u32)
558	}
559
560	/// \[private\]
561	/// Creates a new Token with an interned string.
562	#[inline]
563	pub fn new_interned(kind: Kind, bits: u32, len: u32) -> Token {
564		debug_assert!(kind == KindSet::IDENT_LIKE);
565		debug_assert!(bits & LENGTH_MASK == bits);
566		debug_assert!(len > 0);
567		Self(((kind as u32) << 24) & KIND_MASK | (bits & LENGTH_MASK), len + ((kind != Kind::Ident) as u32))
568	}
569
570	/// Returns the raw bits representing the [Kind].
571	#[inline(always)]
572	pub(crate) const fn kind_bits(&self) -> u8 {
573		(self.0 >> 24 & 0b11111) as u8
574	}
575
576	/// Returns the [Kind].
577	#[inline]
578	pub const fn kind(&self) -> Kind {
579		Kind::from_bits(self.kind_bits())
580	}
581
582	/// Check if the TF upper-most bit is set.
583	#[inline(always)]
584	const fn first_bit_is_set(&self) -> bool {
585		self.0 >> 31 == 1
586	}
587
588	/// Check if the TF second-upper-most bit is set.
589	#[inline(always)]
590	const fn second_bit_is_set(&self) -> bool {
591		self.0 >> 30 & 0b1 == 1
592	}
593
594	/// Check if the TF third-upper-most bit is set.
595	#[inline(always)]
596	const fn third_bit_is_set(&self) -> bool {
597		self.0 >> 29 & 0b1 == 1
598	}
599
600	/// Check if the [Kind] is "Ident Like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
601	#[inline(always)]
602	pub(crate) const fn is_ident_like(&self) -> bool {
603		self.kind_bits() & 0b11000 == 0b01000 && self.kind_bits() != Kind::String as u8
604	}
605
606	/// Check if the [Kind] is "Delim Like", i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
607	/// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
608	/// [Kind::RightCurly].
609	#[inline(always)]
610	pub(crate) const fn is_delim_like(&self) -> bool {
611		self.kind_bits() & 0b10000 == 0b10000
612	}
613
614	/// The only token with an empty length is EOF, but this method is available for symmetry with `len()`.
615	#[inline]
616	pub const fn is_empty(&self) -> bool {
617		self.kind_bits() == Kind::Eof as u8
618	}
619
620	/// Returns the amount of characters (utf-8 code points) this Token represents in the underlying source text.
621	#[inline]
622	pub const fn len(&self) -> u32 {
623		if self.kind_bits() == Kind::Eof as u8 {
624			0
625		} else if self.is_delim_like() {
626			debug_assert!(matches!(
627				self.kind(),
628				Kind::Delim
629					| Kind::Colon | Kind::Semicolon
630					| Kind::Comma | Kind::LeftSquare
631					| Kind::RightSquare
632					| Kind::LeftParen
633					| Kind::RightParen
634					| Kind::LeftCurly
635					| Kind::RightCurly
636			));
637			self.char().unwrap().len_utf8() as u32
638		} else if self.kind_bits() == Kind::Number as u8 {
639			self.numeric_len()
640		} else if self.kind_bits() == Kind::Dimension as u8 {
641			if self.first_bit_is_set() {
642				self.numeric_len() + (self.0 >> 7 & 0b11111)
643			} else {
644				((self.0 & LENGTH_MASK) >> 12) + (self.0 & !HALF_LENGTH_MASK)
645			}
646		} else if self.kind_bits() == Kind::Hash as u8 {
647			self.0 & LENGTH_MASK
648		} else {
649			self.1
650		}
651	}
652
653	/// If the [Kind] is "Delim Like" (i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
654	/// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
655	/// [Kind::RightCurly]) then this will return a [Some] with a [char] representing the value.
656	/// For non-delim-like tokens this will return [None].
657	pub const fn char(&self) -> Option<char> {
658		if self.is_delim_like() {
659			return char::from_u32(self.1);
660		}
661		None
662	}
663
664	/// The [Token] is a [Kind::Dimension] or [Kind::Number] and is an integer - i.e. it has no `.`.
665	#[inline]
666	pub const fn is_int(&self) -> bool {
667		self.kind_bits() & 0b11100 == 0b00100 && !self.third_bit_is_set()
668	}
669
670	/// The [Token] is a [Kind::Dimension] or [Kind::Number] and is a float - i.e. it has decimal places. This will be
671	/// `true` even if the decimal places are 0. e.g. `0.0`.
672	#[inline]
673	pub const fn is_float(&self) -> bool {
674		self.kind_bits() & 0b11100 == 0b00100 && self.third_bit_is_set()
675	}
676
677	/// The [Token] is a [Kind::Dimension] or [Kind::Number] and the underlying character data included a `-` or `+`
678	/// character. Note that a positive number may not necessarily have a sign, e.g. `3` will return false, while `+3`
679	/// will return `true`.
680	#[inline]
681	pub const fn has_sign(&self) -> bool {
682		self.kind_bits() & 0b11100 == 0b00100 && self.second_bit_is_set()
683	}
684
685	/// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the amount of characters used to
686	/// represent this number in the underlying source text. Numbers may be inefficiently encoded in the source text,
687	/// e.g. `0.0000`.
688	///
689	/// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
690	#[inline]
691	pub const fn numeric_len(&self) -> u32 {
692		debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
693		if self.kind_bits() == Kind::Dimension as u8 {
694			(self.0 & LENGTH_MASK) >> 12
695		} else if self.first_bit_is_set() {
696			(self.0 & LENGTH_MASK) >> 16
697		} else {
698			self.0 & LENGTH_MASK
699		}
700	}
701
702	/// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the [f32] representation of the number's
703	/// value.
704	///
705	/// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
706	#[inline]
707	pub fn value(&self) -> f32 {
708		debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
709		f32::from_bits(self.1)
710	}
711
712	/// Returns the [Whitespace].
713	///
714	/// If the [Token] is not a [Kind::Whitespace] this will return [Whitespace::none()].
715	#[inline]
716	pub fn whitespace_style(&self) -> Whitespace {
717		if self.kind_bits() == Kind::Whitespace as u8 {
718			Whitespace::from_bits((self.0 >> 29) as u8)
719		} else {
720			Whitespace::none()
721		}
722	}
723
724	/// Returns the [AssociatedWhitespaceRules].
725	///
726	/// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
727	/// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
728	/// [Kind::RightCurly]) then this will always return `AssociatedWhitespaceRules::none()`.
729	#[inline]
730	pub fn associated_whitespace(&self) -> AssociatedWhitespaceRules {
731		if self.is_delim_like() {
732			AssociatedWhitespaceRules::from_bits((self.0 >> 29) as u8)
733		} else {
734			AssociatedWhitespaceRules::none()
735		}
736	}
737
738	/// Returns a new [Token] with the [AssociatedWhitespaceRules] set to the given [AssociatedWhitespaceRules],
739	/// if possible.
740	///
741	/// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
742	/// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
743	/// [Kind::RightCurly]) then this will return the same [Token].
744	/// If the [AssociatedWhitespaceRules] is different it will return a new [Token].
745	#[inline]
746	pub fn with_associated_whitespace(&self, rules: AssociatedWhitespaceRules) -> Token {
747		if !self.is_delim_like() {
748			return *self;
749		}
750		Token::new_delim_with_associated_whitespace(self.char().unwrap(), rules)
751	}
752
753	/// Returns the [CommentStyle].
754	///
755	/// If the [Token] is not a [Kind::Comment] this will return [None].
756	#[inline]
757	pub fn comment_style(&self) -> Option<CommentStyle> {
758		if self.kind_bits() == Kind::Comment as u8 { CommentStyle::from_bits((self.0 >> 29) as u8) } else { None }
759	}
760
761	/// Returns the [QuoteStyle].
762	///
763	/// If the [Token] is not a [Kind::String] this will return [QuoteStyle::None].
764	#[inline]
765	pub fn quote_style(&self) -> QuoteStyle {
766		if self.kind_bits() == Kind::String as u8 {
767			if self.third_bit_is_set() {
768				return QuoteStyle::Double;
769			} else {
770				return QuoteStyle::Single;
771			}
772		}
773		QuoteStyle::None
774	}
775
776	/// Returns a new [Token] with the [QuoteStyle] set to the given [QuoteStyle], if possible.
777	///
778	/// If the [Token] is not a [Kind::String], or the [QuoteStyle] is already the given [QuoteStyle] this will return the same [Token].
779	/// If the [QuoteStyle] is different it will return a new [Token].
780	/// [QuoteStyle] must not be [QuoteStyle::None]
781	#[inline]
782	pub fn with_quotes(&self, quote_style: QuoteStyle) -> Token {
783		debug_assert!(quote_style != QuoteStyle::None);
784		if self.kind_bits() != Kind::String as u8 || quote_style == self.quote_style() {
785			return *self;
786		}
787		Token::new_string(quote_style, self.has_close_quote(), self.contains_escape_chars(), self.len())
788	}
789
790	/// If the [Token] is a [Kind::String] this checks if the string ended in a close quote.
791	/// It is possible to have a valid String token that does not end in a close quote, by eliding the quote at the end of
792	/// a file.
793	///
794	/// Asserts: The [Kind] is [Kind::String].
795	#[inline]
796	pub const fn has_close_quote(&self) -> bool {
797		debug_assert!(self.kind_bits() == Kind::String as u8);
798		self.second_bit_is_set()
799	}
800
801	/// Checks if it is possible for the [Token] to contain escape characters. Numbers, for example, cannot. Idents can.
802	#[inline]
803	pub const fn can_escape(&self) -> bool {
804		self.kind_bits() == Kind::String as u8 || self.kind_bits() == Kind::Dimension as u8 || self.is_ident_like()
805	}
806
807	/// If the [Token] can escape, checks if the underlying source text contained escape characters.
808	///
809	/// Asserts: The token can escape ([Token::can_escape()]).
810	#[inline]
811	pub const fn contains_escape_chars(&self) -> bool {
812		if self.kind_bits() == Kind::Dimension as u8 {
813			// Always assume Dimension contains escape because we have other fast paths to handle dimension units
814			return true;
815		}
816		self.can_escape() && self.first_bit_is_set()
817	}
818
819	/// If the [Token] is Ident like, checks if the first two code points are HYPHEN-MINUS (`-`).
820	///
821	/// Asserts: The token is "ident like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
822	#[inline]
823	pub const fn is_dashed_ident(&self) -> bool {
824		debug_assert!(self.is_ident_like());
825		self.second_bit_is_set()
826	}
827
828	/// Checks if the [Token] is Ident like and none of the characters are ASCII upper-case.
829	#[inline]
830	pub const fn is_lower_case(&self) -> bool {
831		self.is_ident_like() && !self.third_bit_is_set()
832	}
833
834	#[inline]
835	pub fn atom_bits(&self) -> u32 {
836		if self.kind_bits() == Kind::Dimension as u8 && self.first_bit_is_set() {
837			self.0 & 0b111_1111
838		} else if self.is_ident_like() && self.kind_bits() != Kind::Hash as u8 {
839			self.0 & LENGTH_MASK
840		} else {
841			0
842		}
843	}
844
845	/// Checks if the [Token] is Trivia-like, that is [Kind::Comment], [Kind::Whitespace], [Kind::Eof]
846	#[inline]
847	pub const fn is_trivia(&self) -> bool {
848		self.kind_bits() & 0b000011 == self.kind_bits()
849	}
850
851	/// If the [Token] is [Kind::Url], checks if there are leading Whitespace characters before the inner value.
852	///
853	/// Asserts: The token is [Kind::Url].
854	#[inline]
855	pub const fn url_has_leading_space(&self) -> bool {
856		debug_assert!(self.kind_bits() == Kind::Url as u8);
857		self.second_bit_is_set()
858	}
859
860	/// If the [Token] is [Kind::Url], checks if the closing parenthesis is present.
861	///
862	/// Asserts: The token is [Kind::Url].
863	#[inline]
864	pub const fn url_has_closing_paren(&self) -> bool {
865		debug_assert!(self.kind_bits() == Kind::Url as u8);
866		self.third_bit_is_set()
867	}
868
869	/// If the [Token] is [Kind::Hash], checks if the Hash is "ID-like" (i.e its first character is ASCII).
870	///
871	/// Asserts: The token is [Kind::Hash].
872	#[inline]
873	pub const fn hash_is_id_like(&self) -> bool {
874		debug_assert!(self.kind_bits() == Kind::Hash as u8);
875		self.second_bit_is_set()
876	}
877
878	/// Checks if the [Token] is [Kind::BadString] or [Kind::BadUrl].
879	#[inline]
880	pub const fn is_bad(&self) -> bool {
881		(self.kind_bits() | 0b00001) & 0b11001 == 1
882	}
883
884	/// Checks if the [Token] is [Kind::CdcOrCdo] and is the CDC variant of that token.
885	#[inline]
886	pub const fn is_cdc(&self) -> bool {
887		self.kind_bits() == (Kind::CdcOrCdo as u8) && self.third_bit_is_set()
888	}
889
890	/// Some tokens may have a "leading" part:
891	///  - [Kind::AtKeyword] always starts with a `@`,
892	///  - [Kind::Hash] with a `#`.
893	///  - [Kind::String] with a `"` or `'`.
894	///  - [Kind::Comment] with a leading `/*` (or `//`).
895	///  - [Kind::Dimension] has a leading numeric portion.
896	///  - [Kind::Url] has the leading `url(` ident (which may vary in exact representation).
897	///
898	/// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
899	/// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
900	/// data which is likely to be of greater use.
901	pub fn leading_len(&self) -> u32 {
902		match self.kind() {
903			Kind::AtKeyword | Kind::Hash | Kind::String => 1,
904			Kind::Dimension => self.numeric_len(),
905			Kind::Comment => 2,
906			Kind::Url => (self.0 & LENGTH_MASK) >> 12,
907			_ => 0,
908		}
909	}
910
911	/// Some tokens may have a "trailing" part:
912	///  - [Kind::Function] will always have an opening `(`.
913	///  - [Kind::String] may have a closing `"` or `'`.
914	///  - [Kind::Comment] may have a closing `*/`
915	///  - [Kind::Url] may have a clsoing `)`.
916	///
917	/// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
918	/// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
919	/// data which is likely to be of greater use.
920	pub fn trailing_len(&self) -> u32 {
921		match self.kind() {
922			Kind::Function => 1,
923			Kind::String => self.has_close_quote() as u32,
924			Kind::Comment if self.comment_style().unwrap().is_block() => 2,
925			Kind::Url => self.0 & !HALF_LENGTH_MASK,
926			_ => 0,
927		}
928	}
929
930	/// Certain kinds have a [PairWise] equivalent:
931	///  - [Kind::LeftParen] has [Kind::RightParen]
932	///  - [Kind::LeftCurly] has [Kind::RightCurly]
933	///  - [Kind::LeftSquare] has [Kind::RightSquare]
934	///
935	/// This function returns the [PairWise] enum, if the [Token] is one of the above listed [Kinds][Kind]. For any other
936	/// [Kind] this returns [None].
937	#[inline]
938	pub fn to_pairwise(&self) -> Option<PairWise> {
939		PairWise::from_token(self)
940	}
941
942	/// A convenience function for `Cursor::new(offset, token)`.
943	#[inline(always)]
944	pub fn with_cursor(self, offset: SourceOffset) -> Cursor {
945		Cursor::new(offset, self)
946	}
947
948	/// If the [Kind] is [Kind::Hash] then this token may have had the opportunity to be parsed as a `<hex-value>` (e.g.
949	/// `#fff`). When this happens the character data is parsed during tokenization into a u32 which stores the
950	/// RR,GG,BB,AA values.
951	#[inline(always)]
952	pub fn hex_value(self) -> u32 {
953		if self == Kind::Hash { self.1 } else { 0 }
954	}
955
956	/// If this [Token] is preceded by the [Token] `other` then a separating token (e.g. a comment) will need to be
957	/// inserted between these the two tokens during serialization, in order for them to be able to be re-tokenized as
958	/// the same tokens. For example an Ident ("a") adjacent to an Ident ("b"), if serialized without whitespace, would
959	/// create a single Ident ("ab"). The rules for estbalishing whether or not these tokens needs whitespace are quite
960	/// simple and are effectively [defined in the serialization section of the spec][1]. To reproduce the table:
961	///
962	/// [1]: https://drafts.csswg.org/css-syntax/#serialization
963	///
964	/// |            | ident | function | url | bad url | - | number | percentage | dimension | CDC | ( | * | % |
965	/// |:-----------|:-----:|:--------:|:---:|:-------:|:-:|:------:|:----------:|:---------:|:---:|:-:|:-:|:-:|
966	/// | ident      |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  | ✗ |   |   |
967	/// | at-keyword |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  |   |   |   |
968	/// | hash       |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  |   |   |   |
969	/// | dimension  |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  |   |   |   |
970	/// | #          |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  |   |   |   |
971	/// | \-         |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |    ✗   |      ✗     |     ✗     |  ✗  |   |   |   |
972	/// | number     |   ✗   |    ✗     |  ✗  |    ✗    |   |    ✗   |      ✗     |     ✗     |  ✗  |   |   | ✗ |
973	/// | @          |   ✗   |    ✗     |  ✗  |    ✗    | ✗ |        |            |           |  ✗  |   |   |   |
974	/// | .          |       |          |     |         |   |    ✗   |      ✗     |     ✗     |     |   |   |   |
975	/// | +          |       |          |     |         |   |    ✗   |      ✗     |     ✗     |     |   |   |   |
976	/// | /          |       |          |     |         |   |        |            |           |     |   | ✗ |   |
977	///
978	/// The one exception not in this table is that two consecutive `/` characters should also be separated by spaces in
979	/// order to avoid abmiguities with CSS-alike languages that treat two consecutive `/` characters as a single line
980	/// comment.
981	///
982	/// # Example
983	///
984	/// ```
985	/// use css_lexer::*;
986	/// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "10 %");
987	/// let first = lexer.advance();
988	/// let _ = lexer.advance(); // Whitespace
989	/// let second = lexer.advance();
990	/// assert!(first.needs_separator_for(second));
991	/// ```
992	pub fn needs_separator_for(&self, second: Token) -> bool {
993		if second == AssociatedWhitespaceRules::EnforceBefore && *self != Kind::Whitespace
994			|| *self == AssociatedWhitespaceRules::EnforceAfter && second != Kind::Whitespace
995		{
996			// We need whitespace after, unless the next token is actually whitespace.
997			return true;
998		}
999		if *self == AssociatedWhitespaceRules::BanAfter {
1000			return false;
1001		}
1002		match self.kind() {
1003			Kind::Ident => {
1004				(matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1005					// numbers with a `-` need separating, but with `+` they do not.
1006					(!second.has_sign() || second.value() < 0.0))
1007					|| matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1008					|| matches!(second.char(), Some('(' | '-'))
1009					|| second.is_cdc()
1010			}
1011			Kind::AtKeyword | Kind::Hash | Kind::Dimension => {
1012				(matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1013					// numbers with a `-` need separating, but with `+` they do not.
1014					(!second.has_sign() || second.value() < 0.0))
1015					|| matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1016					|| matches!(second.char(), Some('-'))
1017					|| second.is_cdc()
1018			}
1019			Kind::Number => {
1020				matches!(
1021					second.kind(),
1022					Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1023				) || matches!(second.char(), Some('%'))
1024					|| second.is_cdc()
1025			}
1026			_ => match self.char() {
1027				Some('#') => {
1028					matches!(
1029						second.kind(),
1030						Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1031					) || matches!(second.char(), Some('-'))
1032						|| second.is_cdc()
1033				}
1034				Some('-') => {
1035					matches!(
1036						second.kind(),
1037						Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1038					) || matches!(second.char(), Some('-'))
1039						|| second.is_cdc()
1040				}
1041				Some('@') => {
1042					matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1043						|| matches!(second.char(), Some('-'))
1044						|| second.is_cdc()
1045				}
1046				Some('.') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1047				Some('+') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1048				Some('/') => matches!(second.char(), Some('*' | '/')),
1049				_ => false,
1050			},
1051		}
1052	}
1053}
1054
1055impl core::fmt::Debug for Token {
1056	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1057		let mut d = f.debug_struct(format!("Token::{}", self.kind().as_str()).as_str());
1058		match self.kind() {
1059			Kind::Eof => &mut d,
1060			Kind::Number => d.field("value", &self.value()).field("len", &self.numeric_len()),
1061			Kind::Dimension => {
1062				d.field("value", &self.value()).field("len", &self.numeric_len()).field("dimension_len", &self.len())
1063			}
1064			_ if self.is_delim_like() => {
1065				d.field("char", &self.char().unwrap()).field("len", &self.len());
1066				if !self.associated_whitespace().is_none() {
1067					d.field("associated_whitespace", &self.associated_whitespace());
1068				}
1069				&mut d
1070			}
1071			Kind::String => d
1072				.field("quote_style", &if self.first_bit_is_set() { "Double" } else { "Single" })
1073				.field("has_close_quote", &self.second_bit_is_set())
1074				.field("contains_escape_chars", &self.third_bit_is_set())
1075				.field("len", &self.len()),
1076			Kind::Ident | Kind::Function | Kind::AtKeyword => d
1077				.field("is_lower_case", &self.first_bit_is_set())
1078				.field("is_dashed_ident", &self.second_bit_is_set())
1079				.field("contains_escape_chars", &self.third_bit_is_set())
1080				.field("len", &self.len()),
1081			Kind::Hash => d
1082				.field("is_lower_case", &self.first_bit_is_set())
1083				.field("hash_is_id_like", &self.second_bit_is_set())
1084				.field("contains_escape_chars", &self.third_bit_is_set())
1085				.field("len", &self.len()),
1086			Kind::Url => d
1087				.field("url_has_closing_paren", &self.first_bit_is_set())
1088				.field("url_has_leading_space", &self.second_bit_is_set())
1089				.field("contains_escape_chars", &self.third_bit_is_set())
1090				.field("len", &self.len()),
1091			Kind::CdcOrCdo => d.field("is_cdc", &self.first_bit_is_set()).field("len", &self.len()),
1092			Kind::Whitespace => d.field("contains", &self.whitespace_style()).field("len", &self.len()),
1093			_ => d
1094				.field("flag_0", &self.first_bit_is_set())
1095				.field("flag_1", &self.second_bit_is_set())
1096				.field("flag_2", &self.third_bit_is_set())
1097				.field("len", &self.len()),
1098		}
1099		.finish()
1100	}
1101}
1102
1103impl std::fmt::Display for Token {
1104	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1105		match self.kind() {
1106			Kind::Delim => write!(f, "Delim({})", self.char().unwrap()),
1107			k => write!(f, "{}", k.as_str()),
1108		}
1109	}
1110}
1111
/// Serializes a [Token] as a struct with two fields, `kind` (the name of the [Kind]) and `len`.
/// The [Token::EMPTY] token is serialized as `none`.
#[cfg(feature = "serde")]
impl serde::ser::Serialize for Token {
	fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
	where
		S: serde::ser::Serializer,
	{
		use serde::ser::SerializeStruct;
		if *self == Self::EMPTY {
			return serializer.serialize_none();
		}
		// The declared field count must match the number of `serialize_field` calls below; serializers that rely on
		// the length would otherwise produce malformed output.
		let mut state = serializer.serialize_struct("Token", 2)?;
		state.serialize_field("kind", self.kind().as_str())?;
		state.serialize_field("len", &self.len())?;
		state.end()
	}
}
1128
1129impl From<Token> for Kind {
1130	fn from(token: Token) -> Self {
1131		token.kind()
1132	}
1133}
1134
1135impl PartialEq<Kind> for Token {
1136	fn eq(&self, other: &Kind) -> bool {
1137		self.kind_bits() == *other as u8
1138	}
1139}
1140
1141impl From<Token> for KindSet {
1142	fn from(token: Token) -> Self {
1143		KindSet::new(&[token.kind()])
1144	}
1145}
1146
1147impl PartialEq<KindSet> for Token {
1148	fn eq(&self, other: &KindSet) -> bool {
1149		other.contains_bits(self.kind_bits())
1150	}
1151}
1152
1153impl From<Token> for QuoteStyle {
1154	fn from(token: Token) -> Self {
1155		token.quote_style()
1156	}
1157}
1158
1159impl PartialEq<QuoteStyle> for Token {
1160	fn eq(&self, other: &QuoteStyle) -> bool {
1161		&self.quote_style() == other
1162	}
1163}
1164
1165impl From<Token> for Whitespace {
1166	fn from(token: Token) -> Self {
1167		token.whitespace_style()
1168	}
1169}
1170
1171impl PartialEq<Whitespace> for Token {
1172	fn eq(&self, other: &Whitespace) -> bool {
1173		self.whitespace_style().intersects(*other)
1174	}
1175}
1176
1177impl PartialEq<AssociatedWhitespaceRules> for Token {
1178	fn eq(&self, other: &AssociatedWhitespaceRules) -> bool {
1179		self.associated_whitespace().intersects(*other)
1180	}
1181}
1182
1183impl PartialEq<CommentStyle> for Token {
1184	fn eq(&self, other: &CommentStyle) -> bool {
1185		self.comment_style().map(|style| &style == other).unwrap_or(false)
1186	}
1187}
1188
1189impl PartialEq<PairWise> for Token {
1190	fn eq(&self, other: &PairWise) -> bool {
1191		self.to_pairwise().map(|style| &style == other).unwrap_or(false)
1192	}
1193}
1194
1195impl PartialEq<char> for Token {
1196	fn eq(&self, other: &char) -> bool {
1197		self.char().map(|char| char == *other).unwrap_or(false)
1198	}
1199}
1200
// Guards the core design constraint: a Token must stay exactly 8 bytes.
#[test]
fn size_test() {
	let token_size = ::std::mem::size_of::<Token>();
	assert_eq!(token_size, 8);
}
1205
// Whitespace tokens: the predefined constants and `new_whitespace` report the right kind, style and length.
#[test]
fn test_new_whitespace() {
	let constants =
		[(Token::SPACE, Whitespace::Space), (Token::TAB, Whitespace::Tab), (Token::NEWLINE, Whitespace::Newline)];
	for (token, style) in constants {
		assert_eq!(token, Kind::Whitespace);
		assert_eq!(token, style);
	}

	let spaces = Token::new_whitespace(Whitespace::Space, 4);
	assert_eq!(spaces, Kind::Whitespace);
	assert_eq!(spaces.len(), 4);

	// Comparison against a Whitespace is an intersection test, so mixed styles match each member.
	assert_eq!(Token::new_whitespace(Whitespace::Space | Whitespace::Newline, 4), Whitespace::Space);
	assert_eq!(Token::new_whitespace(Whitespace::Tab | Whitespace::Space, 4), Whitespace::Tab);

	let newlines = Token::new_whitespace(Whitespace::Newline, 4);
	assert_eq!(newlines, Whitespace::Newline);
	assert_eq!(newlines.len(), 4);
}
1221
// Comment tokens report Kind::Comment and retain the comment style they were built with.
#[test]
fn test_new_comment() {
	let block = Token::new_comment(CommentStyle::Block, 4);
	assert_eq!(block, Kind::Comment);
	assert_eq!(block, CommentStyle::Block);

	let single = Token::new_comment(CommentStyle::Single, 4);
	assert_eq!(single, CommentStyle::Single);
}
1228
// Number tokens retain their value, source length, sign flag and float flag.
#[test]
fn test_new_number() {
	let short = Token::new_number(false, false, 3, 4.2);
	assert_eq!(short, Kind::Number);
	assert_eq!(short.value(), 4.2);
	assert_eq!(short.len(), 3);
	assert!(!short.has_sign());

	let long_signed = Token::new_number(false, true, 9, 4.2);
	assert_eq!(long_signed, Kind::Number);
	assert_eq!(long_signed.value(), 4.2);
	assert_eq!(long_signed.len(), 9);

	assert!(Token::new_number(false, true, 3, 4.2).has_sign());
	assert!(!Token::new_number(false, true, 3, 4.0).is_float());
	assert!(Token::new_number(true, false, 3, 4.2).is_float());
}
1242
// String tokens retain their quote style, close-quote flag, escape flag and length.
#[test]
fn test_new_string() {
	let single = Token::new_string(QuoteStyle::Single, false, false, 4);
	assert_eq!(single, Kind::String);
	assert_eq!(single, QuoteStyle::Single);
	assert!(!single.has_close_quote());
	assert!(!single.contains_escape_chars());
	assert_eq!(single.len(), 4);

	let double = Token::new_string(QuoteStyle::Double, false, false, 4);
	assert_eq!(double, Kind::String);
	assert_eq!(double, QuoteStyle::Double);

	let double_closed = Token::new_string(QuoteStyle::Double, true, false, 4);
	assert!(double_closed.has_close_quote());
	assert!(!double_closed.contains_escape_chars());
	assert_eq!(Token::new_string(QuoteStyle::Double, true, false, 5).len(), 5);

	// The escape flag is independent of whether the string was closed.
	for has_close_quote in [true, false] {
		assert!(Token::new_string(QuoteStyle::Double, has_close_quote, true, 4).contains_escape_chars());
	}
}
1258
// Hash tokens retain their escape flag, case flag, length and parsed hex value.
#[test]
fn test_new_hash() {
	let plain = Token::new_hash(false, false, false, 4, 0);
	assert_eq!(plain, Kind::Hash);
	assert!(!plain.contains_escape_chars());

	let escaped = Token::new_hash(false, false, true, 4, 0);
	assert!(escaped.contains_escape_chars());
	assert!(escaped.is_lower_case());

	let mixed_case = Token::new_hash(true, false, false, 4, 0);
	assert!(!mixed_case.is_lower_case());
	assert_eq!(mixed_case.len(), 4);
	assert_eq!(mixed_case.hex_value(), 0);

	assert_eq!(Token::new_hash(true, false, false, 4, 18).hex_value(), 18);
}
1270
// Building a string token with QuoteStyle::None is expected to panic.
#[test]
#[should_panic]
fn test_new_string_with_quotes_none() {
	let _ = Token::new_string(QuoteStyle::None, false, true, 4);
}
1276
// Delim tokens retain their char, and their length is the UTF-8 encoded length of that char.
#[test]
fn test_new_delim() {
	for (ch, expected_len) in [('>', 1), ('.', 1), ('ℝ', 3), ('💣', 4)] {
		let token = Token::new_delim(ch);
		assert_eq!(token, Kind::Delim);
		assert_eq!(token, ch);
		assert_eq!(token.len(), expected_len);
	}
}
1294
// A delim given both EnforceBefore and EnforceAfter must report both rules.
//
// Comparing a Token with AssociatedWhitespaceRules is an intersection test, so each rule is asserted on its own;
// a single comparison against a combined value would pass even if only one of the rules had been stored.
#[test]
fn with_associated_whitespace() {
	let token = Token::new_delim('>')
		.with_associated_whitespace(AssociatedWhitespaceRules::EnforceBefore | AssociatedWhitespaceRules::EnforceAfter);
	assert_eq!(token, AssociatedWhitespaceRules::EnforceBefore);
	assert_eq!(token, AssociatedWhitespaceRules::EnforceAfter);
}
1304
// Switching the quote style yields the token that would have been built with that style directly; the other flags
// and the length are carried over.
#[test]
fn test_with_quotes() {
	let single = Token::new_string(QuoteStyle::Single, false, false, 4);
	let expected_double = Token::new_string(QuoteStyle::Double, false, false, 4);
	assert_eq!(single.with_quotes(QuoteStyle::Double), expected_double);

	let double = Token::new_string(QuoteStyle::Double, true, true, 8);
	let expected_single = Token::new_string(QuoteStyle::Single, true, true, 8);
	assert_eq!(double.with_quotes(QuoteStyle::Single), expected_single);
}
1316
// Requesting QuoteStyle::None via `with_quotes` is expected to panic.
#[test]
#[should_panic]
fn test_with_quotes_none() {
	let single = Token::new_string(QuoteStyle::Single, false, true, 4);
	single.with_quotes(QuoteStyle::None);
	// Only reached if the call above did not panic.
	let double = Token::new_string(QuoteStyle::Double, false, true, 4);
	double.with_quotes(QuoteStyle::None);
}
1323
// Dimension tokens retain their value and numeric length, and report the combined numeric + unit length.
#[test]
fn test_new_dimension() {
	let check = |token: Token, value: f32, numeric_len: u32, len: u32| {
		assert_eq!(token, Kind::Dimension);
		assert_eq!(token.value(), value);
		assert_eq!(token.numeric_len(), numeric_len);
		assert_eq!(token.len(), len);
		assert!(!token.is_float());
		assert!(!token.has_sign());
	};
	check(Token::new_dimension(false, false, 3, 3, 999.0, 0), 999.0, 3, 6);
	check(Token::new_dimension(false, false, 5, 2, 8191.0, 0), 8191.0, 5, 7);

	// The value must round-trip across a range of negative and positive integers.
	for i in -8191..8191 {
		let expected = i as f32;
		let token = Token::new_dimension(false, false, 9, 3, expected, 0);
		assert_eq!(token.value(), expected);
	}
}
css_lexer/token.rs

css_lexer/
token.rs