css_lexer/token.rs
1use crate::{
2 AssociatedWhitespaceRules, CommentStyle, Cursor, Kind, KindSet, PairWise, QuoteStyle, SourceOffset, Whitespace,
3};
4use std::char::REPLACEMENT_CHARACTER;
5
6/// An abstract representation of the chunk of the source text, retaining certain "facts" about the source.
7///
8/// # Design
9///
10/// The [Token] type is an immutable packing of two [u32s][u32] that represents a unit in the source text, but without
11/// the associated offset data that points to its position in the source text. This is important because it means that
12/// equivalent [Tokens][Token] are equal even in different parts of the document. For the most part a [Token] doesn't
13/// represent data that can be put into a text file because it lacks the underlying character data. It is lossy. For
/// example a [Token] with [Kind::Ident] just represents _an_ ident, but it doesn't retain what the keyword is.
15/// Storing raw-character data would require either storing tokens on the heap (and therefore they couldn't be [Sized])
16/// or by keeping a reference to `&'a str` which means larger token sizes and lifetime tracking. By _not_ storing
17/// character data we can keep [Token] [Sized] and keep it to `size_of` `8`, avoiding the heap, avoiding
18/// references/lifetimes, and keeping [Token] entirely in the stack. For a lot of tokens this is _fine_ because the
19/// underlying character data isn't that useful past a certain point.
20///
21/// A [Token] retains certain "facts" about the underlying unit of text, though. For example it retains the [Kind], how
22/// many characters the token consumed, and various other pieces of information, depending on the [Kind]. In some
23/// cases, it's entirely possible to represent the full token, including character data, into the available bits (for
24/// example [Kind::Delim] stores its [char], [Kind::Number] stores its [f32]). Taking the time in the tokenizer to
25/// gather these facts and values can keep cache-lines hot, which speeds up subsequent checks in the parser.
26///
27/// If you're familiar with "red green" syntax trees such as [Swiftlang's libsyntax][1], or [Rust-Analyzer's Rowan][2]
28/// or [Roslyn][3] this might be a little familiar in some concepts. However [Token] does not represent a tree, and
29/// relies on resorting back to the string data to find out keyword values.
30///
31/// [1]: https://gh.io/AAtdqpg
32/// [2]: https://gh.io/AAtf8pt
33/// [3]: https://gh.io/AAtab90
34///
35/// This representation of facts, kind, length, or other metadata can be quite complex - so here's a
36/// full breakdown:
37///
38/// # Anatomy of Token
39///
40/// A [Token] is a struct of `(u32, u32)`. The second u32 is _usually_ the token length (hence keeping them separate).
41/// The first [u32], however, is split into 3 (sometimes 5) parts. The two u32s can be thought of like so:
42///
43/// ```md
44/// |-----|-------|--------------------------|---------------------------------|
45/// | TF | K | VD | Value |
46/// 0b| 000 | 00000 | 000000000000000000000000 | 0000000000000000000000000000000 |
47/// |-----|-------|--------------------------|---------------------------------|
48/// | 3-- | 5---- | 24---------------------- | 32----------------------------- |
49/// ```
50///
51/// ## TF = Type Flags (or "Token Facts")
52///
53/// This represents a bit-mask in the upper-most 3 bits. The flags are general purpose and change meaning depending on
/// the Token's [Kind]. Each flag generally maps to a method so it's not necessary to remember the contents of this
55/// table, but it can serve as a useful reference. Note that not all methods return a [bool], so footnotes have been
56/// added to explain these further.
57///
58/// | Kind:: | Flag | Description | Method |
59/// |--------------------|-------|-----------------------------|------------------------------------------|
60/// | [Kind::Number] | `001` | Floating Point | [Token::is_float()] |
61/// | | `010` | Has a "Sign" (-/+) | [Token::has_sign()] |
62/// | | `100` | (Reserved) | -- |
63/// | [Kind::Dimension] | `001` | Floating Point | [Token::is_float()] |
64/// | | `010` | Has a "Sign" (-/+) | [Token::has_sign()] |
65/// | | `100` | Unit is a known dimension | [Token::atom_bits()][^dimension] |
66/// | [Kind::String] | `001` | Uses Double Quotes | [Token::quote_style()][^quotes] |
67/// | | `010` | Has a closing quote | [Token::has_close_quote()] |
68/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
69/// | [Kind::Ident] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
70/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
71/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
72/// | [Kind::Function] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
73/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
74/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
75/// | [Kind::AtKeyword] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
76/// | | `010` | Is a "Dashed Ident" | [Token::is_dashed_ident()] |
77/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
78/// | [Kind::Hash] | `001` | Contains non-lower-ASCII | [Token::is_lower_case()] |
79/// | | `010` | First character is ASCII | [Token::hash_is_id_like()] |
80/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
81/// | [Kind::Url] | `001` | Has a closing paren ) | [Token::url_has_closing_paren()] |
82/// | | `010` | Contains whitespace after ( | [Token::url_has_leading_space()] |
83/// | | `100` | Contains escape characters | [Token::contains_escape_chars()] |
/// | [Kind::CdcOrCdo] | `001` | Is CDC (`000` would be CDO) | [Token::is_cdc()] |
85/// | | `010` | (Reserved) | -- |
86/// | | `100` | (Reserved) | -- |
87/// | [Kind::Whitespace] | `---` | Whitespace style | [Token::whitespace_style()][^whitespace] |
88/// | [Kind::Delim] | `---` | Associate whitespace rules | [Token::associated_whitespace()][^delim] |
89/// | [Kind::Comment] | `---` | (Special) | [Token::comment_style()][^comments] |
90///
91/// [^quotes]: Strings do not have a [bool] returning method for whether or not the quote is using double or single
/// quotes, instead the [Token::quote_style()] method will return the [QuoteStyle] enum for better readability.
/// [^whitespace]: Whitespace tokens do not have a [bool] returning method, instead [Token::whitespace_style()] will
94/// return the [Whitespace] enum for improved readability.
95/// [^comments]: Rather than using the 3 bits as a bit-mask, Comment tokens use the data to store the [CommentStyle]
96/// enum, which is capable of representing 8 discrete comment styles.
97/// [^delim]: Delims can be used in interesting ways inside of CSS syntax. At higher levels CSS is _sometimes_
98/// whitespace sensitive, for example the whitespace inside of a CSS selector _sometimes_ represents the descendant
99/// combinator, meanwhile delimiters inside calc() are sensitive to whitespace collapse (`calc(1px + 1px)` is valid
100/// while `calc(1px+1px)` is a parse error). Further to this, introducing whitespace (say through a formatter) might
101/// break in interesting ways due to some combinations of Delims & Idents - for example Pseudo Classes like `:hover`,
102/// or CSS like languages such as SASS using `$var` style syntax. While `:hover` and `$var` are comprised of two tokens
103/// they're considered one conceptual unit. Having a way to express these relationships at the token level can be useful
104/// for other low level machinery such as formatters/minifiers, rather than introducing complex state at higher levels.
105/// For these reasons, Delim tokens have the ability to express their whitespace association. The lexer will always
106/// produce a token with empty whitespace rules, but parsers can replace this token with a more complex set of rules.
107///
108/// ## K = Kind Bits
109///
/// The `K` value - upper-most bits 4-8 - stores the 5-bit [Kind].
111///
112/// ## VD = Value Data
113///
114/// The `VD` value - the lower-most 24-bits - stores data depending on the [Token] [Kind]. For most kinds this data is
115/// reserved (just 0s). The value data cannot be interrogated manually, but it packs in additional data about the
116/// underlying string to make the string easier to parse without doing the same lookups that the tokenizer already had
117/// to - such as determining lengths of the various parts of the token, or packing values so that consulting the string
118/// can be avoided (which keeps cache-lines hot).
119///
120/// Below describes the special kinds which use the Value Data to store yet more information about the token...
121///
122/// ### Value Data for [Kind::Number]
123///
124/// If the [Kind] is [Kind::Number], Value Data represents the length of that number (this means the parser is
125/// restricted from representing numbers longer than 16,777,216 characters which is probably an acceptable limit). Note
126/// that this does not affect the _value_ of a number, just the characters in a string. Numbers in CSS are [f32]. The
127/// vast majority of [f32s][f32] can be represented in 16MM characters, but it's possible to author a document that
128/// contains a set of numeric characters longer than 16MM code points. These scenarios are considered [undefined
/// behaviour][4].
130///
131/// [4]: https://en.wikipedia.org/wiki/Undefined_behavior
132///
133/// ### Value Data for [Kind::Hash]
134///
135/// If the [Kind] is [Kind::Hash], Value Data represents the length of that hash (this means the parser is restricted
136/// from representing IDs and hex codes longer than 16,777,216 characters which is probably an acceptable limit). Note
/// that this restriction means that ID selectors have a much tighter limit than other tokens, such as strings or
138/// idents, but it's very unlikely to see a 16million character ID in CSS (String, maybe).
139///
140/// ### Value Data for [Kind::Url]
141///
142/// If the [Kind] is [Kind::Url], Value Data represents the "leading length" and "trailing length" of the URL. This
143/// means the value data is split into two 12 bit numbers:
144///
145/// ```md
146/// |--------------|--------------|
147/// | LL | TL |
148/// | 000000000000 | 000000000000 |
149/// |--------------|--------------|
150/// | 12---------- | 12---------- |
151/// ```
152///
153/// The "leading" length represents the `url(` part of the token. Typically this will be `4`, however it's possible
154/// (for legacy compatibility reasons within CSS) to add whitespace between the opening parenthesis and the URL value.
155/// It's also possible to escape the `url` ident portion. This means `\75\52\6c( ` is also a valid leading section of
156/// a URL ident (which has a character length of 13), as is `\000075 \000052 \00006c ( ` (28 characters). 12 bits
157/// allows for a maximum character length of 4,096. It is not possible to represent a URL token's leading section using
158/// 4,096 characters so there is some headroom (wasted bytes) here.
159///
160/// The "trailing" length represents the `)` part of the token. Typically this will be `1`, however it's possible to
161/// add any number of whitespace characters between the end of the URL and the closing parenthesis. If a CSS document
162/// contains more than 4095 whitespace characters then this is considered [undefined behaviour][4].
163///
164/// ### Value Data for [Kind::Dimension]
165///
166/// If K is a Dimension, then this represents both the number of characters in the numeric portion of the dimension
167/// and the length of the ident portion of the dimension... or the dimension unit itself (more on that below). This
168/// means the value data is split into two 12 bit numbers:
169///
170/// ```md
171/// |--------------|--------------|
172/// | NL | DUL |
173/// | 000000000000 | 000000000000 |
174/// |--------------|--------------|
175/// | 12---------- | 12---------- |
176///
177/// |--------------|-------| --------|
178/// | NL | KDUL | KNOWN |
179/// | 000000000000 | 00000 | 0000000 |
180/// |--------------|-------| --------|
181/// | 12---------- | 5---- | 7------ |
182/// ```
183///
184/// The NL portion - the numeric length - represents the length of characters the number contains. This means the
185/// numeric portion of a dimension can only be 4,096 characters long. This is dramatically shorter than the 16MM
186/// allowed for numbers but it's still also incredibly generous such that it's highly unlikely to ever be hit unless
187/// someone is intentionally trying to break the parser. The [Lexer][super::Lexer] encountering a dimension with a
188/// numeric portion longer than 4,096 characters is considered [undefined behaviour][4].
189///
190/// The DUL portion (if `TF & 100 == 0`) will represent the length of characters the ident portion of the dimension
191/// (aka the dimension unit) contains. This means the ident portion of a dimension can only be 4,096 characters long.
192/// For practical purposes CSS has a fixed set of dimensions - the longest of which (at the time of writing) are 5
193/// characters long (e.g. `svmax`). Through the use of escaping shenanigans it's possible to create a valid CSS
194/// dimension longer than 5 characters though (every ident can be made 8 times longer by using escape characters, e.g.
195/// `1svmax` at 6 characters can be instead written as `1\000073 \000076 \00006d \000061 \000078` at 40 characters). In
196/// addition to these factors, it's worth pointing out that there is scope for further dimensions and some [proposals
197/// for "custom" dimensions][5], and lastly this library is designed for CSS _and CSS-alike_ languages, which may
198/// invent their own dimension units. In other words being too restrictive on dimension ident length could be costly
199/// in the future, therefore 4,096 characters seems like a reasonable, if generous, trade-off.
200///
201/// There's a giant caveat here though. If `TF & 100 != 0`, then the dimension is considered "known" and DUL will be
202/// encoded differently. Instead of just containing the dimension unit length, which requires consulting the underlying
203/// `&str` to get the actual dimension, it will be used to store an Atom - but only the first 7 bits (the KNOWN
/// portion), which for an Atom must be a Dimension atom (an assumption made on anything that implements
205/// [AtomSet][crate::AtomSet] is that all dimension units should be stored in the byte values of 1-127, so that they
206/// can be encoded in this space). Dimension units _can_ be escape encoded, and so the underlying character data may
207/// differ from the unescaped unit length, as such 5-bit KDUL portion represents character data length, in other words
/// `KNOWN.len()` may not always equal `KDUL`.
209///
210/// [5]: https://github.com/w3c/csswg-drafts/issues/7379
211///
212/// ## Value
213///
214/// The `Value` portion of [Token] represents the length of the token for most token kinds. However, for some tokens
215/// their length is already packed into the first u32. So it would make more sense to use this u32 to store more
216/// interesting data.
217///
218/// ## Value for [Kind::Delim] and single character tokens
219///
220/// [Kind::Delim] and single-character tokens (i.e. [Kind::Colon]->[Kind::RightCurly]) typically have a length of `1`
221/// ([Kind::Delim] can have a varied length for surrogate pairs). Instead of storing the length and wasting a whole
222/// [u32], this region stores the [char]. Calling [Token::char()] will return an [Option] which will always be [Some]
223/// for [Kind::Delim] and single-character tokens.
224///
225/// ## Value for [Kind::Hash]
226///
227/// The length of a hash is stored in its `VD` portion, leaving 32bits to storing other data. It just so happens that
228/// a 8-character hex code (#ffaabbcc) fits nicely inside of 32-bits. During tokenization we can eagerly parse the hex
229/// code and stuff it here, so it can be more easily reasoned about in upstream code (rather than
230/// reading the character data).
231///
232/// ## Value for [Kind::Number] and [Kind::Dimension]
233///
234/// As these tokens store their length data in the `VD` portion, this [u32] instead stores the _value_ of the number,
235/// stored as [f32::to_bits()].
236///
237/// ## Value data for other tokens.
238///
239/// In all other cases, this represents the length of the token as utf-8 bytes. This means the token length is
240/// 4,294,967,296 aka ~4GB. This sounds very long but also CSS can host very large image data and browsers will
/// accommodate very large URLs. [An mdn article on Data URLs][6] claims that Firefox supports 32mb Data URLs, Chrome
242/// supports over 512mb, and Safari over 2gb. The reality is that if someone has such a large data URL in their CSS
243/// they probably should split it out, but we have a whole 32 bits to store the length so we may as well use it...
244///
245/// [6]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs#common_problems
// Implementation note: field `.0` packs the Type Flags, Kind and Value Data bits; field `.1` is the Value slot
// (usually the token length). Both fields are private, so all access goes through the methods on `Token`.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Token(u32, u32);
248
249impl Default for Token {
250 fn default() -> Self {
251 Self((Kind::Whitespace as u32) << 24, 0)
252 }
253}
254
/// Selects the upper 8 bits of the first [u32]: the 3 Type Flag bits followed by the 5 [Kind] bits.
const KIND_MASK: u32 = 0xFF00_0000;
/// Selects the lower 24 bits of the first [u32]: the Value Data region.
const LENGTH_MASK: u32 = 0x00FF_FFFF;
/// Clears the lowest 12 bits. Its complement (`!HALF_LENGTH_MASK`) selects the lower 12-bit half of Value Data.
const HALF_LENGTH_MASK: u32 = 0xFFFF_F000;
258
259impl Token {
260 /// Represents an empty token.
261 pub const EMPTY: Token = Token::new_whitespace(Whitespace::none(), 0);
262
263 /// Represents an EOF token.
264 pub const EOF: Token = Token(0b0, 0);
265
266 /// Represents a CDO (`<!--`) token.
267 pub const CDO: Token = Token(((Kind::CdcOrCdo as u32) << 24) & KIND_MASK, 4);
268
269 /// Represents a CDC (`-->`) token.
270 pub const CDC: Token = Token((((Kind::CdcOrCdo as u32) | 0b001_00000) << 24) & KIND_MASK, 3);
271
272 /// Represents a single ' ' space token.
273 pub const SPACE: Token = Token::new_whitespace(Whitespace::Space, 1);
274
275 /// Represents a single Tab token.
276 pub const TAB: Token = Token::new_whitespace(Whitespace::Tab, 1);
277
278 /// Represents a single `\n` token.
279 pub const NEWLINE: Token = Token::new_whitespace(Whitespace::Newline, 1);
280
281 /// Represents the Number `0`. This is not equal to other representations of zero, such as `00`, `0e0`, `0.0` and so
282 /// on.
283 pub const NUMBER_ZERO: Token = Token((((Kind::Number as u32) | 0b100_00000) << 24) & KIND_MASK, 1);
284
285 /// Represents the `:` token.
286 pub const COLON: Token = Token::new_delim_kind(Kind::Colon, ':');
287
288 /// Represents the `;` token.
289 pub const SEMICOLON: Token = Token::new_delim_kind(Kind::Semicolon, ';');
290
291 /// Represents the `,` token.
292 pub const COMMA: Token = Token::new_delim_kind(Kind::Comma, ',');
293
294 /// Represents the `[` token.
295 pub const LEFT_SQUARE: Token = Token::new_delim_kind(Kind::LeftSquare, '[');
296
297 /// Represents the `]` token.
298 pub const RIGHT_SQUARE: Token = Token::new_delim_kind(Kind::RightSquare, ']');
299
300 /// Represents the `(` token.
301 pub const LEFT_PAREN: Token = Token::new_delim_kind(Kind::LeftParen, '(');
302
303 /// Represents the `)` token.
304 pub const RIGHT_PAREN: Token = Token::new_delim_kind(Kind::RightParen, ')');
305
306 /// Represents the `{` token.
307 pub const LEFT_CURLY: Token = Token::new_delim_kind(Kind::LeftCurly, '{');
308
309 /// Represents the `}` token.
310 pub const RIGHT_CURLY: Token = Token::new_delim_kind(Kind::RightCurly, '}');
311
312 /// Represents a `!` [Kind::Delim] token.
313 pub const BANG: Token = Token::new_delim('!');
314
315 /// Represents a `#` [Kind::Delim] token.
316 pub const HASH: Token = Token::new_delim('#');
317
318 /// Represents a `$` [Kind::Delim] token.
319 pub const DOLLAR: Token = Token::new_delim('$');
320
321 /// Represents a `%` [Kind::Delim] token - not to be confused with the `%` dimension.
322 pub const PERCENT: Token = Token::new_delim('%');
323
324 /// Represents a `&` [Kind::Delim] token.
325 pub const AMPERSAND: Token = Token::new_delim('&');
326
327 /// Represents a `*` [Kind::Delim] token.
328 pub const ASTERISK: Token = Token::new_delim('*');
329
330 /// Represents a `+` [Kind::Delim] token.
331 pub const PLUS: Token = Token::new_delim('+');
332
333 /// Represents a `-` [Kind::Delim] token.
334 pub const DASH: Token = Token::new_delim('-');
335
336 /// Represents a `.` [Kind::Delim] token.
337 pub const PERIOD: Token = Token::new_delim('.');
338
339 /// Represents a `/` [Kind::Delim] token.
340 pub const SLASH: Token = Token::new_delim('/');
341
342 /// Represents a `<` [Kind::Delim] token.
343 pub const LESS_THAN: Token = Token::new_delim('<');
344
345 /// Represents a `=` [Kind::Delim] token.
346 pub const EQUALS: Token = Token::new_delim('=');
347
348 /// Represents a `>` [Kind::Delim] token.
349 pub const GREATER_THAN: Token = Token::new_delim('>');
350
351 /// Represents a `?` [Kind::Delim] token.
352 pub const QUESTION: Token = Token::new_delim('?');
353
354 /// Represents a `@` [Kind::Delim] token. Not to be confused with the @keyword token.
355 pub const AT: Token = Token::new_delim('@');
356
357 /// Represents a `\\` [Kind::Delim] token.
358 pub const BACKSLASH: Token = Token::new_delim('\\');
359
360 /// Represents a `^` [Kind::Delim] token.
361 pub const CARET: Token = Token::new_delim('^');
362
363 /// Represents a `_` [Kind::Delim] token.
364 pub const UNDERSCORE: Token = Token::new_delim('_');
365
366 /// Represents a `\`` [Kind::Delim] token.
367 pub const BACKTICK: Token = Token::new_delim('\'');
368
369 /// Represents a `|` [Kind::Delim] token.
370 pub const PIPE: Token = Token::new_delim('|');
371
372 /// Represents a `~` [Kind::Delim] token.
373 pub const TILDE: Token = Token::new_delim('~');
374
375 /// Represents a replacement character [Kind::Delim] token.
376 pub const REPLACEMENT_CHARACTER: Token = Token::new_delim(REPLACEMENT_CHARACTER);
377
378 /// Creates a "Dummy" token with no additional data, just the [Kind].
379 #[inline]
380 pub const fn dummy(kind: Kind) -> Self {
381 Self((kind as u32) << 24, 0)
382 }
383
384 /// Creates a "Dummy" token with no additional data, just [Kind::Ident].
385 #[inline]
386 pub const fn dummy_ident() -> Self {
387 Self((Kind::Ident as u32) << 24, 0)
388 }
389
390 /// Creates a [Kind::Whitesapce] token.
391 #[inline]
392 pub(crate) const fn new_whitespace(style: Whitespace, len: u32) -> Self {
393 let flags: u32 = Kind::Whitespace as u32 | ((style.to_bits() as u32) << 5);
394 Self((flags << 24) & KIND_MASK, len)
395 }
396
397 /// Creates a [Kind::Comment] token.
398 #[inline]
399 pub(crate) fn new_comment(style: CommentStyle, len: u32) -> Self {
400 let flags: u32 = Kind::Comment as u32 | ((style as u32) << 5);
401 Self((flags << 24) & KIND_MASK, len)
402 }
403
404 /// Creates a [Kind::Number] token.
405 #[inline]
406 pub(crate) fn new_number(is_float: bool, has_sign: bool, len: u32, value: f32) -> Self {
407 let flags: u32 = Kind::Number as u32 | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
408 Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), value.to_bits())
409 }
410
411 /// Creates a new [Kind::Dimension] token.
412 #[inline]
413 pub(crate) fn new_dimension(
414 is_float: bool,
415 has_sign: bool,
416 num_len: u32,
417 unit_len: u32,
418 value: f32,
419 atom: u8,
420 ) -> Self {
421 debug_assert!(num_len <= 4097);
422 let num_len = (num_len << 12) & HALF_LENGTH_MASK;
423 let is_known_unit = if unit_len < 32 { ((atom != 0) as u32) << 7 } else { 0 };
424 let unit_len = if is_known_unit == 0 { unit_len } else { unit_len << 7 | (atom as u32 & 0b1111111) };
425 let flags: u32 = Kind::Dimension as u32 | is_known_unit | ((is_float as u32) << 5) | ((has_sign as u32) << 6);
426 Self(((flags << 24) & KIND_MASK) | ((num_len | unit_len) & LENGTH_MASK), value.to_bits())
427 }
428
429 /// Creates a new [Kind::BadString] token. Bad Strings are like String tokens but during lexing they failed to fully tokenize
430 /// into a proper string token, usually due to containing newline characters.
431 #[inline]
432 pub(crate) fn new_bad_string(len: u32) -> Self {
433 Self(((Kind::BadString as u32) << 24) & KIND_MASK, len)
434 }
435
436 /// Creates a new [Kind::BadUrl] token. Bad URLs are like URL tokens but during lexing they failed to fully tokenize into a
437 /// proper URL token, usually due to containing newline characters.
438 #[inline]
439 pub(crate) fn new_bad_url(len: u32) -> Self {
440 Self(((Kind::BadUrl as u32) << 24) & KIND_MASK, len)
441 }
442
443 /// Creates a new [Kind::Ident] token.
444 #[inline]
445 pub(crate) fn new_ident(
446 contains_non_lower_ascii: bool,
447 dashed: bool,
448 contains_escape: bool,
449 atom: u32,
450 len: u32,
451 ) -> Self {
452 let flags: u32 = Kind::Ident as u32
453 | ((contains_non_lower_ascii as u32) << 5)
454 | ((dashed as u32) << 6)
455 | ((contains_escape as u32) << 7);
456 debug_assert!(atom & LENGTH_MASK == atom);
457 Self((flags << 24) & KIND_MASK | atom, len)
458 }
459
460 /// Creates a new [Kind::Function] token.
461 #[inline]
462 pub(crate) fn new_function(
463 contains_non_lower_ascii: bool,
464 dashed: bool,
465 contains_escape: bool,
466 atom: u32,
467 len: u32,
468 ) -> Self {
469 let flags: u32 = Kind::Function as u32
470 | ((contains_non_lower_ascii as u32) << 5)
471 | ((dashed as u32) << 6)
472 | ((contains_escape as u32) << 7);
473 debug_assert!(atom & LENGTH_MASK == atom);
474 Self((flags << 24) & KIND_MASK | atom, len)
475 }
476
477 /// Creates a new [Kind::AtKeyword] token.
478 #[inline]
479 pub(crate) fn new_atkeyword(
480 contains_non_lower_ascii: bool,
481 dashed: bool,
482 contains_escape: bool,
483 atom: u32,
484 len: u32,
485 ) -> Self {
486 let flags: u32 = Kind::AtKeyword as u32
487 | ((contains_non_lower_ascii as u32) << 5)
488 | ((dashed as u32) << 6)
489 | ((contains_escape as u32) << 7);
490 debug_assert!(atom & LENGTH_MASK == atom);
491 Self((flags << 24) & KIND_MASK | atom, len)
492 }
493
494 /// Creates a new [Kind::Hash] token.
495 #[inline]
496 pub(crate) fn new_hash(
497 contains_non_lower_ascii: bool,
498 first_is_ascii: bool,
499 contains_escape: bool,
500 len: u32,
501 hex_value: u32,
502 ) -> Self {
503 let flags: u32 = Kind::Hash as u32
504 | ((contains_non_lower_ascii as u32) << 5)
505 | ((first_is_ascii as u32) << 6)
506 | ((contains_escape as u32) << 7);
507 debug_assert!(len < (1 << 24));
508 Self((flags << 24) & KIND_MASK | (len & LENGTH_MASK), hex_value)
509 }
510
511 /// Creates a new [Kind::String] token.
512 #[inline]
513 pub(crate) fn new_string(quotes: QuoteStyle, has_close_quote: bool, contains_escape: bool, len: u32) -> Self {
514 debug_assert!(quotes != QuoteStyle::None);
515 let quotes = if quotes == QuoteStyle::Double { 0b001_00000 } else { 0b0 };
516 let flags: u32 =
517 Kind::String as u32 | quotes | ((has_close_quote as u32) << 6) | ((contains_escape as u32) << 7);
518 Self((flags << 24) & KIND_MASK, len)
519 }
520
521 /// Creates a new [Kind::Url] token.
522 #[inline]
523 pub(crate) fn new_url(
524 ends_with_paren: bool,
525 contains_whitespace_after_open_paren: bool,
526 contains_escape: bool,
527 leading_length: u32,
528 trailing_length: u32,
529 len: u32,
530 ) -> Self {
531 let leading_length = (leading_length << 12) & HALF_LENGTH_MASK;
532 let flags: u32 = Kind::Url as u32
533 | ((ends_with_paren as u32) << 5)
534 | ((contains_whitespace_after_open_paren as u32) << 6)
535 | ((contains_escape as u32) << 7);
536 Self((flags << 24) & KIND_MASK | ((leading_length | trailing_length) & LENGTH_MASK), len)
537 }
538
539 /// Creates a new [Kind::Delim] token.
540 #[inline]
541 pub(crate) const fn new_delim(char: char) -> Self {
542 let flags: u32 = Kind::Delim as u32;
543 Self((flags << 24) & KIND_MASK, char as u32)
544 }
545
546 /// Creates a new [Kind::Delim] token.
547 #[inline]
548 pub(crate) const fn new_delim_kind(kind: Kind, char: char) -> Self {
549 let flags: u32 = kind as u32;
550 Self((flags << 24) & KIND_MASK, char as u32)
551 }
552
553 /// Creates a new [Kind::Delim] token with associated whitespace.
554 #[inline]
555 pub(crate) const fn new_delim_with_associated_whitespace(char: char, rules: AssociatedWhitespaceRules) -> Self {
556 let flags: u32 = Kind::Delim as u32 | ((rules.to_bits() as u32) << 5);
557 Self((flags << 24) & KIND_MASK, char as u32)
558 }
559
560 /// \[private\]
561 /// Creates a new Token with an interned string.
562 #[inline]
563 pub fn new_interned(kind: Kind, bits: u32, len: u32) -> Token {
564 debug_assert!(kind == KindSet::IDENT_LIKE);
565 debug_assert!(bits & LENGTH_MASK == bits);
566 debug_assert!(len > 0);
567 Self(((kind as u32) << 24) & KIND_MASK | (bits & LENGTH_MASK), len + ((kind != Kind::Ident) as u32))
568 }
569
570 /// Returns the raw bits representing the [Kind].
571 #[inline(always)]
572 pub(crate) const fn kind_bits(&self) -> u8 {
573 (self.0 >> 24 & 0b11111) as u8
574 }
575
576 /// Returns the [Kind].
577 #[inline]
578 pub const fn kind(&self) -> Kind {
579 Kind::from_bits(self.kind_bits())
580 }
581
582 /// Check if the TF upper-most bit is set.
583 #[inline(always)]
584 const fn first_bit_is_set(&self) -> bool {
585 self.0 >> 31 == 1
586 }
587
588 /// Check if the TF second-upper-most bit is set.
589 #[inline(always)]
590 const fn second_bit_is_set(&self) -> bool {
591 self.0 >> 30 & 0b1 == 1
592 }
593
594 /// Check if the TF third-upper-most bit is set.
595 #[inline(always)]
596 const fn third_bit_is_set(&self) -> bool {
597 self.0 >> 29 & 0b1 == 1
598 }
599
600 /// Check if the [Kind] is "Ident Like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
601 #[inline(always)]
602 pub(crate) const fn is_ident_like(&self) -> bool {
603 self.kind_bits() & 0b11000 == 0b01000 && self.kind_bits() != Kind::String as u8
604 }
605
606 /// Check if the [Kind] is "Delim Like", i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
607 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
608 /// [Kind::RightCurly].
609 #[inline(always)]
610 pub(crate) const fn is_delim_like(&self) -> bool {
611 self.kind_bits() & 0b10000 == 0b10000
612 }
613
614 /// The only token with an empty length is EOF, but this method is available for symmetry with `len()`.
615 #[inline]
616 pub const fn is_empty(&self) -> bool {
617 self.kind_bits() == Kind::Eof as u8
618 }
619
620 /// Returns the amount of characters (utf-8 code points) this Token represents in the underlying source text.
621 #[inline]
622 pub const fn len(&self) -> u32 {
623 if self.kind_bits() == Kind::Eof as u8 {
624 0
625 } else if self.is_delim_like() {
626 debug_assert!(matches!(
627 self.kind(),
628 Kind::Delim
629 | Kind::Colon | Kind::Semicolon
630 | Kind::Comma | Kind::LeftSquare
631 | Kind::RightSquare
632 | Kind::LeftParen
633 | Kind::RightParen
634 | Kind::LeftCurly
635 | Kind::RightCurly
636 ));
637 self.char().unwrap().len_utf8() as u32
638 } else if self.kind_bits() == Kind::Number as u8 {
639 self.numeric_len()
640 } else if self.kind_bits() == Kind::Dimension as u8 {
641 if self.first_bit_is_set() {
642 self.numeric_len() + (self.0 >> 7 & 0b11111)
643 } else {
644 ((self.0 & LENGTH_MASK) >> 12) + (self.0 & !HALF_LENGTH_MASK)
645 }
646 } else if self.kind_bits() == Kind::Hash as u8 {
647 self.0 & LENGTH_MASK
648 } else {
649 self.1
650 }
651 }
652
653 /// If the [Kind] is "Delim Like" (i.e. it is [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
654 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
655 /// [Kind::RightCurly]) then this will return a [Some] with a [char] representing the value.
656 /// For non-delim-like tokens this will return [None].
657 pub const fn char(&self) -> Option<char> {
658 if self.is_delim_like() {
659 return char::from_u32(self.1);
660 }
661 None
662 }
663
664 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and is an integer - i.e. it has no `.`.
665 #[inline]
666 pub const fn is_int(&self) -> bool {
667 self.kind_bits() & 0b11100 == 0b00100 && !self.third_bit_is_set()
668 }
669
670 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and is a float - i.e. it has decimal places. This will be
671 /// `true` even if the decimal places are 0. e.g. `0.0`.
672 #[inline]
673 pub const fn is_float(&self) -> bool {
674 self.kind_bits() & 0b11100 == 0b00100 && self.third_bit_is_set()
675 }
676
677 /// The [Token] is a [Kind::Dimension] or [Kind::Number] and the underlying character data included a `-` or `+`
678 /// character. Note that a positive number may not necessarily have a sign, e.g. `3` will return false, while `+3`
679 /// will return `true`.
680 #[inline]
681 pub const fn has_sign(&self) -> bool {
682 self.kind_bits() & 0b11100 == 0b00100 && self.second_bit_is_set()
683 }
684
685 /// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the amount of characters used to
686 /// represent this number in the underlying source text. Numbers may be inefficiently encoded in the source text,
687 /// e.g. `0.0000`.
688 ///
689 /// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
690 #[inline]
691 pub const fn numeric_len(&self) -> u32 {
692 debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
693 if self.kind_bits() == Kind::Dimension as u8 {
694 (self.0 & LENGTH_MASK) >> 12
695 } else if self.first_bit_is_set() {
696 (self.0 & LENGTH_MASK) >> 16
697 } else {
698 self.0 & LENGTH_MASK
699 }
700 }
701
702 /// If the [Token] is a [Kind::Dimension] or [Kind::Number] then this returns the [f32] representation of the number's
703 /// value.
704 ///
705 /// Asserts: the `kind()` is [Kind::Dimension] or [Kind::Number].
706 #[inline]
707 pub fn value(&self) -> f32 {
708 debug_assert!(matches!(self.kind(), Kind::Number | Kind::Dimension));
709 f32::from_bits(self.1)
710 }
711
712 /// Returns the [Whitespace].
713 ///
714 /// If the [Token] is not a [Kind::Whitespace] this will return [Whitespace::none()].
715 #[inline]
716 pub fn whitespace_style(&self) -> Whitespace {
717 if self.kind_bits() == Kind::Whitespace as u8 {
718 Whitespace::from_bits((self.0 >> 29) as u8)
719 } else {
720 Whitespace::none()
721 }
722 }
723
724 /// Returns the [AssociatedWhitespaceRules].
725 ///
726 /// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
727 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
728 /// [Kind::RightCurly]) then this will always return `AssociatedWhitespaceRules::none()`.
729 #[inline]
730 pub fn associated_whitespace(&self) -> AssociatedWhitespaceRules {
731 if self.is_delim_like() {
732 AssociatedWhitespaceRules::from_bits((self.0 >> 29) as u8)
733 } else {
734 AssociatedWhitespaceRules::none()
735 }
736 }
737
738 /// Returns a new [Token] with the [AssociatedWhitespaceRules] set to the given [AssociatedWhitespaceRules],
739 /// if possible.
740 ///
741 /// If the [Kind] is not "Delim Like" (i.e. it is not [Kind::Delim], [Kind::Colon], [Kind::Semicolon], [Kind::Comma],
742 /// [Kind::LeftSquare], [Kind::RightSquare], [Kind::LeftParen], [Kind::RightParen], [Kind::LeftCurly],
743 /// [Kind::RightCurly]) then this will return the same [Token].
744 /// If the [AssociatedWhitespaceRules] is different it will return a new [Token].
745 #[inline]
746 pub fn with_associated_whitespace(&self, rules: AssociatedWhitespaceRules) -> Token {
747 if !self.is_delim_like() {
748 return *self;
749 }
750 Token::new_delim_with_associated_whitespace(self.char().unwrap(), rules)
751 }
752
753 /// Returns the [CommentStyle].
754 ///
755 /// If the [Token] is not a [Kind::Comment] this will return [None].
756 #[inline]
757 pub fn comment_style(&self) -> Option<CommentStyle> {
758 if self.kind_bits() == Kind::Comment as u8 { CommentStyle::from_bits((self.0 >> 29) as u8) } else { None }
759 }
760
761 /// Returns the [QuoteStyle].
762 ///
763 /// If the [Token] is not a [Kind::String] this will return [QuoteStyle::None].
764 #[inline]
765 pub fn quote_style(&self) -> QuoteStyle {
766 if self.kind_bits() == Kind::String as u8 {
767 if self.third_bit_is_set() {
768 return QuoteStyle::Double;
769 } else {
770 return QuoteStyle::Single;
771 }
772 }
773 QuoteStyle::None
774 }
775
776 /// Returns a new [Token] with the [QuoteStyle] set to the given [QuoteStyle], if possible.
777 ///
778 /// If the [Token] is not a [Kind::String], or the [QuoteStyle] is already the given [QuoteStyle] this will return the same [Token].
779 /// If the [QuoteStyle] is different it will return a new [Token].
780 /// [QuoteStyle] must not be [QuoteStyle::None]
781 #[inline]
782 pub fn with_quotes(&self, quote_style: QuoteStyle) -> Token {
783 debug_assert!(quote_style != QuoteStyle::None);
784 if self.kind_bits() != Kind::String as u8 || quote_style == self.quote_style() {
785 return *self;
786 }
787 Token::new_string(quote_style, self.has_close_quote(), self.contains_escape_chars(), self.len())
788 }
789
790 /// If the [Token] is a [Kind::String] this checks if the string ended in a close quote.
791 /// It is possible to have a valid String token that does not end in a close quote, by eliding the quote at the end of
792 /// a file.
793 ///
794 /// Asserts: The [Kind] is [Kind::String].
795 #[inline]
796 pub const fn has_close_quote(&self) -> bool {
797 debug_assert!(self.kind_bits() == Kind::String as u8);
798 self.second_bit_is_set()
799 }
800
801 /// Checks if it is possible for the [Token] to contain escape characters. Numbers, for example, cannot. Idents can.
802 #[inline]
803 pub const fn can_escape(&self) -> bool {
804 self.kind_bits() == Kind::String as u8 || self.kind_bits() == Kind::Dimension as u8 || self.is_ident_like()
805 }
806
807 /// If the [Token] can escape, checks if the underlying source text contained escape characters.
808 ///
809 /// Asserts: The token can escape ([Token::can_escape()]).
810 #[inline]
811 pub const fn contains_escape_chars(&self) -> bool {
812 if self.kind_bits() == Kind::Dimension as u8 {
813 // Always assume Dimension contains escape because we have other fast paths to handle dimension units
814 return true;
815 }
816 self.can_escape() && self.first_bit_is_set()
817 }
818
819 /// If the [Token] is Ident like, checks if the first two code points are HYPHEN-MINUS (`-`).
820 ///
821 /// Asserts: The token is "ident like", i.e. it is [Kind::Ident], [Kind::AtKeyword], [Kind::Function], [Kind::Hash].
822 #[inline]
823 pub const fn is_dashed_ident(&self) -> bool {
824 debug_assert!(self.is_ident_like());
825 self.second_bit_is_set()
826 }
827
828 /// Checks if the [Token] is Ident like and none of the characters are ASCII upper-case.
829 #[inline]
830 pub const fn is_lower_case(&self) -> bool {
831 self.is_ident_like() && !self.third_bit_is_set()
832 }
833
834 #[inline]
835 pub fn atom_bits(&self) -> u32 {
836 if self.kind_bits() == Kind::Dimension as u8 && self.first_bit_is_set() {
837 self.0 & 0b111_1111
838 } else if self.is_ident_like() && self.kind_bits() != Kind::Hash as u8 {
839 self.0 & LENGTH_MASK
840 } else {
841 0
842 }
843 }
844
845 /// Checks if the [Token] is Trivia-like, that is [Kind::Comment], [Kind::Whitespace], [Kind::Eof]
846 #[inline]
847 pub const fn is_trivia(&self) -> bool {
848 self.kind_bits() & 0b000011 == self.kind_bits()
849 }
850
851 /// If the [Token] is [Kind::Url], checks if there are leading Whitespace characters before the inner value.
852 ///
853 /// Asserts: The token is [Kind::Url].
854 #[inline]
855 pub const fn url_has_leading_space(&self) -> bool {
856 debug_assert!(self.kind_bits() == Kind::Url as u8);
857 self.second_bit_is_set()
858 }
859
860 /// If the [Token] is [Kind::Url], checks if the closing parenthesis is present.
861 ///
862 /// Asserts: The token is [Kind::Url].
863 #[inline]
864 pub const fn url_has_closing_paren(&self) -> bool {
865 debug_assert!(self.kind_bits() == Kind::Url as u8);
866 self.third_bit_is_set()
867 }
868
869 /// If the [Token] is [Kind::Hash], checks if the Hash is "ID-like" (i.e its first character is ASCII).
870 ///
871 /// Asserts: The token is [Kind::Hash].
872 #[inline]
873 pub const fn hash_is_id_like(&self) -> bool {
874 debug_assert!(self.kind_bits() == Kind::Hash as u8);
875 self.second_bit_is_set()
876 }
877
878 /// Checks if the [Token] is [Kind::BadString] or [Kind::BadUrl].
879 #[inline]
880 pub const fn is_bad(&self) -> bool {
881 (self.kind_bits() | 0b00001) & 0b11001 == 1
882 }
883
884 /// Checks if the [Token] is [Kind::CdcOrCdo] and is the CDC variant of that token.
885 #[inline]
886 pub const fn is_cdc(&self) -> bool {
887 self.kind_bits() == (Kind::CdcOrCdo as u8) && self.third_bit_is_set()
888 }
889
890 /// Some tokens may have a "leading" part:
891 /// - [Kind::AtKeyword] always starts with a `@`,
892 /// - [Kind::Hash] with a `#`.
893 /// - [Kind::String] with a `"` or `'`.
894 /// - [Kind::Comment] with a leading `/*` (or `//`).
895 /// - [Kind::Dimension] has a leading numeric portion.
896 /// - [Kind::Url] has the leading `url(` ident (which may vary in exact representation).
897 ///
898 /// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
899 /// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
900 /// data which is likely to be of greater use.
901 pub fn leading_len(&self) -> u32 {
902 match self.kind() {
903 Kind::AtKeyword | Kind::Hash | Kind::String => 1,
904 Kind::Dimension => self.numeric_len(),
905 Kind::Comment => 2,
906 Kind::Url => (self.0 & LENGTH_MASK) >> 12,
907 _ => 0,
908 }
909 }
910
911 /// Some tokens may have a "trailing" part:
912 /// - [Kind::Function] will always have an opening `(`.
913 /// - [Kind::String] may have a closing `"` or `'`.
914 /// - [Kind::Comment] may have a closing `*/`
915 /// - [Kind::Url] may have a clsoing `)`.
916 ///
917 /// This function returns the length of that, irrespective of the [Kind]. For other kinds not listed, this will return
918 /// `0`, but for the above kinds it will calculate the leading length. This is useful for parsing out the underlying
919 /// data which is likely to be of greater use.
920 pub fn trailing_len(&self) -> u32 {
921 match self.kind() {
922 Kind::Function => 1,
923 Kind::String => self.has_close_quote() as u32,
924 Kind::Comment if self.comment_style().unwrap().is_block() => 2,
925 Kind::Url => self.0 & !HALF_LENGTH_MASK,
926 _ => 0,
927 }
928 }
929
930 /// Certain kinds have a [PairWise] equivalent:
931 /// - [Kind::LeftParen] has [Kind::RightParen]
932 /// - [Kind::LeftCurly] has [Kind::RightCurly]
933 /// - [Kind::LeftSquare] has [Kind::RightSquare]
934 ///
935 /// This function returns the [PairWise] enum, if the [Token] is one of the above listed [Kinds][Kind]. For any other
936 /// [Kind] this returns [None].
937 #[inline]
938 pub fn to_pairwise(&self) -> Option<PairWise> {
939 PairWise::from_token(self)
940 }
941
942 /// A convenience function for `Cursor::new(offset, token)`.
943 #[inline(always)]
944 pub fn with_cursor(self, offset: SourceOffset) -> Cursor {
945 Cursor::new(offset, self)
946 }
947
948 /// If the [Kind] is [Kind::Hash] then this token may have had the opportunity to be parsed as a `<hex-value>` (e.g.
949 /// `#fff`). When this happens the character data is parsed during tokenization into a u32 which stores the
950 /// RR,GG,BB,AA values.
951 #[inline(always)]
952 pub fn hex_value(self) -> u32 {
953 if self == Kind::Hash { self.1 } else { 0 }
954 }
955
956 /// If this [Token] is preceded by the [Token] `other` then a separating token (e.g. a comment) will need to be
957 /// inserted between these the two tokens during serialization, in order for them to be able to be re-tokenized as
958 /// the same tokens. For example an Ident ("a") adjacent to an Ident ("b"), if serialized without whitespace, would
959 /// create a single Ident ("ab"). The rules for estbalishing whether or not these tokens needs whitespace are quite
960 /// simple and are effectively [defined in the serialization section of the spec][1]. To reproduce the table:
961 ///
962 /// [1]: https://drafts.csswg.org/css-syntax/#serialization
963 ///
964 /// | | ident | function | url | bad url | - | number | percentage | dimension | CDC | ( | * | % |
965 /// |:-----------|:-----:|:--------:|:---:|:-------:|:-:|:------:|:----------:|:---------:|:---:|:-:|:-:|:-:|
966 /// | ident | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | |
967 /// | at-keyword | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
968 /// | hash | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
969 /// | dimension | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
970 /// | # | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
971 /// | \- | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | ✗ | | | |
972 /// | number | ✗ | ✗ | ✗ | ✗ | | ✗ | ✗ | ✗ | ✗ | | | ✗ |
973 /// | @ | ✗ | ✗ | ✗ | ✗ | ✗ | | | | ✗ | | | |
974 /// | . | | | | | | ✗ | ✗ | ✗ | | | | |
975 /// | + | | | | | | ✗ | ✗ | ✗ | | | | |
976 /// | / | | | | | | | | | | | ✗ | |
977 ///
978 /// The one exception not in this table is that two consecutive `/` characters should also be separated by spaces in
979 /// order to avoid abmiguities with CSS-alike languages that treat two consecutive `/` characters as a single line
980 /// comment.
981 ///
982 /// # Example
983 ///
984 /// ```
985 /// use css_lexer::*;
986 /// let mut lexer = Lexer::new(&EmptyAtomSet::ATOMS, "10 %");
987 /// let first = lexer.advance();
988 /// let _ = lexer.advance(); // Whitespace
989 /// let second = lexer.advance();
990 /// assert!(first.needs_separator_for(second));
991 /// ```
992 pub fn needs_separator_for(&self, second: Token) -> bool {
993 if second == AssociatedWhitespaceRules::EnforceBefore && *self != Kind::Whitespace
994 || *self == AssociatedWhitespaceRules::EnforceAfter && second != Kind::Whitespace
995 {
996 // We need whitespace after, unless the next token is actually whitespace.
997 return true;
998 }
999 if *self == AssociatedWhitespaceRules::BanAfter {
1000 return false;
1001 }
1002 match self.kind() {
1003 Kind::Ident => {
1004 (matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1005 // numbers with a `-` need separating, but with `+` they do not.
1006 (!second.has_sign() || second.value() < 0.0))
1007 || matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1008 || matches!(second.char(), Some('(' | '-'))
1009 || second.is_cdc()
1010 }
1011 Kind::AtKeyword | Kind::Hash | Kind::Dimension => {
1012 (matches!(second.kind(), Kind::Number | Kind::Dimension) &&
1013 // numbers with a `-` need separating, but with `+` they do not.
1014 (!second.has_sign() || second.value() < 0.0))
1015 || matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1016 || matches!(second.char(), Some('-'))
1017 || second.is_cdc()
1018 }
1019 Kind::Number => {
1020 matches!(
1021 second.kind(),
1022 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1023 ) || matches!(second.char(), Some('%'))
1024 || second.is_cdc()
1025 }
1026 _ => match self.char() {
1027 Some('#') => {
1028 matches!(
1029 second.kind(),
1030 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1031 ) || matches!(second.char(), Some('-'))
1032 || second.is_cdc()
1033 }
1034 Some('-') => {
1035 matches!(
1036 second.kind(),
1037 Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl | Kind::Number | Kind::Dimension
1038 ) || matches!(second.char(), Some('-'))
1039 || second.is_cdc()
1040 }
1041 Some('@') => {
1042 matches!(second.kind(), Kind::Ident | Kind::Function | Kind::Url | Kind::BadUrl)
1043 || matches!(second.char(), Some('-'))
1044 || second.is_cdc()
1045 }
1046 Some('.') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1047 Some('+') => matches!(second.kind(), Kind::Number | Kind::Dimension),
1048 Some('/') => matches!(second.char(), Some('*' | '/')),
1049 _ => false,
1050 },
1051 }
1052 }
1053}
1054
1055impl core::fmt::Debug for Token {
1056 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1057 let mut d = f.debug_struct(format!("Token::{}", self.kind().as_str()).as_str());
1058 match self.kind() {
1059 Kind::Eof => &mut d,
1060 Kind::Number => d.field("value", &self.value()).field("len", &self.numeric_len()),
1061 Kind::Dimension => {
1062 d.field("value", &self.value()).field("len", &self.numeric_len()).field("dimension_len", &self.len())
1063 }
1064 _ if self.is_delim_like() => {
1065 d.field("char", &self.char().unwrap()).field("len", &self.len());
1066 if !self.associated_whitespace().is_none() {
1067 d.field("associated_whitespace", &self.associated_whitespace());
1068 }
1069 &mut d
1070 }
1071 Kind::String => d
1072 .field("quote_style", &if self.first_bit_is_set() { "Double" } else { "Single" })
1073 .field("has_close_quote", &self.second_bit_is_set())
1074 .field("contains_escape_chars", &self.third_bit_is_set())
1075 .field("len", &self.len()),
1076 Kind::Ident | Kind::Function | Kind::AtKeyword => d
1077 .field("is_lower_case", &self.first_bit_is_set())
1078 .field("is_dashed_ident", &self.second_bit_is_set())
1079 .field("contains_escape_chars", &self.third_bit_is_set())
1080 .field("len", &self.len()),
1081 Kind::Hash => d
1082 .field("is_lower_case", &self.first_bit_is_set())
1083 .field("hash_is_id_like", &self.second_bit_is_set())
1084 .field("contains_escape_chars", &self.third_bit_is_set())
1085 .field("len", &self.len()),
1086 Kind::Url => d
1087 .field("url_has_closing_paren", &self.first_bit_is_set())
1088 .field("url_has_leading_space", &self.second_bit_is_set())
1089 .field("contains_escape_chars", &self.third_bit_is_set())
1090 .field("len", &self.len()),
1091 Kind::CdcOrCdo => d.field("is_cdc", &self.first_bit_is_set()).field("len", &self.len()),
1092 Kind::Whitespace => d.field("contains", &self.whitespace_style()).field("len", &self.len()),
1093 _ => d
1094 .field("flag_0", &self.first_bit_is_set())
1095 .field("flag_1", &self.second_bit_is_set())
1096 .field("flag_2", &self.third_bit_is_set())
1097 .field("len", &self.len()),
1098 }
1099 .finish()
1100 }
1101}
1102
1103impl std::fmt::Display for Token {
1104 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1105 match self.kind() {
1106 Kind::Delim => write!(f, "Delim({})", self.char().unwrap()),
1107 k => write!(f, "{}", k.as_str()),
1108 }
1109 }
1110}
1111
#[cfg(feature = "serde")]
impl serde::ser::Serialize for Token {
	/// Serializes the token as a `Token` struct with two fields: `kind` (the kind's name) and `len`.
	///
	/// The [Token::EMPTY] token is serialized as `none` instead.
	fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
	where
		S: serde::ser::Serializer,
	{
		use serde::ser::SerializeStruct;
		if *self == Self::EMPTY {
			return serializer.serialize_none();
		}
		// The declared field count must match the number of `serialize_field` calls below; formats that write a
		// length prefix rely on it.
		let mut state = serializer.serialize_struct("Token", 2)?;
		state.serialize_field("kind", self.kind().as_str())?;
		state.serialize_field("len", &self.len())?;
		state.end()
	}
}
1128
1129impl From<Token> for Kind {
1130 fn from(token: Token) -> Self {
1131 token.kind()
1132 }
1133}
1134
1135impl PartialEq<Kind> for Token {
1136 fn eq(&self, other: &Kind) -> bool {
1137 self.kind_bits() == *other as u8
1138 }
1139}
1140
1141impl From<Token> for KindSet {
1142 fn from(token: Token) -> Self {
1143 KindSet::new(&[token.kind()])
1144 }
1145}
1146
1147impl PartialEq<KindSet> for Token {
1148 fn eq(&self, other: &KindSet) -> bool {
1149 other.contains_bits(self.kind_bits())
1150 }
1151}
1152
1153impl From<Token> for QuoteStyle {
1154 fn from(token: Token) -> Self {
1155 token.quote_style()
1156 }
1157}
1158
1159impl PartialEq<QuoteStyle> for Token {
1160 fn eq(&self, other: &QuoteStyle) -> bool {
1161 &self.quote_style() == other
1162 }
1163}
1164
1165impl From<Token> for Whitespace {
1166 fn from(token: Token) -> Self {
1167 token.whitespace_style()
1168 }
1169}
1170
1171impl PartialEq<Whitespace> for Token {
1172 fn eq(&self, other: &Whitespace) -> bool {
1173 self.whitespace_style().intersects(*other)
1174 }
1175}
1176
1177impl PartialEq<AssociatedWhitespaceRules> for Token {
1178 fn eq(&self, other: &AssociatedWhitespaceRules) -> bool {
1179 self.associated_whitespace().intersects(*other)
1180 }
1181}
1182
1183impl PartialEq<CommentStyle> for Token {
1184 fn eq(&self, other: &CommentStyle) -> bool {
1185 self.comment_style().map(|style| &style == other).unwrap_or(false)
1186 }
1187}
1188
1189impl PartialEq<PairWise> for Token {
1190 fn eq(&self, other: &PairWise) -> bool {
1191 self.to_pairwise().map(|style| &style == other).unwrap_or(false)
1192 }
1193}
1194
1195impl PartialEq<char> for Token {
1196 fn eq(&self, other: &char) -> bool {
1197 self.char().map(|char| char == *other).unwrap_or(false)
1198 }
1199}
1200
#[test]
fn size_test() {
	// A Token must stay within 8 bytes.
	let token_size = ::std::mem::size_of::<Token>();
	assert_eq!(token_size, 8);
}
1205
#[test]
fn test_new_whitespace() {
	// The predefined whitespace tokens each report their own style.
	for (token, style) in
		[(Token::SPACE, Whitespace::Space), (Token::TAB, Whitespace::Tab), (Token::NEWLINE, Whitespace::Newline)]
	{
		assert_eq!(token, Kind::Whitespace);
		assert_eq!(token, style);
	}

	let spaces = Token::new_whitespace(Whitespace::Space, 4);
	assert_eq!(spaces, Kind::Whitespace);
	assert_eq!(spaces.len(), 4);

	// Comparing against a Whitespace is an overlap test, so a mixed token matches each of its parts.
	assert_eq!(Token::new_whitespace(Whitespace::Space | Whitespace::Newline, 4), Whitespace::Space);
	assert_eq!(Token::new_whitespace(Whitespace::Tab | Whitespace::Space, 4), Whitespace::Tab);

	let newlines = Token::new_whitespace(Whitespace::Newline, 4);
	assert_eq!(newlines, Whitespace::Newline);
	assert_eq!(newlines.len(), 4);
}
1221
#[test]
fn test_new_comment() {
	let block = Token::new_comment(CommentStyle::Block, 4);
	assert_eq!(block, Kind::Comment);
	assert_eq!(block, CommentStyle::Block);

	let single = Token::new_comment(CommentStyle::Single, 4);
	assert_eq!(single, CommentStyle::Single);
}
1228
#[test]
fn test_new_number() {
	let unsigned = Token::new_number(false, false, 3, 4.2);
	assert_eq!(unsigned, Kind::Number);
	assert_eq!(unsigned.value(), 4.2);
	assert_eq!(unsigned.len(), 3);
	assert!(!unsigned.has_sign());

	let signed_long = Token::new_number(false, true, 9, 4.2);
	assert_eq!(signed_long, Kind::Number);
	assert_eq!(signed_long.value(), 4.2);
	assert_eq!(signed_long.len(), 9);

	assert!(Token::new_number(false, true, 3, 4.2).has_sign());
	// Float-ness comes from the flag passed to the constructor, not from the value.
	assert!(!Token::new_number(false, true, 3, 4.0).is_float());
	assert!(Token::new_number(true, false, 3, 4.2).is_float());
}
1242
#[test]
fn test_new_string() {
	let single = Token::new_string(QuoteStyle::Single, false, false, 4);
	assert_eq!(single, Kind::String);
	assert_eq!(single, QuoteStyle::Single);
	assert!(!single.has_close_quote());
	assert!(!single.contains_escape_chars());
	assert_eq!(single.len(), 4);

	let double_open = Token::new_string(QuoteStyle::Double, false, false, 4);
	assert_eq!(double_open, Kind::String);
	assert_eq!(double_open, QuoteStyle::Double);

	let double_closed = Token::new_string(QuoteStyle::Double, true, false, 4);
	assert!(double_closed.has_close_quote());
	assert!(!double_closed.contains_escape_chars());
	assert_eq!(Token::new_string(QuoteStyle::Double, true, false, 5).len(), 5);

	// The escape flag is reported whether or not the close quote is present.
	for has_close_quote in [true, false] {
		assert!(Token::new_string(QuoteStyle::Double, has_close_quote, true, 4).contains_escape_chars());
	}
}
1258
#[test]
fn test_new_hash() {
	let plain = Token::new_hash(false, false, false, 4, 0);
	assert_eq!(plain, Kind::Hash);
	assert!(!plain.contains_escape_chars());

	let escaped = Token::new_hash(false, false, true, 4, 0);
	assert!(escaped.contains_escape_chars());
	assert!(escaped.is_lower_case());

	let upper = Token::new_hash(true, false, false, 4, 0);
	assert!(!upper.is_lower_case());
	assert_eq!(upper.len(), 4);
	assert_eq!(upper.hex_value(), 0);

	assert_eq!(Token::new_hash(true, false, false, 4, 18).hex_value(), 18);
}
1270
#[test]
#[should_panic]
fn test_new_string_with_quotes_none() {
	// Building a string token with QuoteStyle::None is expected to panic.
	let _ = Token::new_string(QuoteStyle::None, false, true, 4);
}
1276
#[test]
fn test_new_delim() {
	// Each case pairs a delimiter with its expected `len()`, which is the UTF-8 encoded length of the char.
	for (ch, expected_len) in [('>', 1), ('.', 1), ('ℝ', 3), ('💣', 4)] {
		let token = Token::new_delim(ch);
		assert_eq!(token, Kind::Delim);
		assert_eq!(token, ch);
		assert_eq!(token.len(), expected_len);
	}
}
1294
#[test]
fn with_associated_whitespace() {
	let token = Token::new_delim('>')
		.with_associated_whitespace(AssociatedWhitespaceRules::EnforceBefore | AssociatedWhitespaceRules::EnforceAfter);
	// Comparing a Token against rules is an overlap test, so each rule is checked on its own; a combined expectation
	// would pass as soon as either one was present.
	assert_eq!(token, AssociatedWhitespaceRules::EnforceBefore);
	assert_eq!(token, AssociatedWhitespaceRules::EnforceAfter);
}
1304
#[test]
fn test_with_quotes() {
	// Switching quote style keeps the close-quote flag, the escape flag and the length.
	let single_to_double = Token::new_string(QuoteStyle::Single, false, false, 4).with_quotes(QuoteStyle::Double);
	assert_eq!(single_to_double, Token::new_string(QuoteStyle::Double, false, false, 4));

	let double_to_single = Token::new_string(QuoteStyle::Double, true, true, 8).with_quotes(QuoteStyle::Single);
	assert_eq!(double_to_single, Token::new_string(QuoteStyle::Single, true, true, 8));
}
1316
// Each quote style gets its own test: under `#[should_panic]` only the first panicking statement is ever exercised,
// so two calls in one test would leave the second one untested.
#[test]
#[should_panic]
fn test_with_quotes_none() {
	Token::new_string(QuoteStyle::Single, false, true, 4).with_quotes(QuoteStyle::None);
}

#[test]
#[should_panic]
fn test_with_quotes_none_double() {
	Token::new_string(QuoteStyle::Double, false, true, 4).with_quotes(QuoteStyle::None);
}
1323
#[test]
fn test_new_dimension() {
	// Each case is (numeric length, unit length, value); the total length is the sum of both lengths.
	for (numeric_len, unit_len, value) in [(3, 3, 999.0), (5, 2, 8191.0)] {
		let token = Token::new_dimension(false, false, numeric_len, unit_len, value, 0);
		assert_eq!(token, Kind::Dimension);
		assert_eq!(token.value(), value);
		assert_eq!(token.numeric_len(), numeric_len);
		assert_eq!(token.len(), numeric_len + unit_len);
		assert!(!token.is_float());
		assert!(!token.has_sign());
	}
	// The stored value round-trips across a range of negative and positive integers.
	for i in -8191..8191 {
		let expected = i as f32;
		let token = Token::new_dimension(false, false, 9, 3, expected, 0);
		assert_eq!(token.value(), expected);
	}
}