Skip to main content

mos_parse/
inline.rs

1use crate::parser::Parser;
2use crate::support::{find_byte, scan_label_chars};
3use mos_core::{Suggestion, codes};
4
5use crate::{Inline, InlineKind};
6
/// Text styling accumulated while the inline parser descends through
/// nested emphasis delimiters.
///
/// `PartialEq`/`Eq` are derived so two styles can be compared directly
/// (`==`, `assert_eq!`) rather than only through `matches!`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum InlineStyle {
    /// No emphasis delimiter is open; this is the default style.
    #[default]
    Plain,
    /// Reached by applying the emphasis delimiter to `Plain`.
    Emphasis,
    /// Reached by applying the strong delimiter to `Plain`.
    Strong,
    /// Both emphasis and strong are in effect.
    BoldItalic,
}
15
16impl InlineStyle {
17    fn with(self, delimiter: Delimiter) -> Self {
18        match delimiter {
19            Delimiter::Strong => self.with_strong(),
20            Delimiter::Emphasis => self.with_emphasis(),
21        }
22    }
23
24    fn with_strong(self) -> Self {
25        match self {
26            Self::Plain => Self::Strong,
27            Self::Emphasis | Self::Strong | Self::BoldItalic => Self::BoldItalic,
28        }
29    }
30
31    fn with_emphasis(self) -> Self {
32        match self {
33            Self::Plain => Self::Emphasis,
34            Self::Strong | Self::Emphasis | Self::BoldItalic => Self::BoldItalic,
35        }
36    }
37
38    fn kind(self) -> InlineKind {
39        match self {
40            Self::Plain => InlineKind::Text,
41            Self::Emphasis => InlineKind::Emphasis,
42            Self::Strong => InlineKind::Strong,
43            Self::BoldItalic => InlineKind::BoldItalic,
44        }
45    }
46}
47
/// A star-based emphasis delimiter.
///
/// `PartialEq`/`Eq` are derived so delimiters can be compared directly
/// (`==`, `assert_eq!`) rather than only through `matches!`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Delimiter {
    /// The single-star delimiter `*`.
    Emphasis,
    /// The double-star delimiter `**`.
    Strong,
}
53
54impl Delimiter {
55    fn width(self) -> usize {
56        match self {
57            Self::Emphasis => 1,
58            Self::Strong => 2,
59        }
60    }
61
62    fn closing_text(self) -> &'static str {
63        match self {
64            Self::Emphasis => "*",
65            Self::Strong => "**",
66        }
67    }
68}
69
/// Result of one `parse_inline_segment` call.
struct ParsedSegment {
    /// Inlines produced for the scanned range, in source order.
    inlines: Vec<Inline>,
    /// Slice-relative index at which the caller resumes scanning: just past
    /// the closing delimiter when one was found, otherwise the slice length.
    next: usize,
    /// `Some` when the segment stopped at its closing delimiter, `None`
    /// when the end of the slice was reached first.
    closed: Option<ClosedDelimiter>,
}
75
/// Marks that a segment was terminated by its closing delimiter.
struct ClosedDelimiter {
    /// Slice-relative index just past the consumed closing delimiter; the
    /// opener's caller uses it to widen the children's spans.
    end: usize,
}
79
80impl Parser<'_> {
81    /// Tokenize `slice` (whose first byte sits at `base` in `self.src`)
82    /// into inline runs. Backtick code and `@label` references are
83    /// atomic; emphasis delimiters can nest into bold+italic text runs.
84    pub(crate) fn parse_inlines(&mut self, slice: &str, base: usize) -> Vec<Inline> {
85        self.parse_inline_segment(slice, base, 0, InlineStyle::default(), None)
86            .inlines
87    }
88
    /// Parse `slice[from..]` into inline runs rendered with `style`.
    ///
    /// `base` is added to every slice-relative index before a span or a
    /// diagnostic range is built.
    ///
    /// When `close` is `Some(delimiter)`, the first `*` run accepted by
    /// `delimiter_closes` ends the segment: the result carries
    /// `closed: Some(..)` and `next` points just past the consumed closing
    /// stars. If the end of `slice` is reached first, `closed` is `None` and
    /// `next` is `slice.len()`; a recursive caller that was waiting for the
    /// closer then drops the attempt and keeps its opener as literal text.
    fn parse_inline_segment(
        &mut self,
        slice: &str,
        base: usize,
        from: usize,
        style: InlineStyle,
        close: Option<Delimiter>,
    ) -> ParsedSegment {
        let bytes = slice.as_bytes();
        let mut out: Vec<Inline> = Vec::new();
        // `pending` accumulates characters that belong to the current
        // styled text run but aren't a verbatim slice of `slice`: at
        // the moment, only the soft-hyphen shorthand `\-` (which
        // contributes a U+00AD codepoint without `\` or `-` ever
        // appearing in the run). When `pending` is non-empty, the run
        // can't be captured by a single `slice[start..end]` so we
        // switch to a `String`-buffered flush path. `pending_source_start`
        // remembers the first source byte that fed `pending` so the
        // emitted Inline's span still covers the full source extent
        // (including the consumed `\-` bytes).
        // NOTE: the `\<` escape below feeds `pending` in the same way.
        let mut pending: String = String::new();
        let mut pending_source_start: Option<usize> = None;
        // `i` is the scan cursor; `text_start` is where the not-yet-flushed
        // literal text run begins. Branches that leave `text_start` alone
        // keep the bytes they step over inside that literal run.
        let mut i = from;
        let mut text_start = from;
        while i < bytes.len() {
            let c = bytes[i];
            if c == b'\\' {
                if i + 1 < bytes.len() && bytes[i + 1] == b'\\' {
                    // `\\` -> hard break: flush the text before it, then
                    // emit an empty-text `HardBreak` spanning both bytes.
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    out.push(Inline {
                        kind: InlineKind::HardBreak,
                        text: String::new(),
                        span: self.span(base + i, base + i + 2),
                        label_span: None,
                    });
                    i += 2;
                    text_start = i;
                    continue;
                }
                if i + 1 < bytes.len() && bytes[i + 1] == b'-' {
                    // `\-` -> literal U+00AD soft hyphen. Splice the
                    // preceding slice text into `pending`, append the
                    // SHY codepoint, skip both source bytes. Remember
                    // the earliest source byte covered by `pending` so
                    // the eventual flush spans the original `\-` bytes
                    // instead of collapsing to a zero-width range.
                    if pending_source_start.is_none() {
                        pending_source_start = Some(text_start);
                    }
                    pending.push_str(&slice[text_start..i]);
                    pending.push('\u{AD}');
                    i += 2;
                    text_start = i;
                    continue;
                }
                if i + 1 < bytes.len() && bytes[i + 1] == b'<' {
                    // `\<` -> literal `<`. Lets authors write angle-bracket text
                    // (e.g. an HTML tag like `<head>`) in prose and headings
                    // without `<...>` being read as label syntax. The heading
                    // label scanners skip an escaped `<` to match.
                    if pending_source_start.is_none() {
                        pending_source_start = Some(text_start);
                    }
                    pending.push_str(&slice[text_start..i]);
                    pending.push('<');
                    i += 2;
                    text_start = i;
                    continue;
                }
                // Backslash followed by anything other than `\`, `-`, or `<`
                // is left to fall through as a literal `\` byte (the
                // slice path picks it up at the next flush). A lone
                // trailing `\` at end-of-input gets a warning so the
                // author notices a likely-incomplete escape; a `\`
                // followed by some other character is kept silent
                // because the previous behaviour was "backslash is
                // literal", and emitting a diagnostic for every
                // `C:\Temp` / `\*foo*` / etc. would be noisy.
                if i + 1 >= bytes.len() {
                    self.diagnostics.push(self.warn(
                        &codes::MOS0038,
                        "lone trailing `\\` is not a recognized escape; treated as literal text",
                        base + i,
                        base + i + 1,
                    ));
                }
                i += 1;
                continue;
            }
            if c == b'*' {
                let run_len = star_run_len(bytes, i);
                // A star run that satisfies the delimiter this segment is
                // waiting for ends the segment. Only `width` stars are
                // consumed; any remaining stars are left for the caller.
                if let Some(delimiter) = close
                    && delimiter_closes(delimiter, run_len)
                {
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    let width = delimiter.width();
                    return ParsedSegment {
                        inlines: out,
                        next: i + width,
                        closed: Some(ClosedDelimiter { end: i + width }),
                    };
                }

                // Otherwise the run opens a nested segment: two or more
                // stars open strong, a single star opens emphasis.
                let delimiter = if run_len >= 2 {
                    Delimiter::Strong
                } else {
                    Delimiter::Emphasis
                };
                // Remember how many diagnostics exist so that those raised
                // by a failed speculative parse can be rolled back.
                let diagnostic_checkpoint = self.diagnostics.len();
                let parsed = self.parse_inline_segment(
                    slice,
                    base,
                    i + delimiter.width(),
                    style.with(delimiter),
                    Some(delimiter),
                );

                if let Some(closed) = parsed.closed {
                    // The nested run closed: flush the text before the
                    // opener, then splice in the children with the outer
                    // spans stretched over the opening/closing stars.
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    let mut children = parsed.inlines;
                    widen_span_to_delimiters(&mut children, base + i, base + closed.end);
                    out.extend(children);
                    i = parsed.next;
                    text_start = i;
                    continue;
                }

                // The nested run never closed. Drop its diagnostics (the
                // same bytes are re-scanned at this level below) and only
                // report the unterminated opener when no enclosing
                // delimiter is pending.
                self.diagnostics.truncate(diagnostic_checkpoint);
                if close.is_none() {
                    self.warn_unterminated_delimiter(slice, base, i, delimiter);
                }
                // `text_start` is untouched, so the opener stars stay in
                // the literal text run.
                i += delimiter.width();
                continue;
            }
            if c == b'`' {
                if let Some(end) = find_byte(bytes, b'`', i + 1) {
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    // The code text excludes both backticks; the span
                    // includes them.
                    out.push(Inline {
                        kind: InlineKind::Code,
                        text: slice[i + 1..end].to_owned(),
                        span: self.span(base + i, base + end + 1),
                        label_span: None,
                    });
                    i = end + 1;
                    text_start = i;
                    continue;
                }
                let mut diagnostic = self.warn(
                    &codes::MOS0034,
                    "unterminated `` `code` `` run; treated as text",
                    base + i,
                    base + i + 1,
                );
                if let Some(insertion) = Self::code_closing_insertion(slice, i, close) {
                    let insertion = base + insertion;
                    diagnostic = diagnostic
                        .with_suggestion(Suggestion::new(self.span(insertion, insertion), "`"));
                }
                self.diagnostics.push(diagnostic);
                // The lone backtick stays part of the literal text run.
                i += 1;
                continue;
            }
            if c == b'@' {
                let id_end = scan_label_chars(bytes, i + 1);
                if id_end > i + 1 {
                    // `@page(label)`: a page reference. Only a *well-formed*
                    // `@page(` + label + `)` takes this branch; anything else
                    // (`@page` alone, an unterminated `@page(`, `@pages`) falls
                    // through to the ordinary `@label` reference path below.
                    if &slice[i + 1..id_end] == "page"
                        && id_end < bytes.len()
                        && bytes[id_end] == b'('
                    {
                        let label_start = id_end + 1;
                        let label_end = scan_label_chars(bytes, label_start);
                        if label_end > label_start
                            && label_end < bytes.len()
                            && bytes[label_end] == b')'
                        {
                            self.flush_styled_text_with_pending(
                                &mut out,
                                slice,
                                base,
                                text_start,
                                i,
                                style,
                                &mut pending,
                                &mut pending_source_start,
                            );
                            out.push(Inline {
                                kind: InlineKind::PageReference,
                                text: slice[label_start..label_end].to_owned(),
                                span: self.span(base + i, base + label_end + 1),
                                // The label identifier between `@page(` and `)`.
                                label_span: Some(self.span(base + label_start, base + label_end)),
                            });
                            i = label_end + 1;
                            text_start = i;
                            continue;
                        }
                    }
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    out.push(Inline {
                        kind: InlineKind::Reference,
                        text: slice[i + 1..id_end].to_owned(),
                        span: self.span(base + i, base + id_end),
                        // The label identifier after the `@` sigil.
                        label_span: Some(self.span(base + i + 1, base + id_end)),
                    });
                    i = id_end;
                    text_start = i;
                    continue;
                }
                self.diagnostics.push(self.warn(
                    &codes::MOS0036,
                    "stray `@` is not followed by a label identifier; treated as text",
                    base + i,
                    base + i + 1,
                ));
                i += 1;
                continue;
            }
            if c == b'[' && i + 1 < bytes.len() && bytes[i + 1] == b'@' {
                // `[@key]`: citation. Only enter the citation branch
                // once we have seen `[@`, so a bare `[` keeps its
                // current literal-text behaviour and never warns.
                let key_start = i + 2;
                let key_end = scan_label_chars(bytes, key_start);
                if key_end > key_start && key_end < bytes.len() && bytes[key_end] == b']' {
                    self.flush_styled_text_with_pending(
                        &mut out,
                        slice,
                        base,
                        text_start,
                        i,
                        style,
                        &mut pending,
                        &mut pending_source_start,
                    );
                    let end = key_end + 1;
                    out.push(Inline {
                        kind: InlineKind::Citation,
                        text: slice[key_start..key_end].to_owned(),
                        span: self.span(base + i, base + end),
                        label_span: None,
                    });
                    i = end;
                    text_start = i;
                    continue;
                }
                // Either the key was empty, the `]` was missing, or
                // the body uses a not-yet-supported form (`[@a; @b]`,
                // prefix/suffix). Warn once and *consume* the
                // citation-candidate extent so the trailing `@key`
                // bytes don't fall back through to the `@`-reference
                // branch; that would surface a bogus MOS0033 in the
                // resolver for what was syntactically a malformed
                // citation, not an unknown label.
                //
                // Recovery extent:
                // * if a `]` exists later in this inline slice,
                //   consume up to and including it (covers
                //   `[@a; @b]`, `[@see @key, p. 33]`, `[@]`);
                // * otherwise skip past `[@` only (covers truly
                //   unterminated `[@key…` at end of paragraph) and
                //   let the bare key chars settle as literal text.
                let recovery_end = if let Some(close) = find_byte(bytes, b']', key_start) {
                    close + 1
                } else {
                    key_start
                };
                self.diagnostics.push(self.warn(
                    &codes::MOS0039,
                    "malformed citation `[@…]`; expected `[@key]`; treated as text",
                    base + i,
                    base + recovery_end,
                ));
                // `text_start` is untouched, so the skipped bytes remain in
                // the literal text run.
                i = recovery_end;
                continue;
            }
            i += 1;
        }
        // End of slice: flush whatever literal text is left. No closing
        // delimiter was seen, hence `closed: None`.
        self.flush_styled_text_with_pending(
            &mut out,
            slice,
            base,
            text_start,
            bytes.len(),
            style,
            &mut pending,
            &mut pending_source_start,
        );
        ParsedSegment {
            inlines: out,
            next: bytes.len(),
            closed: None,
        }
    }
432
433    /// Flush `slice[from..to]` (possibly prefixed by buffered `pending`
434    /// text from earlier escape expansions like `\-` → U+00AD) into a
435    /// single styled-text inline. The span covers the full source range
436    /// from the earliest byte that fed `pending` (or `from` when pending
437    /// is empty) through `to`, so emitted inlines whose text includes
438    /// expanded escapes still carry a span covering the original source
439    /// bytes: including the consumed `\-` markers.
440    #[allow(
441        clippy::too_many_arguments,
442        reason = "transitional: extends the existing `flush_styled_text` (7-arg) with a buffered-text channel and a pending-source-start tracker for escape expansion. Bundling the slice/base/style triple into a context struct would churn every call site in `parse_inline_segment` for no net clarity."
443    )]
444    fn flush_styled_text_with_pending(
445        &self,
446        out: &mut Vec<Inline>,
447        slice: &str,
448        base: usize,
449        from: usize,
450        to: usize,
451        style: InlineStyle,
452        pending: &mut String,
453        pending_source_start: &mut Option<usize>,
454    ) {
455        if pending.is_empty() {
456            // Defensive: pending_source_start should always be paired
457            // with a non-empty pending. Clear it anyway so a future
458            // escape that splices into `pending` starts from a fresh
459            // state.
460            *pending_source_start = None;
461            self.flush_styled_text(out, slice, base, from, to, style);
462            return;
463        }
464        let mut text = std::mem::take(pending);
465        if from < to {
466            text.push_str(&slice[from..to]);
467        }
468        let span_from = pending_source_start.take().unwrap_or(from);
469        out.push(Inline {
470            kind: style.kind(),
471            text,
472            span: self.span(base + span_from, base + to),
473            label_span: None,
474        });
475    }
476
477    fn flush_styled_text(
478        &self,
479        out: &mut Vec<Inline>,
480        slice: &str,
481        base: usize,
482        from: usize,
483        to: usize,
484        style: InlineStyle,
485    ) {
486        if from < to {
487            out.push(Inline {
488                kind: style.kind(),
489                text: slice[from..to].to_owned(),
490                span: self.span(base + from, base + to),
491                label_span: None,
492            });
493        }
494    }
495
496    fn warn_unterminated_delimiter(
497        &mut self,
498        slice: &str,
499        base: usize,
500        i: usize,
501        delimiter: Delimiter,
502    ) {
503        let (def, message) = match delimiter {
504            Delimiter::Strong => (
505                &codes::MOS0028,
506                "unterminated `**strong**` run; treated as text",
507            ),
508            Delimiter::Emphasis => (
509                &codes::MOS0031,
510                "unterminated `*emphasis*` run; treated as text",
511            ),
512        };
513        let mut diagnostic = self.warn(def, message, base + i, base + i + delimiter.width());
514        if let Some(suggestion) = self.closing_delimiter_suggestion(slice, base, i, delimiter) {
515            diagnostic = diagnostic.with_suggestion(suggestion);
516        }
517        self.diagnostics.push(diagnostic);
518    }
519
520    fn closing_delimiter_suggestion(
521        &self,
522        slice: &str,
523        base: usize,
524        i: usize,
525        delimiter: Delimiter,
526    ) -> Option<Suggestion> {
527        let after_opener = i + delimiter.width();
528        if slice.as_bytes()[after_opener..].contains(&b'*') {
529            return None;
530        }
531        let insertion = base + slice.len();
532        Some(Suggestion::new(
533            self.span(insertion, insertion),
534            delimiter.closing_text(),
535        ))
536    }
537
538    fn code_closing_insertion(slice: &str, i: usize, close: Option<Delimiter>) -> Option<usize> {
539        let bytes = slice.as_bytes();
540        let mut cursor = i + 1;
541        while cursor < bytes.len() {
542            if bytes[cursor] == b'*' {
543                let run_len = star_run_len(bytes, cursor);
544                if close.is_some_and(|delimiter| delimiter_closes(delimiter, run_len)) {
545                    return Some(cursor);
546                }
547                return None;
548            }
549            cursor += 1;
550        }
551        Some(bytes.len())
552    }
553}
554
/// Count the consecutive `*` bytes in `bytes` starting at index `from`.
///
/// Returns 0 when `from` is at or past the end of `bytes`, or when the
/// byte at `from` is not a star.
fn star_run_len(bytes: &[u8], from: usize) -> usize {
    bytes
        .iter()
        .skip(from)
        .take_while(|&&byte| byte == b'*')
        .count()
}
562
563fn delimiter_closes(delimiter: Delimiter, run_len: usize) -> bool {
564    match delimiter {
565        Delimiter::Strong => run_len >= 2,
566        Delimiter::Emphasis => run_len % 2 == 1,
567    }
568}
569
570fn widen_span_to_delimiters(inlines: &mut [Inline], start: usize, end: usize) {
571    if let Some(first) = inlines.first_mut() {
572        first.span.set_start(start);
573    }
574    if let Some(last) = inlines.last_mut() {
575        last.span.set_end(end);
576    }
577}