Skip to main content

mos_parse/
support.rs

//! Internal parser support helpers.
2
3/// One marker line captured during list collection. Not user-facing --
4/// the public AST uses [`crate::ListItem`] after nesting is resolved.
5#[derive(Debug, Clone, Copy)]
6pub(super) struct RawListLine {
7    /// Byte count of ASCII spaces before the marker.
8    pub(super) indent: usize,
9    /// `true` for `\d+\. `, `false` for `- `.
10    pub(super) ordered: bool,
11    /// Byte offset (into `Parser::src`) of the first content byte
12    /// after the marker and its trailing whitespace.
13    pub(super) content_start: usize,
14    /// Byte offset of the line's content end (excluding any `\r\n` or
15    /// `\n` terminator).
16    pub(super) content_end: usize,
17    /// Byte offset of the start of the line (the first leading-space
18    /// byte). Used for the item's `SourceSpan`.
19    pub(super) line_start: usize,
20}
21
22/// If the line that starts at `pos` opens with a list marker, return
23/// `Some((indent, ordered, content_start))`. `indent` counts the
24/// leading ASCII spaces before the marker; `ordered` is `true` for
25/// `\d+\. ` and `false` for `- `; `content_start` is the byte offset
26/// of the first byte after the marker plus its trailing whitespace
27/// run. Tabs are not recognised as either indent or post-marker
28/// whitespace in MVP 0.
29pub(super) fn list_marker_at(bytes: &[u8], pos: usize) -> Option<(usize, bool, usize)> {
30    let mut i = pos;
31    let mut indent = 0_usize;
32    while i < bytes.len() && bytes[i] == b' ' {
33        indent += 1;
34        i += 1;
35    }
36    if i >= bytes.len() || bytes[i] == b'\n' || bytes[i] == b'\r' {
37        return None;
38    }
39    if bytes[i] == b'-' {
40        let after = i + 1;
41        if after >= bytes.len() {
42            return None;
43        }
44        if bytes[after] != b' ' && bytes[after] != b'\t' {
45            return None;
46        }
47        let mut j = after;
48        while j < bytes.len() && (bytes[j] == b' ' || bytes[j] == b'\t') {
49            j += 1;
50        }
51        return Some((indent, false, j));
52    }
53    if bytes[i].is_ascii_digit() {
54        let mut j = i;
55        while j < bytes.len() && bytes[j].is_ascii_digit() {
56            j += 1;
57        }
58        if j >= bytes.len() || bytes[j] != b'.' {
59            return None;
60        }
61        let after = j + 1;
62        if after >= bytes.len() {
63            return None;
64        }
65        if bytes[after] != b' ' && bytes[after] != b'\t' {
66            return None;
67        }
68        let mut k = after;
69        while k < bytes.len() && (bytes[k] == b' ' || bytes[k] == b'\t') {
70            k += 1;
71        }
72        return Some((indent, true, k));
73    }
74    None
75}
76
77/// Skip ASCII whitespace (space, tab, CR, LF) inside a `#set` body.
78pub(super) fn skip_set_ws(bytes: &[u8], from: usize, end: usize) -> usize {
79    let mut i = from;
80    while i < end && matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
81        i += 1;
82    }
83    i
84}
85
86/// Advance to the next `,` or end-of-body, used for error recovery
87/// inside directive argument parsing.
88pub(super) fn skip_to_comma(bytes: &[u8], from: usize, end: usize) -> usize {
89    let mut i = from;
90    while i < end && bytes[i] != b',' {
91        i += 1;
92    }
93    i
94}
95
96/// Return the byte offset of the next character boundary at or after
97/// `from + 1`. Used to step over a single Unicode scalar value when
98/// accumulating string literal contents.
99pub(super) fn next_char_boundary(src: &str, from: usize) -> usize {
100    let mut i = from + 1;
101    while i < src.len() && !src.is_char_boundary(i) {
102        i += 1;
103    }
104    i
105}
106
107pub(super) fn find_byte(haystack: &[u8], needle: u8, from: usize) -> Option<usize> {
108    haystack[from..]
109        .iter()
110        .position(|&b| b == needle)
111        .map(|p| p + from)
112}
113
114/// Returns the byte offset just past the longest label-identifier run
115/// that starts at `from` in `bytes`. Empty (caller should detect via
116/// `id_end == from`) if the first byte is not a valid identifier char.
117///
118/// The accepted alphabet matches manifest ยง3.3 examples:
119/// `[A-Za-z0-9_:.-]`. Critically `:` is included so `fig:wells` and
120/// `eq:bayes` round-trip.
121pub(super) fn scan_label_chars(bytes: &[u8], from: usize) -> usize {
122    let mut i = from;
123    while i < bytes.len() {
124        let b = bytes[i];
125        let is_id = b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b':' | b'.');
126        if !is_id {
127            break;
128        }
129        i += 1;
130    }
131    i
132}
133
134pub(super) fn normalize_raw_text(text: &str) -> String {
135    let text = text
136        .strip_prefix("\r\n")
137        .or_else(|| text.strip_prefix('\n'))
138        .or_else(|| text.strip_prefix('\r'))
139        .unwrap_or(text);
140    text.replace("\r\n", "\n").replace('\r', "\n")
141}
142
143pub(super) struct ParsedLabel {
144    pub text: String,
145    pub start: usize,
146    pub end: usize,
147}
148
149/// If the substring `src[start..end]` begins with optional ASCII
150/// whitespace followed by `<label>`, return `(label_body_start, Some(id))`
151/// where `label_body_start` is the offset just past the closing `>`
152/// (with any trailing whitespace also consumed). Otherwise return
153/// `(start, None)`.
154///
155/// Only a single leading label is recognised; further `<...>` runs in
156/// the body are left intact for downstream stages.
157pub(super) fn strip_leading_label(
158    src: &str,
159    start: usize,
160    end: usize,
161) -> (usize, Option<ParsedLabel>) {
162    let bytes = src.as_bytes();
163    let mut i = start;
164    while i < end && (bytes[i] == b' ' || bytes[i] == b'\t') {
165        i += 1;
166    }
167    if i >= end || bytes[i] != b'<' {
168        return (start, None);
169    }
170    let id_start = i + 1;
171    let id_end = scan_label_chars(bytes, id_start);
172    if id_end == id_start || id_end >= end || bytes[id_end] != b'>' {
173        return (start, None);
174    }
175    let label = ParsedLabel {
176        text: src[id_start..id_end].to_owned(),
177        start: id_start,
178        end: id_end,
179    };
180    let mut after = id_end + 1;
181    while after < end && (bytes[after] == b' ' || bytes[after] == b'\t' || bytes[after] == b'\n') {
182        after += 1;
183    }
184    (after, Some(label))
185}
186
187/// If the substring `src[start..end]` ends with `<label>` (after any
188/// trailing ASCII whitespace), return `(text_end, Some(id))` where
189/// `text_end` is the offset of the first byte to *exclude* from the
190/// preceding text -- trailing whitespace before the label is also
191/// trimmed. Otherwise return `(end, None)`.
192pub(super) fn strip_trailing_label(
193    src: &str,
194    start: usize,
195    end: usize,
196) -> (usize, Option<ParsedLabel>) {
197    let bytes = src.as_bytes();
198    if end <= start || bytes[end - 1] != b'>' {
199        return (end, None);
200    }
201    let close = end - 1;
202    // Walk back over identifier chars to find the matching `<`.
203    let mut i = close;
204    while i > start {
205        let b = bytes[i - 1];
206        let is_id = b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b':' | b'.');
207        if !is_id {
208            break;
209        }
210        i -= 1;
211    }
212    if i == close || i == start || bytes[i - 1] != b'<' {
213        return (end, None);
214    }
215    // A `\<` opens literal angle-bracket text (e.g. an HTML tag), not a label.
216    if is_escaped(bytes, start, i - 1) {
217        return (end, None);
218    }
219    let label = ParsedLabel {
220        text: src[i..close].to_owned(),
221        start: i,
222        end: close,
223    };
224    let mut text_end = i - 1;
225    while text_end > start && (bytes[text_end - 1] == b' ' || bytes[text_end - 1] == b'\t') {
226        text_end -= 1;
227    }
228    (text_end, Some(label))
229}
230
231/// Locate the first `<label>` token in `src[start..end]`, returning the parsed
232/// label and the byte offset just past its `>`. Unlike [`strip_trailing_label`]
233/// this finds a label *anywhere* in the range, so the heading parser can detect
234/// a label that is not the trailing element (e.g. `= Title <id> trailing`) and
235/// flag it (`MOS0048`) instead of silently swallowing it into the text.
236pub(super) fn locate_label(src: &str, start: usize, end: usize) -> Option<(ParsedLabel, usize)> {
237    let bytes = src.as_bytes();
238    let mut open = start;
239    while open < end {
240        if bytes[open] == b'<' {
241            let mut i = open + 1;
242            while i < end {
243                let b = bytes[i];
244                if b == b'>' {
245                    break;
246                }
247                if !(b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b':' | b'.')) {
248                    break;
249                }
250                i += 1;
251            }
252            // A real label has at least one identifier byte and a closing `>`,
253            // and its `<` must not be escaped (`\<` opens literal angle-bracket
254            // text such as an HTML tag, not label syntax).
255            if i < end && bytes[i] == b'>' && i > open + 1 && !is_escaped(bytes, start, open) {
256                let label = ParsedLabel {
257                    text: src[open + 1..i].to_owned(),
258                    start: open + 1,
259                    end: i,
260                };
261                return Some((label, i + 1));
262            }
263        }
264        open += 1;
265    }
266    None
267}
268
/// Report whether the byte at `pos` is escaped, i.e. directly preceded by an
/// odd number of consecutive `\` bytes.
///
/// Only bytes at or after `start` are inspected, so backslashes before
/// `start` never count. Backslashes pair up (`\\` is a hard break, not an
/// escape), which is why an even-length run leaves the byte unescaped. The
/// label scanners rely on this so that `\<` opens literal `<` text instead
/// of a label.
fn is_escaped(bytes: &[u8], start: usize, pos: usize) -> bool {
    if pos <= start {
        return false;
    }
    let backslashes = bytes[start..pos]
        .iter()
        .rev()
        .take_while(|&&b| b == b'\\')
        .count();
    backslashes % 2 == 1
}