Skip to main content

mos_layout/
word.rs

1use mos_fonts::{EmbeddedFontId, Font, ShapedGlyph, WordSubRun, shape_with_fallback, text_width};
2
3#[derive(Clone, Debug)]
4pub(crate) struct Word {
5    pub(crate) text: String,
6    pub(crate) actual_text: Option<String>,
7    /// Primary face -- the style-resolved choice from the active
8    /// `FontFamily` (regular/bold/italic/monospace). Used for line
9    /// metrics (ascent/descent), inter-word spacing, and
10    /// character-wise hyphenation width estimates. Per-glyph fallback
11    /// faces (e.g. Noto Sans Math for `<=`) live inside [`Word::subruns`].
12    pub(crate) font: Font,
13    pub(crate) size_pt: f32,
14    /// Pre-computed advance width -- populated when the word is
15    /// constructed in `collect_words` (sum of `subruns[i].advance_pt`)
16    /// so the line-breaker doesn't re-measure on every comparison.
17    pub(crate) width_pt: f32,
18    /// Per-glyph-fallback sub-runs produced by `shape_with_fallback`.
19    /// One sub-run per contiguous source span that shares a face;
20    /// each carries its own font + text slice + glyph stream with
21    /// cluster offsets rebased to its local text. `flush_line` emits
22    /// one [`crate::TextRun`] per sub-run, advancing the x cursor by
23    /// `subrun.advance_pt` between them. For Base14 primary faces
24    /// the result is always a single sub-run with empty `glyphs`
25    /// (no fallback target -- Base14 emit path uses `WinAnsi`-byte
26    /// strings instead).
27    pub(crate) subruns: Vec<WordSubRun>,
28    /// Byte offsets into `text` where the source contained a U+00AD
29    /// soft hyphen. The SHY codepoints are stripped before shaping
30    /// (`split_soft_hyphens` in the layout crate); these offsets mark
31    /// the cluster boundaries where the author permits a line break.
32    /// The greedy breaker consults them via [`try_shy_break`] when a
33    /// word would otherwise overflow the line: the chosen prefix gets
34    /// a visible `-` appended and the suffix continues as the next
35    /// word. The Knuth-Plass cutover will use the same offsets as
36    /// flagged Penalty(50) items for optimal (non-greedy) selection.
37    pub(crate) shy_break_offsets: Vec<usize>,
38}
39
40/// Result of splitting a [`Word`] at one of its SHY break offsets.
41/// `prefix.text` already includes a trailing U+002D HYPHEN-MINUS and
42/// its `width_pt` is the post-shape advance sum (including the
43/// hyphen). `suffix.text` carries the remaining bytes with
44/// `shy_break_offsets` rebased to the suffix's local indexing and
45/// boundary offsets (0 / `len`) dropped.
46#[derive(Clone, Debug)]
47pub(crate) struct ShyBreak {
48    pub(crate) prefix: Word,
49    pub(crate) suffix: Word,
50}
51
52/// Try to break `word` at the latest SHY offset whose prefix-plus-
53/// visible-hyphen fits in `max_prefix_width`. Returns `None` if no
54/// valid offset fits. Offsets equal to `0` or `word.text.len()`
55/// (leading / trailing SHY) are ignored, matching the rule that a
56/// break must produce a non-empty visible prefix and a non-empty
57/// suffix. Consecutive duplicate offsets (e.g. `a\u{AD}\u{AD}b` →
58/// `[1, 1]`) are deduped on the fly.
59///
60/// Re-shapes both halves through [`shape_with_fallback`] so the
61/// resulting [`Word::width_pt`] matches the post-shape subrun
62/// advances exactly; the cheap [`text_width`] estimate used during
63/// the fit search may differ from the shaped width when fallback
64/// splits the run, so the shaped sum is the authoritative value and
65/// is re-checked against `max_prefix_width` before the candidate is
66/// accepted.
67pub(crate) fn try_shy_break(
68    word: &Word,
69    max_prefix_width: f32,
70    fallbacks: &[EmbeddedFontId],
71) -> Option<ShyBreak> {
72    if word.shy_break_offsets.is_empty() {
73        return None;
74    }
75    let text_len = word.text.len();
76    // Walk offsets right-to-left so the first candidate that fits is
77    // the latest fitting break (greedy = prefer longer prefix).
78    let mut seen: Option<usize> = None;
79    for &off in word.shy_break_offsets.iter().rev() {
80        if off == 0 || off >= text_len {
81            continue;
82        }
83        if seen == Some(off) {
84            continue;
85        }
86        seen = Some(off);
87        let Some(prefix_src) = word.text.get(..off) else {
88            continue;
89        };
90        let mut prefix_text = String::with_capacity(prefix_src.len() + 1);
91        prefix_text.push_str(prefix_src);
92        prefix_text.push('-');
93        let prefix_subruns = shape_with_fallback(word.font, fallbacks, word.size_pt, &prefix_text);
94        let prefix_width: f32 = prefix_subruns.iter().map(|s| s.advance_pt).sum();
95        if prefix_width > max_prefix_width {
96            // Rounding pushed the shaped width just over; try the
97            // next-smaller candidate rather than emit an overflow.
98            continue;
99        }
100        let Some(suffix_src) = word.text.get(off..) else {
101            continue;
102        };
103        let suffix_text = suffix_src.to_owned();
104        let suffix_len = suffix_text.len();
105        let suffix_offsets: Vec<usize> = word
106            .shy_break_offsets
107            .iter()
108            .filter_map(|&o| {
109                if o > off {
110                    let rebased = o - off;
111                    if rebased > 0 && rebased < suffix_len {
112                        Some(rebased)
113                    } else {
114                        None
115                    }
116                } else {
117                    None
118                }
119            })
120            .collect();
121        let suffix_subruns = shape_with_fallback(word.font, fallbacks, word.size_pt, &suffix_text);
122        let suffix_width: f32 = suffix_subruns.iter().map(|s| s.advance_pt).sum();
123        let prefix = Word {
124            text: prefix_text,
125            actual_text: None,
126            font: word.font,
127            size_pt: word.size_pt,
128            width_pt: prefix_width,
129            subruns: prefix_subruns,
130            // The hyphenated side has committed a break already; no
131            // further SHY breaks live on it.
132            shy_break_offsets: Vec::new(),
133        };
134        let suffix = Word {
135            text: suffix_text,
136            actual_text: None,
137            font: word.font,
138            size_pt: word.size_pt,
139            width_pt: suffix_width,
140            subruns: suffix_subruns,
141            shy_break_offsets: suffix_offsets,
142        };
143        return Some(ShyBreak { prefix, suffix });
144    }
145    None
146}
147
148/// Inline item emitted by `collect_words`. The greedy line-breaker
149/// (and, later, the Knuth-Plass breaker) walks the stream and emits
150/// page geometry; `HardBreak` is a sentinel that forces a flush of
151/// the in-progress line without contributing any glyphs.
152#[derive(Clone, Debug)]
153pub(crate) enum WordItem {
154    Word(Word),
155    HardBreak,
156}
157
/// Remove every U+00AD (soft hyphen) from `text`.
///
/// Returns the text without its soft hyphens together with one byte
/// offset per removed SHY. Offsets index the *returned* string, not
/// the input, and are listed in source order. Each one is the
/// position right after the cluster that preceded the SHY, so
/// breaking at offset `o` keeps bytes `[0..o)` on the previous line
/// and moves `[o..)` to the next one. Adjacent soft hyphens yield the
/// same offset more than once.
///
/// The greedy line-breaker uses these offsets via [`try_shy_break`]
/// when a word would overflow; the Knuth-Plass cutover models each
/// one as a flagged Penalty(50) item whose post-break width is the
/// hyphen glyph's advance.
///
/// `text` is expected to be NFC-normalized. NFC leaves U+00AD intact,
/// so there are no decomposed look-alike sequences to account for.
pub(crate) fn split_soft_hyphens(text: &str) -> (String, Vec<usize>) {
    const SOFT_HYPHEN: char = '\u{AD}';

    let mut stripped = String::with_capacity(text.len());
    let mut offsets = Vec::new();

    // `split` always yields at least one piece; every later piece was
    // preceded by exactly one soft hyphen in the input.
    let mut pieces = text.split(SOFT_HYPHEN);
    if let Some(first) = pieces.next() {
        stripped.push_str(first);
    }
    for piece in pieces {
        offsets.push(stripped.len());
        stripped.push_str(piece);
    }

    (stripped, offsets)
}
187
188pub(crate) fn word_clusters(word: &Word) -> Vec<WordSubRun> {
189    let mut clusters = Vec::new();
190    for sub in &word.subruns {
191        if sub.glyphs.is_empty() {
192            for ch in sub.text.chars() {
193                let mut text = String::new();
194                text.push(ch);
195                clusters.push(WordSubRun {
196                    font: sub.font,
197                    advance_pt: text_width(sub.font, word.size_pt, &text),
198                    text,
199                    glyphs: Vec::new(),
200                });
201            }
202            continue;
203        }
204
205        let mut i = 0;
206        while i < sub.glyphs.len() {
207            let cluster = sub.glyphs[i].cluster;
208            let mut j = i + 1;
209            while j < sub.glyphs.len() && sub.glyphs[j].cluster == cluster {
210                j += 1;
211            }
212            let start = usize::try_from(cluster).unwrap_or(usize::MAX);
213            let end = if j < sub.glyphs.len() {
214                usize::try_from(sub.glyphs[j].cluster).unwrap_or(usize::MAX)
215            } else {
216                sub.text.len()
217            };
218            debug_assert!(start <= end && end <= sub.text.len());
219            let Some(text) = sub.text.get(start..end) else {
220                i = j;
221                continue;
222            };
223            let shift = u32::try_from(start).unwrap_or(u32::MAX);
224            let glyphs: Vec<_> = sub.glyphs[i..j]
225                .iter()
226                .map(|g| ShapedGlyph {
227                    cluster: g.cluster.saturating_sub(shift),
228                    ..*g
229                })
230                .collect();
231            clusters.push(WordSubRun {
232                font: sub.font,
233                text: text.to_owned(),
234                advance_pt: glyphs_advance_pt(sub.font, word.size_pt, &glyphs),
235                glyphs,
236            });
237            i = j;
238        }
239    }
240    clusters
241}
242
243fn glyphs_advance_pt(font: Font, size_pt: f32, glyphs: &[ShapedGlyph]) -> f32 {
244    let upem = match font {
245        Font::Embedded(id) => f32::from(id.data().units_per_em),
246        Font::Base14(_) => 1000.0,
247    };
248    // Sign-preserving conversion lives in mos-fonts to keep the
249    // two crates from drifting on hmtx semantics.
250    glyphs
251        .iter()
252        .map(|g| mos_fonts::advance_units_to_pt(g.advance_units, size_pt, upem))
253        .sum()
254}
255
#[cfg(test)]
mod tests {
    use super::{Word, split_soft_hyphens, try_shy_break};
    use mos_fonts::{Base14Font, Font, WordSubRun, shape_with_fallback, text_width};

    /// Build a 12 pt Helvetica (Base14) word with no fallback faces
    /// and the given soft-hyphen offsets.
    fn make_shy_word(text: &str, offsets: Vec<usize>) -> Word {
        let font = Font::Base14(Base14Font::Helvetica);
        let size_pt = 12.0;
        let subruns: Vec<WordSubRun> = shape_with_fallback(font, &[], size_pt, text);
        let width_pt: f32 = subruns.iter().map(|run| run.advance_pt).sum();
        Word {
            text: text.to_owned(),
            actual_text: None,
            font,
            size_pt,
            width_pt,
            subruns,
            shy_break_offsets: offsets,
        }
    }

    #[test]
    fn split_soft_hyphens_no_op_when_absent() {
        let (stripped, offsets) = split_soft_hyphens("hello");
        assert_eq!(stripped, "hello");
        assert!(offsets.is_empty());
    }

    #[test]
    fn split_soft_hyphens_records_offsets_in_stripped_text() {
        // Offsets are measured in the output ("supercalifragil"),
        // i.e. at the seam between the clusters that surrounded
        // each SHY in the input.
        let (stripped, offsets) = split_soft_hyphens("super\u{AD}cali\u{AD}fragil");
        assert_eq!(stripped, "supercalifragil");
        assert_eq!(offsets, vec![5, 9]);
    }

    #[test]
    fn split_soft_hyphens_handles_consecutive_shy() {
        // Nothing separates the two SHYs in the output, so both map
        // to the same offset.
        let (stripped, offsets) = split_soft_hyphens("a\u{AD}\u{AD}b");
        assert_eq!(stripped, "ab");
        assert_eq!(offsets, vec![1, 1]);
    }

    #[test]
    fn try_shy_break_returns_none_when_no_offsets() {
        let plain = make_shy_word("hello", Vec::new());
        let outcome = try_shy_break(&plain, 1000.0, &[]);
        assert!(outcome.is_none());
    }

    #[test]
    fn try_shy_break_picks_latest_offset_that_fits() {
        // Breaks after "super" (5) and after "supercali" (9). With
        // plenty of room both fit, and the later one is preferred.
        let word = make_shy_word("supercalifragil", vec![5, 9]);
        let split = try_shy_break(&word, 1000.0, &[]).expect("must split");
        assert_eq!(split.prefix.text, "supercali-");
        assert_eq!(split.suffix.text, "fragil");
        assert!(split.suffix.shy_break_offsets.is_empty());
    }

    #[test]
    fn try_shy_break_falls_back_to_earlier_offset_when_latest_overflows() {
        // Room for "super-" plus half a point: "supercali-" is too
        // wide, so the earlier break is taken.
        let word = make_shy_word("supercalifragil", vec![5, 9]);
        let budget = text_width(word.font, word.size_pt, "super-") + 0.5;
        let split = try_shy_break(&word, budget, &[]).expect("must split");
        assert_eq!(split.prefix.text, "super-");
        assert_eq!(split.suffix.text, "califragil");
        // The unused break at 9 survives on the suffix as 9 - 5 = 4.
        assert_eq!(split.suffix.shy_break_offsets, vec![4]);
    }

    #[test]
    fn try_shy_break_ignores_leading_and_trailing_offsets() {
        // 0 and len() are boundary offsets and never valid breaks.
        let word = make_shy_word("foo", vec![0, 3]);
        let outcome = try_shy_break(&word, 1000.0, &[]);
        assert!(outcome.is_none());
    }

    #[test]
    fn try_shy_break_returns_none_when_no_break_fits() {
        // One point cannot hold even the shortest prefix + hyphen.
        let word = make_shy_word("supercalifragil", vec![5, 9]);
        let outcome = try_shy_break(&word, 1.0, &[]);
        assert!(outcome.is_none());
    }

    #[test]
    fn try_shy_break_dedupes_consecutive_duplicate_offsets() {
        // `a\u{AD}\u{AD}b` records [1, 1]: one break listed twice,
        // which must still resolve to a single split.
        let word = make_shy_word("ab", vec![1, 1]);
        let split = try_shy_break(&word, 1000.0, &[]).expect("must split");
        assert_eq!(split.prefix.text, "a-");
        assert_eq!(split.suffix.text, "b");
    }
}
358}