Skip to main content

mos_layout/
boundary.rs

1//! Page boundary signatures (issue #70; design note
2//! `docs/incremental-dependencies.md` §4.5, §6, §7).
3//!
4//! Reflow and fixpoint work (manifest §33) needs to detect *where* pagination
5//! changed between two layout runs without re-diffing whole pages or
6//! serializing PDF output. A [`PageBoundarySignature`] is a compact, stable
7//! digest of one laid-out [`Page`]'s break-defining content; a
8//! [`PageGraphSignature`] is the ordered per-page list, so the first index
9//! where two graphs disagree is exactly where the page breaks diverged.
10//!
11//! This is the §4.5 `PageOutputHash` ("did the laid-out page actually change?")
12//! reduced to the layout primitives that exist today (text runs and image
13//! placements). It is identity/comparison only: no `DepNode` graph, no
14//! `CacheKey` wiring, no reflow loop; those consume these signatures later.
15//!
16//! # What feeds a signature
17//!
18//! Per page, in order: the page number, the quantized page box, then each text
19//! run (quantized position + size, a backend-neutral font identity, text) and
20//! each image placement (intrinsic pixel dimensions + quantized rectangle). Run
21//! and image counts are folded too, so adding or removing one shifts the digest.
22//!
23//! Deliberately **excluded**, per the determinism rules (§5) and the §4.2/§4.3
24//! carve-outs:
25//!
26//! - **Shaped glyphs** on a run: derived from text + font + shaper, so folding
27//!   them would bind the signature to a transcoder version it should not care
28//!   about. The authored text and font identity stand in for them.
29//! - **The PDF resource name** of a font (`F1`..): a backend emitter slot;
30//!   layout must not depend on it, so a backend-neutral font identity is folded
31//!   instead (see `font_identity`).
32//! - **Decoded image pixels** (`rgb8`): an asset-content concern (§4.3),
33//!   addressed by the asset's own hash, not the page boundary.
34//! - **`resolved_path`** on an image handle: an absolute filesystem path, which
35//!   §5 rule 1 forbids from any hash.
36//! - **`handle.id`**: assigned in image-encounter order, so folding it would
37//!   churn unrelated pages' signatures when an image is added earlier; the
38//!   intrinsic pixel dimensions identify the asset's footprint instead.
39//!
40//! # Quantization
41//!
42//! Every `f32` dimension is snapped to the 1/64-pt grid (§6) before folding, so
43//! two layouts that reach the same length through slightly different arithmetic
44//! agree. The design note specifies an `i32` count of 1/64 pt; we snap to the
45//! same grid and fold the canonical bit pattern of the integral count instead
46//! (see [`quantize_units`]), which is equivalent for hashing and avoids a
47//! lint-denied float-to-int cast. The §9.5 quantization newtype can later adopt
48//! the `i32` form without changing this boundary's observable behavior.
49
50use mos_core::{ContentHash, ContentHasher};
51use mos_fonts::{Base14Font, EmbeddedFontId, Font};
52
53use crate::types::{ImagePlacement, LayoutResult, Page, PageGraph, TextRun};
54
55/// Domain tag separating this boundary from every other `H(...)` (§4).
56const PAGE_DOMAIN: &[u8] = b"mos-layout/page-boundary/v1";
57
58/// A backend-neutral, stable identity for a font face.
59///
60/// Deliberately *not* `Font::pdf_resource_name` (`F1`..): that is a PDF emitter
61/// resource slot, and a layout signature must not depend on backend layout. The
62/// returned tag is owned by this boundary and stable forever; the exhaustive
63/// match means a new bundled face fails to compile here until it is assigned a
64/// tag, so the identity can never silently alias.
65fn font_identity(font: Font) -> &'static [u8] {
66    match font {
67        Font::Base14(Base14Font::Helvetica) => b"b14/helvetica",
68        Font::Base14(Base14Font::HelveticaBold) => b"b14/helvetica-bold",
69        Font::Base14(Base14Font::HelveticaOblique) => b"b14/helvetica-oblique",
70        Font::Base14(Base14Font::HelveticaBoldOblique) => b"b14/helvetica-boldoblique",
71        Font::Base14(Base14Font::TimesRoman) => b"b14/times-roman",
72        Font::Base14(Base14Font::TimesBold) => b"b14/times-bold",
73        Font::Base14(Base14Font::TimesItalic) => b"b14/times-italic",
74        Font::Base14(Base14Font::TimesBoldItalic) => b"b14/times-bolditalic",
75        Font::Base14(Base14Font::Courier) => b"b14/courier",
76        Font::Base14(Base14Font::CourierBold) => b"b14/courier-bold",
77        Font::Base14(Base14Font::CourierOblique) => b"b14/courier-oblique",
78        Font::Base14(Base14Font::CourierBoldOblique) => b"b14/courier-boldoblique",
79        Font::Base14(Base14Font::Symbol) => b"b14/symbol",
80        Font::Base14(Base14Font::ZapfDingbats) => b"b14/zapfdingbats",
81        Font::Embedded(EmbeddedFontId::Regular) => b"emb/noto-sans-regular",
82        Font::Embedded(EmbeddedFontId::Bold) => b"emb/noto-sans-bold",
83        Font::Embedded(EmbeddedFontId::Italic) => b"emb/noto-sans-italic",
84        Font::Embedded(EmbeddedFontId::BoldItalic) => b"emb/noto-sans-bolditalic",
85        Font::Embedded(EmbeddedFontId::Mono) => b"emb/noto-sans-mono",
86        Font::Embedded(EmbeddedFontId::Math) => b"emb/noto-sans-math",
87    }
88}
89
90/// Snap a point measurement to the 1/64-pt grid (§6) and return the canonical
91/// bit pattern of that integral count.
92///
93/// Snapping first means two dimensions within one shaper unit fold equal.
94/// Dividing is avoided entirely: `(pt * 64).round()` is already the integral
95/// count, and while that count stays below `2^24` (i.e. `pt < 262144`, ~five
96/// orders of magnitude above any real page coordinate) it is represented
97/// exactly by `f32`, so its bit pattern is a canonical, collision-free encoding
98/// of that integer. Past `2^24` consecutive counts would alias: out of the
99/// domain layout produces, but the bound the `i32` form (§9.5) would lift.
100/// `-0.0`, `NaN`, and non-finite values normalize to `0` so they cannot create
101/// spurious distinctions.
102fn quantize_units(pt: f32) -> u32 {
103    let units = (pt * 64.0).round();
104    let canonical = if units.is_finite() && units != 0.0 {
105        units
106    } else {
107        0.0
108    };
109    canonical.to_bits()
110}
111
112/// The number of items folded as a count, saturating so an absurd length cannot
113/// silently wrap.
114fn fold_count(hasher: &mut ContentHasher, len: usize) {
115    hasher.u32(u32::try_from(len).unwrap_or(u32::MAX));
116}
117
118fn fold_run(hasher: &mut ContentHasher, run: &TextRun) {
119    hasher
120        .u32(quantize_units(run.x_pt))
121        .u32(quantize_units(run.baseline_from_top_pt))
122        .u32(quantize_units(run.size_pt))
123        .field(font_identity(run.font))
124        .field(run.text.as_bytes());
125    // Optional `/ActualText`: fold a presence flag so `Some("")` and `None`
126    // stay distinct, then the bytes when present.
127    match &run.actual_text {
128        Some(actual) => {
129            hasher.u32(1).field(actual.as_bytes());
130        }
131        None => {
132            hasher.u32(0);
133        }
134    }
135    // `glyphs` excluded: derived from text + font + shaper (§4.2 carve-out).
136}
137
138fn fold_image(hasher: &mut ContentHasher, image: &ImagePlacement) {
139    hasher
140        // Intrinsic pixel dimensions identify the asset's layout footprint.
141        .u32(image.handle.pixel_width)
142        .u32(image.handle.pixel_height)
143        .u32(quantize_units(image.x_pt))
144        .u32(quantize_units(image.top_from_top_pt))
145        .u32(quantize_units(image.width_pt))
146        .u32(quantize_units(image.height_pt));
147    // `handle.id` is excluded: it is assigned in image-encounter order
148    // (`intern_image` uses `image_handles.len()`), so folding it would shift
149    // later images' signatures when an unrelated image is added earlier,
150    // wrecking the page-locality `first_divergence` relies on. `handle.rgb8`
151    // (asset content, §4.3) and `handle.resolved_path` (absolute path, §5
152    // rule 1) are excluded too.
153}
154
155/// A compact, deterministic digest of one laid-out [`Page`]'s break-defining
156/// content (design note §4.5).
157///
158/// Equal for identical pagination of identical content, different when a page's
159/// content or its break changes. It is the cache slot's staleness check, not a
160/// human-readable description: use [`content_hash`](Self::content_hash) to read
161/// the underlying value.
162///
163/// # Examples
164///
165/// ```
166/// use mos_layout::{LayoutEngine, PageBoundarySignature};
167/// use mos_core::Document;
168/// use std::path::PathBuf;
169///
170/// let doc = Document::new(PathBuf::from("doc.mos"));
171/// let result = LayoutEngine::new().layout(&doc);
172/// // Re-signing the same page yields the same signature.
173/// for page in &result.graph.pages {
174///     assert_eq!(
175///         PageBoundarySignature::of_page(page),
176///         PageBoundarySignature::of_page(page),
177///     );
178/// }
179/// ```
180#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
181pub struct PageBoundarySignature(ContentHash);
182
183impl PageBoundarySignature {
184    /// Compute the boundary signature of one laid-out page.
185    #[must_use]
186    pub fn of_page(page: &Page) -> Self {
187        let mut hasher = ContentHasher::new();
188        hasher
189            .field(PAGE_DOMAIN)
190            .u32(page.number)
191            .u32(quantize_units(page.width_pt))
192            .u32(quantize_units(page.height_pt));
193        fold_count(&mut hasher, page.runs.len());
194        for run in &page.runs {
195            fold_run(&mut hasher, run);
196        }
197        fold_count(&mut hasher, page.images.len());
198        for image in &page.images {
199            fold_image(&mut hasher, image);
200        }
201        Self(hasher.finish())
202    }
203
204    /// The underlying content hash.
205    #[must_use]
206    pub const fn content_hash(self) -> ContentHash {
207        self.0
208    }
209}
210
211/// The ordered per-page boundary signatures of a whole [`PageGraph`].
212///
213/// Comparing two graph signatures answers both "did pagination change?"
214/// (inequality) and "where?" ([`first_divergence`](Self::first_divergence)).
215///
216/// # Examples
217///
218/// ```
219/// use mos_layout::{LayoutEngine, PageGraphSignature};
220/// use mos_core::Document;
221/// use std::path::PathBuf;
222///
223/// let doc = Document::new(PathBuf::from("doc.mos"));
224/// let result = LayoutEngine::new().layout(&doc);
225/// let signature = PageGraphSignature::of_graph(&result.graph);
226/// // An unchanged layout signs identically and diverges nowhere.
227/// assert_eq!(signature, PageGraphSignature::of_graph(&result.graph));
228/// assert_eq!(signature.first_divergence(&signature), None);
229/// ```
230#[derive(Clone, Eq, PartialEq, Hash, Debug, Default)]
231pub struct PageGraphSignature(Vec<PageBoundarySignature>);
232
233impl PageGraphSignature {
234    /// Sign every page of `graph`, in page order.
235    #[must_use]
236    pub fn of_graph(graph: &PageGraph) -> Self {
237        Self(
238            graph
239                .pages
240                .iter()
241                .map(PageBoundarySignature::of_page)
242                .collect(),
243        )
244    }
245
246    /// The per-page signatures, in page order.
247    #[must_use]
248    pub fn pages(&self) -> &[PageBoundarySignature] {
249        &self.0
250    }
251
252    /// The index of the first page whose signature differs from `other`, or
253    /// [`None`] if the two are identical.
254    ///
255    /// When one graph is a prefix of the other (a page was added or removed at
256    /// the end), the divergence is the length of the shorter graph; the first
257    /// page index that exists in one but not the other.
258    ///
259    /// # Examples
260    ///
261    /// ```
262    /// use mos_layout::PageGraphSignature;
263    ///
264    /// let empty = PageGraphSignature::default();
265    /// assert_eq!(empty.first_divergence(&empty), None);
266    /// ```
267    #[must_use]
268    pub fn first_divergence(&self, other: &Self) -> Option<usize> {
269        self.0
270            .iter()
271            .zip(other.0.iter())
272            .position(|(a, b)| a != b)
273            .or_else(|| (self.0.len() != other.0.len()).then_some(self.0.len().min(other.0.len())))
274    }
275}
276
277impl LayoutResult {
278    /// The page boundary signatures of this layout's [`PageGraph`] (design note
279    /// §4.5). Convenience for cache/reflow consumers.
280    #[must_use]
281    pub fn page_boundary_signatures(&self) -> PageGraphSignature {
282        PageGraphSignature::of_graph(&self.graph)
283    }
284}
285
286#[cfg(test)]
287mod tests {
288    use std::sync::Arc;
289
290    use mos_fonts::{Base14Font, Font, ShapedGlyph};
291
292    use super::{PageBoundarySignature, PageGraphSignature, quantize_units};
293    use crate::types::{ImageHandle, ImagePlacement, Page, PageGraph, TextRun};
294
295    fn run(text: &str, x_pt: f32) -> TextRun {
296        TextRun {
297            x_pt,
298            baseline_from_top_pt: 100.0,
299            size_pt: 11.0,
300            font: Font::Base14(Base14Font::Helvetica),
301            text: text.to_owned(),
302            actual_text: None,
303            glyphs: Vec::new(),
304        }
305    }
306
307    fn page(number: u32, runs: Vec<TextRun>) -> Page {
308        Page {
309            number,
310            width_pt: 595.276,
311            height_pt: 841.89,
312            runs,
313            images: Vec::new(),
314        }
315    }
316
317    fn graph(pages: Vec<Page>) -> PageGraph {
318        PageGraph {
319            pages,
320            images: Vec::new(),
321        }
322    }
323
324    #[test]
325    fn unchanged_page_signs_identically() {
326        let a = page(1, vec![run("hello", 68.0), run("world", 110.0)]);
327        let b = page(1, vec![run("hello", 68.0), run("world", 110.0)]);
328        assert_eq!(
329            PageBoundarySignature::of_page(&a),
330            PageBoundarySignature::of_page(&b),
331        );
332    }
333
334    #[test]
335    fn changed_content_changes_the_signature() {
336        let base = page(1, vec![run("hello", 68.0)]);
337        let edited_text = page(1, vec![run("hallo", 68.0)]);
338        let moved = page(1, vec![run("hello", 70.0)]);
339        let extra = page(1, vec![run("hello", 68.0), run("more", 68.0)]);
340
341        let base_sig = PageBoundarySignature::of_page(&base);
342        assert_ne!(base_sig, PageBoundarySignature::of_page(&edited_text));
343        assert_ne!(base_sig, PageBoundarySignature::of_page(&moved));
344        assert_ne!(base_sig, PageBoundarySignature::of_page(&extra));
345    }
346
347    #[test]
348    fn page_number_is_folded() {
349        // Identical content on a differently-numbered page must sign
350        // differently; the page number is part of the boundary.
351        let runs = || vec![run("same", 68.0)];
352        assert_ne!(
353            PageBoundarySignature::of_page(&page(1, runs())),
354            PageBoundarySignature::of_page(&page(2, runs())),
355        );
356    }
357
358    #[test]
359    fn sub_shaper_unit_position_changes_are_ignored() {
360        // Two positions within one 1/64-pt unit snap to the same grid cell.
361        let base = page(1, vec![run("x", 68.0)]);
362        let nudged = page(1, vec![run("x", 68.0 + 0.001)]);
363        assert_eq!(
364            PageBoundarySignature::of_page(&base),
365            PageBoundarySignature::of_page(&nudged),
366        );
367    }
368
369    #[test]
370    fn quantize_snaps_to_one_sixty_fourth_point() {
371        // Within a grid cell: equal. Across a cell boundary: distinct.
372        assert_eq!(quantize_units(10.0), quantize_units(10.001));
373        assert_ne!(quantize_units(10.0), quantize_units(10.5));
374        // -0.0, +0.0, and non-finite values all normalize to the zero cell so
375        // they never create spurious distinctions.
376        assert_eq!(quantize_units(0.0), quantize_units(-0.0));
377        assert_eq!(quantize_units(f32::NAN), quantize_units(0.0));
378        assert_eq!(quantize_units(f32::INFINITY), quantize_units(0.0));
379        assert_eq!(quantize_units(f32::NEG_INFINITY), quantize_units(0.0));
380    }
381
382    #[test]
383    fn shaped_glyphs_do_not_affect_the_signature() {
384        // Glyphs are derived from text + font, so two runs that differ only in
385        // their shaped-glyph stream must sign the same.
386        let plain = page(1, vec![run("hi", 68.0)]);
387        let mut shaped_run = run("hi", 68.0);
388        shaped_run.glyphs = vec![ShapedGlyph {
389            gid: 42,
390            advance_units: 500,
391            x_offset_units: 0,
392            y_offset_units: 0,
393            cluster: 0,
394        }];
395        let shaped = page(1, vec![shaped_run]);
396        assert_eq!(
397            PageBoundarySignature::of_page(&plain),
398            PageBoundarySignature::of_page(&shaped),
399        );
400    }
401
402    #[test]
403    fn image_content_path_and_unstable_id_are_excluded_but_footprint_is_not() {
404        let place = |handle: ImageHandle| Page {
405            number: 1,
406            width_pt: 595.276,
407            height_pt: 841.89,
408            runs: Vec::new(),
409            images: vec![ImagePlacement {
410                handle,
411                x_pt: 68.0,
412                top_from_top_pt: 100.0,
413                width_pt: 200.0,
414                height_pt: 150.0,
415            }],
416        };
417        let handle = |id: u32, path: &str, rgb8: &[u8]| ImageHandle {
418            id,
419            resolved_path: path.to_owned(),
420            pixel_width: 2,
421            pixel_height: 1,
422            rgb8: Arc::from(rgb8.to_vec()),
423        };
424        // Different encounter-order id, different absolute path, and different
425        // decoded bytes, but the same layout footprint: the signature must not
426        // change. None of those belong in a deterministic, locality-preserving
427        // page boundary.
428        let a = place(handle(7, "/home/alice/fig.png", &[1, 2, 3, 4, 5, 6]));
429        let b = place(handle(8, "/home/bob/fig.png", &[9, 9, 9, 9, 9, 9]));
430        assert_eq!(
431            PageBoundarySignature::of_page(&a),
432            PageBoundarySignature::of_page(&b),
433        );
434        // The intrinsic pixel size *is* part of the footprint (the signature
435        // reads dimensions, not the pixel buffer), so a differently-sized asset
436        // signs differently.
437        let mut resized = handle(7, "/home/alice/fig.png", &[1, 2, 3, 4, 5, 6]);
438        resized.pixel_width = 4;
439        assert_ne!(
440            PageBoundarySignature::of_page(&a),
441            PageBoundarySignature::of_page(&place(resized)),
442        );
443    }
444
445    #[test]
446    fn graph_signature_localizes_a_pagination_change() {
447        // Page 1 unchanged; page 2 gains a run (a break moved). The graph
448        // signatures must diverge first at index 1.
449        let before = graph(vec![
450            page(1, vec![run("a", 68.0)]),
451            page(2, vec![run("b", 68.0)]),
452        ]);
453        let after = graph(vec![
454            page(1, vec![run("a", 68.0)]),
455            page(2, vec![run("b", 68.0), run("c", 110.0)]),
456        ]);
457        let before_sig = PageGraphSignature::of_graph(&before);
458        let after_sig = PageGraphSignature::of_graph(&after);
459
460        assert_ne!(before_sig, after_sig);
461        assert_eq!(before_sig.first_divergence(&after_sig), Some(1));
462        assert_eq!(before_sig.first_divergence(&before_sig), None);
463    }
464
465    #[test]
466    fn graph_signature_flags_an_added_trailing_page() {
467        let short = graph(vec![page(1, vec![run("a", 68.0)])]);
468        let long = graph(vec![
469            page(1, vec![run("a", 68.0)]),
470            page(2, vec![run("b", 68.0)]),
471        ]);
472        let short_sig = PageGraphSignature::of_graph(&short);
473        let long_sig = PageGraphSignature::of_graph(&long);
474        // The shared page 0 matches; divergence is the new trailing index.
475        assert_eq!(short_sig.first_divergence(&long_sig), Some(1));
476        assert_eq!(short_sig.pages().len(), 1);
477        assert_eq!(long_sig.pages().len(), 2);
478    }
479}