Skip to main content

mos_pdf/
lib.rs

1//! PDF backend for Mosaic (manifest §21.1).
2//!
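//! Emits a fixed-A4 PDF declaring all 14 standard PDF base fonts
//! (Helvetica/Times/Courier × 4 + Symbol + `ZapfDingbats`). For Base14
//! text no font data ships; every glyph outline is supplied by the PDF
//! reader's built-in Core 14 implementations. Runs set in an embedded
//! face (`Font::Embedded`) take a separate path: the face is subset and
//! written as a Type 0 font with its own `FontFile2` stream.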
7//!
8//! For each Latin Core 14 face actually used, the backend plans a
9//! per-document `/Encoding` dict that layers a `/Differences` array
10//! on top of `WinAnsiEncoding` to reach the 99 extended glyphs each
11//! AFM carries beyond `WinAnsi` (`Ł`, `ł`, `Ě`, `ě`, `Ő`, `ő`, the
12//! Romanian comma-below set, math operators `−≤≥≠√∂∑∆◊`, `fi`/`fl`).
13//! A matching `/ToUnicode` `CMap` is emitted so the bytes we mint
14//! decode back to real Unicode in copy/paste and search.
15//!
16//! See the private `encoding` module for the planner. PDF/A, tagged
17//! PDF, hyperlinks, bookmarks, and full font embedding (issue #9)
18//! are deferred.
19
20#![doc(
21    html_logo_url = "https://mosaic.kjanat.dev/assets/A4.svg",
22    html_favicon_url = "https://mosaic.kjanat.dev/assets/A4.svg"
23)]
24
25mod content;
26mod embedded;
27mod encoding;
28mod images;
29
30use std::collections::HashMap;
31use std::path::Path;
32
33use mos_core::{CoreError, Diagnostic, Result, codes};
34use mos_fonts::EmbeddedFontId;
35use mos_layout::{Base14Font, Font, PageGraph, TextRun};
36use pdf_writer::types::{SystemInfo, UnicodeCmap};
37use pdf_writer::writers::Encoding;
38use pdf_writer::{Finish, Name, Pdf, Rect, Ref, Str, TextStr};
39
40use crate::embedded::{EmbeddedFontPlan, EmbeddedRefs};
41use crate::encoding::{DocEncoding, EncodingPlanner};
42
/// Producer/creator stamp written into the PDF Info dictionary.
///
/// Identifies Mosaic as the PDF's producing application, written to the
/// Info dictionary `/Producer` and `/Creator` so a built PDF traces back
/// to the compiler that produced it (the way ffmpeg/Word/Adobe stamp
/// theirs). A compile-time constant, so output stays byte-for-byte
/// deterministic: no wall-clock, host, path, or user data leaks in. The
/// version is taken from `CARGO_PKG_VERSION` at build time, so it follows
/// the package version automatically.
///
/// Follow-up (intentionally deferred to keep this stamp deterministic):
/// - `/CreationDate` + `/ModDate` driven by a `SOURCE_DATE_EPOCH`-style
///   deterministic input (UTC, stable `D:YYYYMMDDHHmmSS'+00'00'` format).
/// - An XMP metadata packet (catalog `/Metadata`) for PDF/A / Adobe tooling,
///   kept in sync with this Info dict.
const PRODUCER: &str = concat!("Mosaic ", env!("CARGO_PKG_VERSION"));
56
/// Document-level metadata that gets written to the PDF Info
/// dictionary. Populated by the lowerer from `#set document(...)`.
/// The `language` field is captured but not yet emitted (it belongs in
/// the catalog `/Lang` entry, which is the next slice).
///
/// Every field is optional; an absent field simply produces no
/// corresponding Info entry.
///
/// # Examples
///
/// ```
/// use mos_pdf::PdfMetadata;
///
/// let metadata = PdfMetadata {
///     title: Some("Demo".to_owned()),
///     author: Some("Mosaic".to_owned()),
///     language: Some("en".to_owned()),
/// };
///
/// assert_eq!(metadata.title.as_deref(), Some("Demo"));
/// ```
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    /// Document title; written as the Info dictionary title when `Some`.
    pub title: Option<String>,
    /// Document author; written as the Info dictionary author when `Some`.
    pub author: Option<String>,
    /// Document language tag. Stored only: the PDF builder in this file
    /// does not read it yet.
    pub language: Option<String>,
}
81
82/// Emit `graph` as a PDF file at `out`. Creates `out`'s parent
83/// directory if it doesn't already exist.
84///
85/// Returns any diagnostics raised during PDF emission; currently
86/// only `MOS0032` (per-font extended-glyph budget exhausted). Layout
87/// diagnostics flow through [`mos_layout::LayoutResult::diagnostics`]
88/// separately; callers (the CLI) typically render both.
89///
90/// # Errors
91///
92/// Returns a wrapped [`Diagnostic`] if writing the file (or creating
93/// its parent directory) fails.
94///
95/// # Examples
96///
97/// ```no_run
98/// use std::path::Path;
99///
100/// use mos_layout::PageGraph;
101/// use mos_pdf::PdfMetadata;
102///
103/// let graph = PageGraph::default();
104/// let metadata = PdfMetadata::default();
105/// let diagnostics = mos_pdf::emit(&graph, &metadata, Path::new("build/main.pdf"))?;
106///
107/// assert!(diagnostics.is_empty());
108/// # Ok::<(), mos_core::CoreError>(())
109/// ```
110pub fn emit(graph: &PageGraph, metadata: &PdfMetadata, out: &Path) -> Result<Vec<Diagnostic>> {
111    let (bytes, diagnostics) = build_pdf(graph, metadata)?;
112    if let Some(parent) = out.parent()
113        && !parent.as_os_str().is_empty()
114    {
115        std::fs::create_dir_all(parent).map_err(|err| {
116            io_diagnostic(format!(
117                "could not create output directory `{}`: {err}",
118                mos_core::display_path(parent)
119            ))
120        })?;
121    }
122    std::fs::write(out, bytes).map_err(|err| {
123        io_diagnostic(format!(
124            "could not write PDF to `{}`: {err}",
125            mos_core::display_path(out)
126        ))
127    })?;
128    Ok(diagnostics)
129}
130
131fn io_diagnostic(message: String) -> CoreError {
132    CoreError::Diagnostic(Box::new(Diagnostic::simple(&codes::MOS0014, None, message)))
133}
134
/// Build the PDF bytes from `graph`. Pulled out of [`emit`] so tests
/// can round-trip without touching the filesystem. Returns the bytes
/// plus any encoding diagnostics (currently `MOS0032` for Base14
/// `/Differences` overflow). Kept `pub(crate)`; the public surface
/// is [`emit`].
///
/// Indirect object numbers are handed out sequentially by a local
/// `alloc` closure, in this fixed order: catalog, page tree, Info, one
/// ref per Base14 face, custom encoding + `/ToUnicode` pairs, embedded
/// font clusters, images, then a (page, content) pair per page. The
/// emit passes further down walk the same ordered collections, so two
/// builds of the same graph number and write their objects identically.
///
/// `metadata.title` and `metadata.author` are written to the Info
/// dictionary when present; `metadata.language` is not read here.
///
/// # Errors
///
/// Returns an error if font subsetting fails for any embedded face
/// (only with corrupted font data; the bundled cuts have been
/// verified), or if building a page's content stream fails.
pub(crate) fn build_pdf(
    graph: &PageGraph,
    metadata: &PdfMetadata,
) -> Result<(Vec<u8>, Vec<Diagnostic>)> {
    // Phase 1a: scan every run and plan per-face Base14 /Differences
    // encodings (embedded-font runs are skipped: they take the Type 0
    // CID path below).
    let mut planner = EncodingPlanner::new();
    for page in &graph.pages {
        planner.observe_runs(&page.runs);
    }
    let mut diagnostics: Vec<Diagnostic> = Vec::new();
    let encodings = planner.finalize(&mut diagnostics);

    // Phase 1b: subset every embedded face actually used. One plan
    // per face referenced; absent if the face never appears in `runs`.
    // Only embedded-font runs need cloning into the flat slice the
    // planner consumes: Base14 runs would be filtered out by
    // `plan_embedded` anyway, so cloning them up front is pure waste
    // for documents where Base14 dominates.
    let embedded_runs: Vec<TextRun> = graph
        .pages
        .iter()
        .flat_map(|p| p.runs.iter())
        .filter(|r| matches!(r.font, Font::Embedded(_)))
        .cloned()
        .collect();
    let embedded_plans: Vec<EmbeddedFontPlan> = embedded::plan_embedded(&embedded_runs)?;
    // Lookup table handed to the content-stream builder so it can find
    // the plan for an embedded face by id.
    let embedded_by_id: HashMap<EmbeddedFontId, &EmbeddedFontPlan> =
        embedded_plans.iter().map(|p| (p.id, p)).collect();

    // Phase 2: emit. Refs allocated up front so the page tree, font
    // dicts, encoding dicts, FontFile2 streams, and ToUnicode streams
    // can cross-reference.
    let mut pdf = Pdf::new();
    // Object numbers start at 1 and grow by one per `alloc()` call.
    let mut next_id: i32 = 1;
    let mut alloc = || {
        let id = Ref::new(next_id);
        next_id += 1;
        id
    };

    let catalog_id = alloc();
    let page_tree_id = alloc();
    let info_id = alloc();

    // One indirect ref per Base14 face, in the order published by
    // `Font::ALL_BASE14`. Always all 14 entries so every page's
    // resource dictionary is identical for Base14: preserves byte
    // stability for Base14-only documents.
    let base14_refs: Vec<(Font, Ref)> = Font::ALL_BASE14.iter().map(|f| (*f, alloc())).collect();

    // For each Latin face that needs a `/Differences` map, pre-allocate
    // the indirect refs for the custom encoding dict and the
    // `/ToUnicode` CMap stream. Symbol/Dingbats and unused faces get
    // no extra refs. Iterate `Font::ALL_BASE14` (not `&encodings`) so
    // the `alloc()` order, and therefore the byte layout of the
    // produced PDF, is deterministic across runs.
    let mut encoding_refs: HashMap<Font, (Ref, Ref)> = HashMap::new();
    for font in Font::ALL_BASE14 {
        if let Some(enc) = encodings.get(&font)
            && enc.has_differences()
        {
            let enc_ref = alloc();
            let cmap_ref = alloc();
            encoding_refs.insert(font, (enc_ref, cmap_ref));
        }
    }

    // One set of 5 refs per embedded face actually referenced.
    let embedded_refs: HashMap<EmbeddedFontId, EmbeddedRefs> = embedded_plans
        .iter()
        .map(|plan| {
            (
                plan.id,
                EmbeddedRefs {
                    font: alloc(),
                    cid_font: alloc(),
                    descriptor: alloc(),
                    font_file: alloc(),
                    to_unicode: alloc(),
                },
            )
        })
        .collect();

    // Allocate one indirect ref per unique image. Compression itself
    // happens at emit time (see the loop below) so we don't hold every
    // compressed stream in memory simultaneously: `graph.images` is
    // already the deduped set, and an image-heavy document can blow
    // peak RAM if we buffer all compressed copies before writing them.
    let image_refs: Vec<Ref> = graph.images.iter().map(|_| alloc()).collect();

    // Per page: (page object ref, content stream ref), in page order.
    let page_refs: Vec<(Ref, Ref)> = graph.pages.iter().map(|_| (alloc(), alloc())).collect();

    pdf.catalog(catalog_id).pages(page_tree_id);

    // `/Count` is written as an `i32`; a page count that does not fit
    // is clamped to `i32::MAX` instead of failing the build.
    let page_count = i32::try_from(page_refs.len()).unwrap_or(i32::MAX);
    pdf.pages(page_tree_id)
        .kids(page_refs.iter().map(|(p, _)| *p))
        .count(page_count);

    for (page, (page_id, content_id)) in graph.pages.iter().zip(page_refs.iter()) {
        let mut page_obj = pdf.page(*page_id);
        page_obj.media_box(Rect::new(0.0, 0.0, page.width_pt, page.height_pt));
        page_obj.parent(page_tree_id);
        page_obj.contents(*content_id);
        {
            let mut resources = page_obj.resources();
            {
                let mut fonts = resources.fonts();
                for (face, font_id) in &base14_refs {
                    fonts.pair(Name(face.pdf_resource_name()), *font_id);
                }
                // Embedded faces actually referenced in this document.
                // Each page's resource dict lists every embedded face used
                // anywhere in the document, not just on this page, so
                // resource dicts stay identical across pages. Iterate
                // `EmbeddedFontId::ALL` for deterministic order.
                for id in EmbeddedFontId::ALL {
                    if let Some(refs) = embedded_refs.get(&id) {
                        fonts.pair(Name(id.pdf_resource_name()), refs.font);
                    }
                }
            }
            // Image XObjects. Every page lists every image referenced
            // anywhere in the document so resource dicts stay byte-
            // stable across pages: same pattern as the font dicts.
            if !graph.images.is_empty() {
                let mut x_objects = resources.x_objects();
                for (handle, image_id) in graph.images.iter().zip(image_refs.iter()) {
                    let name = images::resource_name(handle);
                    x_objects.pair(Name(name.as_bytes()), *image_id);
                }
            }
        }
        page_obj.finish();

        // The page dictionary is finished above; the content stream is
        // written as its own object right after it.
        let stream_bytes =
            content::build_content_stream(page.height_pt, page, &encodings, &embedded_by_id)?;
        pdf.stream(*content_id, &stream_bytes);
    }

    // Emit each Image XObject. Order matches `graph.images` (and
    // therefore the `alloc()` order above), keeping byte output
    // deterministic. Each image is compressed in this loop and the
    // compressed buffer dropped at the end of the iteration, so peak
    // memory holds at most one compressed image at a time on top of
    // the (Arc-shared) decoded pixel buffer the handle already owns.
    for (handle, id) in graph.images.iter().zip(image_refs.iter()) {
        let compressed = images::flate_compress(&handle.rgb8);
        images::emit_image_xobject(&mut pdf, *id, handle, &compressed);
    }

    // Emit one Type 1 font dictionary per Base14 face.
    for (face, font_id) in &base14_refs {
        let Some(base14) = face.base14() else {
            continue;
        };
        let mut font_dict = pdf.type1_font(*font_id);
        font_dict.base_font(Name(face.pdf_base_name().as_bytes()));
        // Symbol and ZapfDingbats use their own PostScript encodings;
        // overriding to WinAnsi would be a category error (Symbol's
        // `A` is Alpha). Skip /Encoding entirely for those.
        if matches!(base14, Base14Font::Symbol | Base14Font::ZapfDingbats) {
            continue;
        }
        match encoding_refs.get(face) {
            Some(&(enc_ref, cmap_ref)) => {
                // Custom /Encoding dict + /ToUnicode CMap (emitted
                // below at top level so the refs resolve).
                font_dict.pair(Name(b"Encoding"), enc_ref);
                font_dict.to_unicode(cmap_ref);
            }
            None => {
                // No extended glyphs needed for this face: the
                // standard WinAnsi shortcut suffices. PDF readers
                // default Type1 dicts to the font's built-in
                // encoding (StandardEncoding for Helvetica), so
                // declaring WinAnsi is required for bytes ≥ 0x80
                // (Euro, smart quotes, accented Latin, …) to render
                // the right glyph.
                font_dict.encoding_predefined(Name(b"WinAnsiEncoding"));
            }
        }
    }

    // Emit each embedded face's 5-object cluster (Type 0 + CIDFont +
    // descriptor + FontFile2 stream + ToUnicode CMap).
    for plan in &embedded_plans {
        let refs = embedded_refs[&plan.id];
        embedded::emit_embedded(&mut pdf, plan, refs);
    }

    // Emit the custom /Encoding dicts and /ToUnicode CMap streams.
    // Same `Font::ALL_BASE14` walk as the allocation pass above keeps
    // emit order deterministic.
    for font in Font::ALL_BASE14 {
        let Some(enc) = encodings.get(&font) else {
            continue;
        };
        let Some(&(enc_ref, cmap_ref)) = encoding_refs.get(&font) else {
            continue;
        };
        emit_encoding_dict(&mut pdf, enc_ref, enc);
        emit_to_unicode_cmap(&mut pdf, cmap_ref, enc);
    }

    {
        let mut info = pdf.document_info(info_id);
        if let Some(title) = metadata.title.as_deref() {
            info.title(TextStr(title));
        }
        if let Some(author) = metadata.author.as_deref() {
            info.author(TextStr(author));
        }
        // Provenance stamp: mark Mosaic as the producing application. Both
        // keys carry the same constant string, so this adds no wall-clock
        // or environment data and the output stays deterministic.
        info.producer(TextStr(PRODUCER));
        info.creator(TextStr(PRODUCER));
        info.finish();
    }

    Ok((pdf.finish(), diagnostics))
}
371
372/// Emits one PDF indirect object: a custom `/Encoding` dict with
373/// `/BaseEncoding /WinAnsiEncoding` and a `/Differences` array.
374/// `pdf-writer`'s `Differences::consecutive(start, names)` emits the
375/// run-length form `[ start /n1 /n2 /n3 ]`. We use one group per
376/// contiguous run for compactness; isolated slots get their own
377/// single-element group.
378fn emit_encoding_dict(pdf: &mut Pdf, id: Ref, enc: &DocEncoding) {
379    let mut enc_dict: Encoding<'_> = pdf.indirect(id).start();
380    enc_dict.base_encoding(Name(b"WinAnsiEncoding"));
381    {
382        let mut diffs = enc_dict.differences();
383        let mut i = 0;
384        while i < enc.differences.len() {
385            let (start, _) = enc.differences[i];
386            // Find the end of this contiguous run (slot[j] == slot[j-1] + 1).
387            let mut j = i + 1;
388            while j < enc.differences.len() && enc.differences[j].0 == enc.differences[j - 1].0 + 1
389            {
390                j += 1;
391            }
392            let names = enc.differences[i..j]
393                .iter()
394                .map(|(_, n)| Name(n.as_bytes()));
395            diffs.consecutive(start, names);
396            i = j;
397        }
398    }
399    enc_dict.finish();
400}
401
402/// Emits a `/ToUnicode` `CMap` stream that round-trips every byte
403/// used by `enc` back to its original Unicode codepoint, so
404/// copy-paste and full-text search work for both `WinAnsi` natives
405/// and `/Differences`-remapped slots.
406fn emit_to_unicode_cmap(pdf: &mut Pdf, id: Ref, enc: &DocEncoding) {
407    // The `SystemInfo` here is embedded inside the PostScript-y CMap
408    // stream content (the `%%BeginResource: CMap …` header that
409    // `UnicodeCmap::new` writes). The `/CMapName` and `/CIDSystemInfo`
410    // entries set further down go on the stream dictionary itself.
411    // both are required by PDF 1.7 §9.7.5.4 / §9.10.3 (pdf-writer
412    // documents `.name()` and `.system_info()` as "Required"), even
413    // though readers we've tested tolerate their absence because the
414    // PS content carries the same info.
415    let system_info = SystemInfo {
416        registry: Str(b"Adobe"),
417        ordering: Str(b"UCS"),
418        supplement: 0,
419    };
420    let mut cmap: UnicodeCmap<u8> = UnicodeCmap::new(Name(b"Adobe-Identity-UCS"), system_info);
421    for &(byte, ch) in &enc.to_unicode_entries {
422        cmap.pair(byte, ch);
423    }
424    let cmap_bytes = cmap.finish();
425    let mut cmap_writer = pdf.cmap(id, &cmap_bytes);
426    cmap_writer.name(Name(b"Adobe-Identity-UCS"));
427    cmap_writer.system_info(system_info);
428}
429
430#[cfg(test)]
431mod tests {
    // No `#![allow]` here. Tests that return `TestResult` (the two
    // filesystem-touching tests, `emit_writes_file` and
    // `emit_fails_with_mos0014_when_target_is_a_directory`, plus the
    // lopdf-based `metadata_and_provenance_appear_in_info_dictionary`)
    // report assertion failures via `?` / `ensure!` instead of
    // `panic!`. The rest return `()` and use plain `assert!`, which is
    // not covered by `clippy::panic`.
438    use std::error::Error;
439
440    use lopdf::{Document as LopdfDocument, Object};
441    use mos_layout::{Base14Font, Font, Page, PageGraph, TextRun};
442
443    use super::*;
444
445    // Explicit `std::result::Result` because the parent module
446    // imports `mos_core::Result` which only takes one type
447    // parameter.
448    type TestResult = std::result::Result<(), Box<dyn Error>>;
449
450    /// `assert!`-shaped helper that returns `Err` instead of
451    /// panicking, so `-> TestResult` bodies stay clippy-clean under
452    /// `clippy::panic_in_result_fn`. Mirrors the precedent in
453    /// `pdf-base14-metrics/tests/winansi_vendor.rs` and the
454    /// integration test at `tests/extended_latin_roundtrip.rs`.
455    macro_rules! ensure {
456        ($cond:expr, $($arg:tt)*) => {
457            if !$cond {
458                return Err(format!($($arg)*).into());
459            }
460        };
461    }
462
463    fn count_bytes(haystack: &[u8], needle: &[u8]) -> usize {
464        haystack
465            .windows(needle.len())
466            .filter(|w| *w == needle)
467            .count()
468    }
469
470    fn sample_graph() -> PageGraph {
471        PageGraph {
472            pages: vec![Page {
473                number: 1,
474                width_pt: 595.276_f32,
475                height_pt: 841.89_f32,
476                runs: vec![
477                    TextRun {
478                        x_pt: 68.0,
479                        baseline_from_top_pt: 100.0,
480                        size_pt: 20.0,
481                        font: Font::Base14(Base14Font::HelveticaBold),
482                        text: "Title".to_owned(),
483                        actual_text: None,
484                        glyphs: Vec::new(),
485                    },
486                    TextRun {
487                        x_pt: 68.0,
488                        baseline_from_top_pt: 130.0,
489                        size_pt: 11.0,
490                        font: Font::Base14(Base14Font::Helvetica),
491                        text: "Body".to_owned(),
492                        actual_text: None,
493                        glyphs: Vec::new(),
494                    },
495                ],
496                images: Vec::new(),
497            }],
498            images: Vec::new(),
499        }
500    }
501
502    fn info_string<'info>(
503        info: &'info lopdf::Dictionary,
504        key: &[u8],
505    ) -> std::result::Result<&'info str, Box<dyn Error>> {
506        let Object::String(bytes, _) = info.get(key)? else {
507            return Err(format!(
508                "expected Info key /{} to be a string",
509                String::from_utf8_lossy(key)
510            )
511            .into());
512        };
513        Ok(std::str::from_utf8(bytes)?)
514    }
515
516    #[test]
517    fn build_pdf_starts_with_pdf_header_and_ends_with_eof() {
518        let (bytes, diags) = build_pdf(&sample_graph(), &PdfMetadata::default()).unwrap();
519        assert!(bytes.starts_with(b"%PDF-"), "missing PDF header");
520        assert!(
521            bytes.windows(5).any(|w| w == b"%%EOF"),
522            "missing %%EOF marker"
523        );
524        assert!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
525    }
526
527    #[test]
528    fn build_pdf_embeds_text_runs_as_visible_strings() {
529        let (bytes, _) = build_pdf(&sample_graph(), &PdfMetadata::default()).unwrap();
530        // The Str writer emits ASCII inside `(...)` so we can grep
531        // the raw bytes for the visible payload.
532        assert!(
533            bytes.windows(b"(Title)".len()).any(|w| w == b"(Title)"),
534            "Title not found in stream"
535        );
536        assert!(
537            bytes.windows(b"(Body)".len()).any(|w| w == b"(Body)"),
538            "Body not found in stream"
539        );
540    }
541
542    #[test]
543    fn empty_graph_still_produces_valid_pdf() {
544        let (bytes, _) = build_pdf(&PageGraph::default(), &PdfMetadata::default()).unwrap();
545        assert!(bytes.starts_with(b"%PDF-"));
546    }
547
548    #[test]
549    fn metadata_and_provenance_appear_in_info_dictionary() -> TestResult {
550        let metadata = PdfMetadata {
551            title: Some("My Doc".to_owned()),
552            author: Some("A. Person".to_owned()),
553            language: None,
554        };
555        let (bytes, _) = build_pdf(&sample_graph(), &metadata).unwrap();
556        let doc = LopdfDocument::load_mem(&bytes)?;
557        let Object::Reference(info_id) = doc.trailer.get(b"Info")? else {
558            return Err("expected trailer /Info reference".into());
559        };
560        let info = doc.get_dictionary(*info_id)?;
561
562        ensure!(info_string(info, b"Title")? == "My Doc", "wrong /Title");
563        ensure!(
564            info_string(info, b"Author")? == "A. Person",
565            "wrong /Author"
566        );
567        ensure!(
568            info_string(info, b"Producer")? == PRODUCER,
569            "wrong /Producer"
570        );
571        ensure!(info_string(info, b"Creator")? == PRODUCER, "wrong /Creator");
572        Ok(())
573    }
574
575    #[test]
576    fn actual_text_is_emitted_for_replacement_runs() {
577        let graph = PageGraph {
578            pages: vec![Page {
579                number: 1,
580                width_pt: 595.276_f32,
581                height_pt: 841.89_f32,
582                runs: vec![TextRun {
583                    x_pt: 68.0,
584                    baseline_from_top_pt: 100.0,
585                    size_pt: 12.0,
586                    font: Font::Base14(Base14Font::Courier),
587                    text: "    println".to_owned(),
588                    actual_text: Some("\tprintln".to_owned()),
589                    glyphs: Vec::new(),
590                }],
591                images: Vec::new(),
592            }],
593            images: Vec::new(),
594        };
595
596        let (bytes, diags) = build_pdf(&graph, &PdfMetadata::default()).unwrap();
597
598        assert!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
599        assert!(
600            bytes
601                .windows(b"/ActualText".len())
602                .any(|w| w == b"/ActualText"),
603            "missing /ActualText"
604        );
605        assert!(
606            bytes.windows(b"println".len()).any(|w| w == b"println"),
607            "actual text payload missing"
608        );
609    }
610
611    #[test]
612    fn actual_text_wraps_adjacent_fragments_once() {
613        let graph = PageGraph {
614            pages: vec![Page {
615                number: 1,
616                width_pt: 595.276_f32,
617                height_pt: 841.89_f32,
618                runs: vec![
619                    TextRun {
620                        x_pt: 68.0,
621                        baseline_from_top_pt: 100.0,
622                        size_pt: 12.0,
623                        font: Font::Base14(Base14Font::Courier),
624                        text: "    ".to_owned(),
625                        actual_text: Some("\tprintln".to_owned()),
626                        glyphs: Vec::new(),
627                    },
628                    TextRun {
629                        x_pt: 92.0,
630                        baseline_from_top_pt: 100.0,
631                        size_pt: 12.0,
632                        font: Font::Base14(Base14Font::CourierBold),
633                        text: "println".to_owned(),
634                        actual_text: Some("\tprintln".to_owned()),
635                        glyphs: Vec::new(),
636                    },
637                ],
638                images: Vec::new(),
639            }],
640            images: Vec::new(),
641        };
642
643        let (bytes, diags) = build_pdf(&graph, &PdfMetadata::default()).unwrap();
644
645        assert!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
646        assert_eq!(count_bytes(&bytes, b"/ActualText"), 1);
647        assert_eq!(count_bytes(&bytes, b"println"), 1);
648    }
649
650    /// A graph containing Polish + Czech text: exercises the
651    /// `/Differences` and `/ToUnicode` emit paths end to end.
652    fn extended_latin_graph() -> PageGraph {
653        PageGraph {
654            pages: vec![Page {
655                number: 1,
656                width_pt: 595.276_f32,
657                height_pt: 841.89_f32,
658                runs: vec![TextRun {
659                    x_pt: 68.0,
660                    baseline_from_top_pt: 100.0,
661                    size_pt: 12.0,
662                    font: Font::Base14(Base14Font::Helvetica),
663                    text: "Łódź Příliš ě".to_owned(),
664                    actual_text: None,
665                    glyphs: Vec::new(),
666                }],
667                images: Vec::new(),
668            }],
669            images: Vec::new(),
670        }
671    }
672
673    #[test]
674    fn extended_latin_emits_differences_and_to_unicode() {
675        let (bytes, diags) = build_pdf(&extended_latin_graph(), &PdfMetadata::default()).unwrap();
676        assert!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
677        // The /Encoding dict carries /BaseEncoding /WinAnsiEncoding.
678        assert!(
679            bytes
680                .windows(b"/BaseEncoding /WinAnsiEncoding".len())
681                .any(|w| w == b"/BaseEncoding /WinAnsiEncoding"),
682            "missing /BaseEncoding"
683        );
684        // The /Differences array contains the AFM glyph names for the
685        // non-WinAnsi codepoints in the sample: Ł→Lslash, ř→rcaron,
686        // ě→ecaron, ź→zacute. (ó/d/í/l/i/š are WinAnsi natives, so
687        // they don't show up in /Differences.)
688        for name in [b"/Lslash" as &[u8], b"/rcaron", b"/ecaron", b"/zacute"] {
689            assert!(
690                bytes.windows(name.len()).any(|w| w == name),
691                "missing {:?} in /Differences",
692                std::str::from_utf8(name).unwrap_or("?")
693            );
694        }
695        // A /ToUnicode CMap was emitted.
696        assert!(
697            bytes
698                .windows(b"/ToUnicode".len())
699                .any(|w| w == b"/ToUnicode"),
700            "missing /ToUnicode reference"
701        );
702        assert!(
703            bytes
704                .windows(b"beginbfchar".len())
705                .any(|w| w == b"beginbfchar"),
706            "missing beginbfchar in CMap"
707        );
708    }
709
710    #[test]
711    fn pure_ascii_graph_keeps_predefined_winansi_shortcut() {
712        // Existing sample_graph() is pure ASCII; no /Differences
713        // should be emitted, the predefined WinAnsi shortcut path is
714        // exercised. This guards against accidental "always emit a
715        // custom encoding" regressions that would balloon every PDF.
716        let (bytes, _) = build_pdf(&sample_graph(), &PdfMetadata::default()).unwrap();
717        assert!(
718            bytes
719                .windows(b"/Encoding /WinAnsiEncoding".len())
720                .any(|w| w == b"/Encoding /WinAnsiEncoding"),
721            "expected predefined WinAnsi shortcut on ASCII-only doc"
722        );
723        assert!(
724            !bytes
725                .windows(b"/BaseEncoding".len())
726                .any(|w| w == b"/BaseEncoding"),
727            "no custom /Encoding dict expected for ASCII-only doc"
728        );
729    }
730
731    #[test]
732    fn extended_latin_content_stream_uses_remapped_bytes() {
733        // Polish "Ł" lands in the first gap slot (0x7F) by the
734        // allocator's deterministic order. The run also contains
735        // Latin-1 bytes ≥ 0x80 (`ó`, `í`, …) so pdf-writer switches
736        // the string from literal `(...)` to hex `<...>` form;
737        // 0x7F therefore appears in the document as the ASCII pair
738        // `7F`. This is a smoke check that the encoder routed Ł to
739        // a remapped slot rather than substituting `?` (0x3F).
740        //
741        // Both assertions operate on the page content stream slice
742        // only: scanning the whole PDF would let the `/ToUnicode`
743        // CMap (`<7F> <0141>`) satisfy the `7F` needle even if the
744        // content stream had silently substituted to `?`. Surgical
745        // slicing keeps the smoke test honest.
746        let (bytes, _) = build_pdf(&extended_latin_graph(), &PdfMetadata::default()).unwrap();
747        let content_stream = first_content_stream(&bytes).expect("content stream not found");
748        let needle = b"7F";
749        assert!(
750            content_stream.windows(needle.len()).any(|w| w == needle),
751            "content stream should reference remapped slot 0x7F"
752        );
753        let qmark_count = content_stream.iter().filter(|&&b| b == b'?').count();
754        assert!(
755            qmark_count < 5,
756            "too many `?` in PDF ({qmark_count}); did Ł/ř/ě/ź get substituted?"
757        );
758    }
759
760    #[test]
761    fn build_pdf_is_byte_for_byte_deterministic() {
762        // Regression guard for the HashMap-iteration-order bug that
763        // shuffled indirect IDs between builds. Two `build_pdf` calls
764        // on the same graph must produce identical bytes; otherwise
765        // golden tests and reproducible CI artifacts break.
766        let (a, _) = build_pdf(&extended_latin_graph(), &PdfMetadata::default()).unwrap();
767        let (b, _) = build_pdf(&extended_latin_graph(), &PdfMetadata::default()).unwrap();
768        assert_eq!(
769            a,
770            b,
771            "build_pdf is non-deterministic: byte lengths {} vs {}",
772            a.len(),
773            b.len()
774        );
775    }
776
777    /// Locate the first `stream` ... `endstream` body in a PDF byte
778    /// blob and return the bytes between them. `build_pdf` emits the
779    /// page content stream before any `/ToUnicode` `CMap` stream
780    /// (see the object-order comment in [`build_pdf`]), so the first
781    /// match is always the page content. Markers anchor on the
782    /// surrounding `\n` so the substring inside `endstream` doesn't
783    /// false-match the opener.
784    fn first_content_stream(bytes: &[u8]) -> Option<&[u8]> {
785        let open = b"\nstream\n";
786        let close = b"\nendstream";
787        let open_at = bytes.windows(open.len()).position(|w| w == open)?;
788        let body = &bytes[open_at + open.len()..];
789        let close_at = body.windows(close.len()).position(|w| w == close)?;
790        Some(&body[..close_at])
791    }
792
793    fn unique_temp_path(label: &str) -> std::path::PathBuf {
794        std::env::temp_dir().join(format!(
795            "mos-pdf-test-{label}-{}",
796            std::time::SystemTime::now()
797                .duration_since(std::time::UNIX_EPOCH)
798                .map_or(0, |d| d.as_nanos())
799        ))
800    }
801
802    #[test]
803    fn emit_writes_file() -> TestResult {
804        let dir = unique_temp_path("write");
805        let out = dir.join("out.pdf");
806        let diags = emit(&sample_graph(), &PdfMetadata::default(), &out)
807            .map_err(|e| format!("emit: {e:?}"))?;
808        ensure!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
809        let bytes = std::fs::read(&out)?;
810        ensure!(bytes.starts_with(b"%PDF-"), "missing PDF header");
811        std::fs::remove_dir_all(&dir).ok();
812        Ok(())
813    }
814
815    /// Build a graph with one image: a 4×2 red-and-blue checker
816    /// flattened to RGB8, sized at 40×20 pt. Reused across multiple
817    /// emit tests below.
818    fn image_graph() -> PageGraph {
819        use mos_layout::{ImageHandle, ImagePlacement};
820        use std::sync::Arc;
821        // 4 columns × 2 rows; alternating red/blue cells.
822        let mut rgb8 = Vec::with_capacity(4 * 2 * 3);
823        for y in 0..2 {
824            for x in 0..4 {
825                if (x + y) % 2 == 0 {
826                    rgb8.extend_from_slice(&[255, 0, 0]);
827                } else {
828                    rgb8.extend_from_slice(&[0, 0, 255]);
829                }
830            }
831        }
832        let handle = ImageHandle {
833            id: 0,
834            resolved_path: "/tmp/checker.png".to_owned(),
835            pixel_width: 4,
836            pixel_height: 2,
837            rgb8: Arc::from(rgb8),
838        };
839        PageGraph {
840            pages: vec![Page {
841                number: 1,
842                width_pt: 595.276_f32,
843                height_pt: 841.89_f32,
844                runs: Vec::new(),
845                images: vec![ImagePlacement {
846                    handle: handle.clone(),
847                    x_pt: 68.0,
848                    top_from_top_pt: 100.0,
849                    width_pt: 40.0,
850                    height_pt: 20.0,
851                }],
852            }],
853            images: vec![handle],
854        }
855    }
856
857    #[test]
858    fn image_xobject_carries_width_height_and_devicergb() {
859        let (bytes, diags) = build_pdf(&image_graph(), &PdfMetadata::default()).unwrap();
860        assert!(diags.is_empty(), "unexpected diagnostics: {diags:?}");
861        // The Image XObject must declare /Subtype /Image, /Width 4,
862        // /Height 2, /ColorSpace /DeviceRGB, /BitsPerComponent 8, and
863        // /Filter /FlateDecode.
864        for needle in [
865            b"/Subtype /Image" as &[u8],
866            b"/Width 4",
867            b"/Height 2",
868            b"/ColorSpace /DeviceRGB",
869            b"/BitsPerComponent 8",
870            b"/Filter /FlateDecode",
871        ] {
872            assert!(
873                bytes.windows(needle.len()).any(|w| w == needle),
874                "missing {:?} in PDF",
875                std::str::from_utf8(needle).unwrap_or("?")
876            );
877        }
878    }
879
880    #[test]
881    fn image_placement_emits_do_operator_referencing_xobject() {
882        let (bytes, _) = build_pdf(&image_graph(), &PdfMetadata::default()).unwrap();
883        // The page's resource dict must list /Im0; the content stream
884        // must reference /Im0 via the Do operator.
885        assert!(
886            bytes.windows(b"/Im0 ".len()).any(|w| w == b"/Im0 "),
887            "/Im0 resource name not found"
888        );
889        assert!(
890            bytes.windows(b"/Im0 Do".len()).any(|w| w == b"/Im0 Do"),
891            "/Im0 Do operator not found in content stream"
892        );
893    }
894
895    #[test]
896    fn duplicate_image_emits_one_xobject() {
897        // Two placements of the same image should still produce one
898        // shared XObject; the layout pass already dedup'd them, so the
899        // PDF backend never sees two ImageHandle entries.
900        use mos_layout::{ImageHandle, ImagePlacement};
901        use std::sync::Arc;
902        let handle = ImageHandle {
903            id: 0,
904            resolved_path: "/tmp/shared.png".to_owned(),
905            pixel_width: 1,
906            pixel_height: 1,
907            rgb8: Arc::from(vec![10_u8, 20, 30]),
908        };
909        let graph = PageGraph {
910            pages: vec![Page {
911                number: 1,
912                width_pt: 595.276_f32,
913                height_pt: 841.89_f32,
914                runs: Vec::new(),
915                images: vec![
916                    ImagePlacement {
917                        handle: handle.clone(),
918                        x_pt: 10.0,
919                        top_from_top_pt: 50.0,
920                        width_pt: 5.0,
921                        height_pt: 5.0,
922                    },
923                    ImagePlacement {
924                        handle: handle.clone(),
925                        x_pt: 100.0,
926                        top_from_top_pt: 50.0,
927                        width_pt: 5.0,
928                        height_pt: 5.0,
929                    },
930                ],
931            }],
932            images: vec![handle],
933        };
934        let (bytes, _) = build_pdf(&graph, &PdfMetadata::default()).unwrap();
935        let xobject_marker = b"/Subtype /Image";
936        let count = bytes
937            .windows(xobject_marker.len())
938            .filter(|w| *w == xobject_marker)
939            .count();
940        assert_eq!(count, 1, "expected exactly one Image XObject, got {count}");
941        // Both placements show up as /Im0 Do.
942        let do_count = bytes
943            .windows(b"/Im0 Do".len())
944            .filter(|w| *w == b"/Im0 Do")
945            .count();
946        assert_eq!(
947            do_count, 2,
948            "expected two /Im0 Do operators, got {do_count}"
949        );
950    }
951
952    #[test]
953    fn image_only_pdf_remains_byte_deterministic() {
954        let (a, _) = build_pdf(&image_graph(), &PdfMetadata::default()).unwrap();
955        let (b, _) = build_pdf(&image_graph(), &PdfMetadata::default()).unwrap();
956        assert_eq!(a, b, "image emit must be byte-stable across runs");
957    }
958
959    #[test]
960    fn emit_fails_with_mos0014_when_target_is_a_directory() -> TestResult {
961        // Writing a file whose path collides with an existing
962        // directory must surface as an `MOS0014` diagnostic, not a
963        // panic or an `Unimplemented` error.
964        let dir = unique_temp_path("conflict");
965        std::fs::create_dir_all(&dir)?;
966        // `dir` itself is the bogus output target; `fs::write` will
967        // refuse to overwrite a directory.
968        let result = emit(&sample_graph(), &PdfMetadata::default(), &dir);
969        std::fs::remove_dir_all(&dir).ok();
970        let Err(err) = result else {
971            return Err("expected emit to fail when target is a directory".into());
972        };
973        let CoreError::Diagnostic(d) = err else {
974            return Err("expected Diagnostic, got Unimplemented".into());
975        };
976        ensure!(
977            d.def().code() == codes::MOS0014.code(),
978            "wrong code: {:?}",
979            d.def().code()
980        );
981        ensure!(
982            d.message().contains("could not write PDF"),
983            "message={:?}",
984            d.message()
985        );
986        Ok(())
987    }
988}