Skip to main content

mos_pdf/
embedded.rs

1//! Type 0 CID-keyed font emission for bundled embedded faces.
2//!
3//! Each embedded face used in a document is emitted as five indirect
4//! objects:
5//!
6//! 1. `Font` dict (`Type 0`, `Identity-H`, descendant + `ToUnicode` refs)
7//! 2. `CIDFont` dict (`CIDFontType2`, `/CIDSystemInfo`, `/W`,
8//!    `/CIDToGIDMap /Identity`, descriptor ref)
9//! 3. `FontDescriptor` dict (bbox, ascent, descent, italic angle,
10//!    stem widths, `/FontFile2` ref)
11//! 4. `/FontFile2` stream (the subsetted TTF, with `/Length1` =
12//!    uncompressed size)
//! 5. `/ToUnicode` `CMap` (maps each subset CID back to the Unicode
//!    codepoints of the source cluster it was shaped from)
15//!
16//! The subset is built per-document over the union of glyph IDs seen
17//! across every run that uses the face. With `/CIDToGIDMap /Identity`
18//! the CID in the content stream equals the GID inside the subset
19//! font file. `subsetter::GlyphRemapper` provides the original-GID →
20//! subset-GID mapping.
21
22use std::collections::{BTreeMap, HashMap, HashSet};
23
24use mos_core::{CoreError, Diagnostic, Result, codes};
25use mos_fonts::{EmbeddedFontId, ShapedGlyph};
26use mos_layout::TextRun;
27use pdf_writer::types::{CidFontType, FontFlags, SystemInfo, UnicodeCmap};
28use pdf_writer::writers::FontDescriptor;
29use pdf_writer::{Finish, Name, Pdf, Rect, Ref, Str};
30use subsetter::GlyphRemapper;
31
32/// PDF objects emitted for one embedded face. The 5 refs are allocated
33/// up front so the cross-references resolve.
34#[derive(Debug, Clone, Copy)]
35pub(crate) struct EmbeddedRefs {
36    pub font: Ref,
37    pub cid_font: Ref,
38    pub descriptor: Ref,
39    pub font_file: Ref,
40    pub to_unicode: Ref,
41}
42
43/// Per-face plan: which glyph IDs were used, the subset bytes, the
44/// `GlyphRemapper` that maps original → subset GIDs, and the
45/// gid-to-source-text map used to build `/ToUnicode`.
46pub(crate) struct EmbeddedFontPlan {
47    pub id: EmbeddedFontId,
48    pub subset_bytes: Vec<u8>,
49    pub remapper: GlyphRemapper,
50    /// Original GIDs used in content streams, including GID 0
51    /// (`.notdef`) when unsupported codepoints were shaped. This is
52    /// wider than `gid_to_text`: `.notdef` needs a PDF width but no
53    /// `/ToUnicode` mapping.
54    pub used_gids: Vec<u16>,
55    /// Original GID → source text for that glyph's cluster. For
56    /// ligatures (1 glyph, N codepoints) the value is the multi-char
57    /// cluster string. For 1:1 mappings (typical LTR) it's a
58    /// single-char string. For one-codepoint-many-glyphs
59    /// decompositions (rare), the first glyph carries the codepoint
60    /// and later glyphs in the same cluster carry empty strings.
61    /// Co-populated with `used_gids` by `accumulate_glyphs`; keep those
62    /// sources of truth in sync.
63    pub gid_to_text: BTreeMap<u16, String>,
64}
65
/// One text-positioning or text-showing step produced by
/// `encode_glyph_run`, in the order it must be written.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum ContentOp {
    /// Replace the text matrix with these six components.
    /// `encode_glyph_run` only ever emits pure translations
    /// (`[1 0 0 1 x y]`, in points).
    SetTextMatrix([f32; 6]),
    /// Show these subset CIDs; `cids_to_bytes` gives their big-endian
    /// two-byte encoding.
    ShowCids(Vec<u16>),
    /// Position adjustment between shown glyphs, in thousandths of an
    /// em: the face's nominal advance minus the shaped advance.
    AdjustText(f32),
}
72
73/// Plan every embedded face touched by `runs`. Returns one
74/// [`EmbeddedFontPlan`] per face actually referenced, in stable
75/// (`EmbeddedFontId`-sorted) order.
76///
77/// # Errors
78///
79/// Returns an error if subsetting fails for a face: only possible
80/// with corrupted font data, which the bundled cuts are not.
81pub(crate) fn plan_embedded(runs: &[TextRun]) -> Result<Vec<EmbeddedFontPlan>> {
82    // gid set + gid → cluster text, per face.
83    let mut per_face: HashMap<EmbeddedFontId, (Vec<u16>, BTreeMap<u16, String>)> = HashMap::new();
84    for run in runs {
85        let Some(id) = run.font.embedded() else {
86            continue;
87        };
88        let entry = per_face.entry(id).or_default();
89        accumulate_glyphs(&mut entry.0, &mut entry.1, &run.text, &run.glyphs);
90    }
91    let mut plans: Vec<EmbeddedFontPlan> = Vec::with_capacity(per_face.len());
92    // Iterate ALL ids in fixed order so plan output is deterministic.
93    for id in EmbeddedFontId::ALL {
94        let Some((gids, gid_to_text)) = per_face.remove(&id) else {
95            continue;
96        };
97        // Drop duplicates while preserving first-occurrence order:
98        // GlyphRemapper assigns subset GIDs by first sighting, so this
99        // dedup keeps the remapper assignment (and PDF bytes) stable.
100        let mut seen: HashSet<u16> = HashSet::with_capacity(gids.len());
101        let gids: Vec<u16> = gids.into_iter().filter(|g| seen.insert(*g)).collect();
102        let font = id.data();
103        let subset_bytes = mos_fonts::subset(font, &gids).map_err(|err| {
104            CoreError::Diagnostic(Box::new(Diagnostic::simple(
105                &codes::MOS0026,
106                None,
107                format!("font subsetting failed for {id:?}: {err}"),
108            )))
109        })?;
110        let mut all = Vec::with_capacity(gids.len() + 1);
111        all.push(0_u16);
112        all.extend_from_slice(&gids);
113        let remapper = GlyphRemapper::new_from_glyphs(&all);
114        plans.push(EmbeddedFontPlan {
115            id,
116            subset_bytes,
117            remapper,
118            used_gids: gids,
119            gid_to_text,
120        });
121    }
122    Ok(plans)
123}
124
125fn accumulate_glyphs(
126    gids: &mut Vec<u16>,
127    gid_to_text: &mut BTreeMap<u16, String>,
128    source: &str,
129    glyphs: &[ShapedGlyph],
130) {
131    // Walk glyphs, grouping by cluster, so multi-codepoint clusters
132    // (ligatures) map their full text to the first glyph and an
133    // empty string to subsequent glyphs in the same cluster.
134    let mut i = 0;
135    while i < glyphs.len() {
136        let cluster = glyphs[i].cluster as usize;
137        let mut j = i + 1;
138        while j < glyphs.len() && glyphs[j].cluster as usize == cluster {
139            j += 1;
140        }
141        let next_cluster = if j < glyphs.len() {
142            glyphs[j].cluster as usize
143        } else {
144            source.len()
145        };
146        let cluster_str = source.get(cluster..next_cluster).unwrap_or("");
147        for (k, g) in glyphs[i..j].iter().enumerate() {
148            gids.push(g.gid);
149            // GID 0 is `.notdef`: rustybuzz emits it for codepoints
150            // the face doesn't cover. Recording a Unicode mapping for
151            // it would round-trip every unsupported character back to
152            // whichever source text happened to land on GID 0 first
153            // (e.g. `日本` shaped against a Latin-only face would
154            // round-trip `.notdef` glyphs to `日`). Leaving it out of
155            // `gid_to_text` keeps the CMap silent on `.notdef`, which
156            // is the right behaviour: PDF readers treat a missing
157            // bfchar entry as "no Unicode equivalent".
158            if g.gid == 0 {
159                continue;
160            }
161            gid_to_text.entry(g.gid).or_insert_with(|| {
162                if k == 0 {
163                    cluster_str.to_owned()
164                } else {
165                    String::new()
166                }
167            });
168        }
169        i = j;
170    }
171}
172
173/// Emit the 5 PDF objects for one embedded face. Caller allocates the
174/// refs and ensures they're cross-referenced from each page's `/Font`
175/// resource dict.
176pub(crate) fn emit_embedded(pdf: &mut Pdf, plan: &EmbeddedFontPlan, refs: EmbeddedRefs) {
177    let font = plan.id.data();
178    let subset_tag = subset_tag(&plan.subset_bytes);
179    let base_font = format!("{subset_tag}+{}", font.postscript_name);
180    let base_font_bytes = base_font.as_bytes();
181
182    // 1. Type 0 font dict.
183    let mut type0 = pdf.type0_font(refs.font);
184    type0.base_font(Name(base_font_bytes));
185    type0.encoding_predefined(Name(b"Identity-H"));
186    type0.descendant_font(refs.cid_font);
187    type0.to_unicode(refs.to_unicode);
188    type0.finish();
189
190    // 2. CIDFont dict.
191    let mut cid = pdf.cid_font(refs.cid_font);
192    cid.subtype(CidFontType::Type2);
193    cid.base_font(Name(base_font_bytes));
194    cid.system_info(SystemInfo {
195        registry: Str(b"Adobe"),
196        ordering: Str(b"Identity"),
197        supplement: 0,
198    });
199    cid.font_descriptor(refs.descriptor);
200    cid.default_width(0.0);
201    cid.cid_to_gid_map_predefined(Name(b"Identity"));
202    {
203        let mut widths = cid.widths();
204        // /W array: for each used subset GID, emit its advance in
205        // 1/1000 em. `pdf-writer`'s `Widths::consecutive(first, ws)`
206        // emits `first [w1 w2 ...]` runs. We group consecutive
207        // subset GIDs.
208        let upem = f32::from(font.units_per_em);
209        let mut entries: Vec<(u16, f32)> = plan
210            .used_gids
211            .iter()
212            .filter_map(|&orig_gid| {
213                let subset_gid = plan.remapper.get(orig_gid)?;
214                let advance_units = font.advance_units(orig_gid);
215                let advance_1000 = f32::from(advance_units) * 1000.0 / upem;
216                Some((subset_gid, advance_1000))
217            })
218            .collect();
219        entries.sort_by_key(|e| e.0);
220        entries.dedup_by_key(|e| e.0);
221        let mut i = 0;
222        while i < entries.len() {
223            let start = entries[i].0;
224            let mut j = i + 1;
225            while j < entries.len() && entries[j].0 == entries[j - 1].0 + 1 {
226                j += 1;
227            }
228            widths.consecutive(start, entries[i..j].iter().map(|e| e.1));
229            i = j;
230        }
231    }
232    cid.finish();
233
234    // 3. FontDescriptor.
235    let mut desc = pdf.font_descriptor(refs.descriptor);
236    write_font_descriptor(&mut desc, font, base_font_bytes, refs.font_file);
237    desc.finish();
238
239    // 4. /FontFile2 stream; the subsetted TTF.
240    {
241        let mut stream = pdf.stream(refs.font_file, &plan.subset_bytes);
242        let length1 = i32::try_from(plan.subset_bytes.len()).unwrap_or(i32::MAX);
243        stream.pair(Name(b"Length1"), length1);
244    }
245
246    // 5. ToUnicode CMap.
247    let system_info = SystemInfo {
248        registry: Str(b"Adobe"),
249        ordering: Str(b"UCS"),
250        supplement: 0,
251    };
252    let mut cmap: UnicodeCmap<u16> = UnicodeCmap::new(Name(b"Adobe-Identity-UCS"), system_info);
253    // Iterate gid_to_text in subset-GID order so the CMap entries are
254    // byte-stable across runs.
255    let mut by_subset: Vec<(u16, &str)> = plan
256        .gid_to_text
257        .iter()
258        .filter_map(|(orig, text)| plan.remapper.get(*orig).map(|sub| (sub, text.as_str())))
259        .collect();
260    by_subset.sort_by_key(|e| e.0);
261    for (subset_gid, text) in by_subset {
262        // Skip empty mappings: pdf-writer's `pair_with_multiple` with
263        // zero codepoints would emit `<XX> <>`, which some readers
264        // reject. Trailing glyphs of decompositions inherit through
265        // the missing-mapping convention.
266        if text.is_empty() {
267            continue;
268        }
269        cmap.pair_with_multiple(subset_gid, text.chars());
270    }
271    let cmap_bytes = cmap.finish();
272    let mut cmap_writer = pdf.cmap(refs.to_unicode, &cmap_bytes);
273    cmap_writer.name(Name(b"Adobe-Identity-UCS"));
274    cmap_writer.system_info(system_info);
275}
276
277fn write_font_descriptor(
278    desc: &mut FontDescriptor<'_>,
279    font: &mos_fonts::EmbeddedFont,
280    base_font: &[u8],
281    font_file: Ref,
282) {
283    let scale = 1000.0 / f32::from(font.units_per_em);
284    desc.name(Name(base_font));
285    desc.flags(FontFlags::from_bits_truncate(font.flags));
286    desc.bbox(Rect::new(
287        f32::from(font.bbox.0) * scale,
288        f32::from(font.bbox.1) * scale,
289        f32::from(font.bbox.2) * scale,
290        f32::from(font.bbox.3) * scale,
291    ));
292    desc.italic_angle(font.italic_angle);
293    desc.ascent(f32::from(font.ascender) * scale);
294    desc.descent(f32::from(font.descender) * scale);
295    desc.cap_height(f32::from(font.cap_height) * scale);
296    desc.stem_v(f32::from(font.stem_v));
297    desc.font_file2(font_file);
298}
299
/// Six-letter uppercase subset tag derived deterministically from the
/// subset bytes. Required by PDF 1.7 §9.6.4 for embedded subsets:
/// the `/BaseFont` and `FontDescriptor` `/FontName` must start with
/// `<6 uppercase letters>+`.
///
/// The bytes are hashed with 64-bit FNV-1a; the tag is the six
/// least-significant base-26 digits of the hash, least significant
/// first, mapped to `A`..=`Z`.
fn subset_tag(subset_bytes: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x100_0000_01b3;
    const TAG_LEN: usize = 6;

    let mut hash = subset_bytes.iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    let mut tag = String::with_capacity(TAG_LEN);
    for _ in 0..TAG_LEN {
        // `hash % 26` is always < 26, so the narrowing cast is lossless.
        let digit = (hash % 26) as u8;
        tag.push(char::from(b'A' + digit));
        hash /= 26;
    }
    tag
}
318
319/// Encode shaped glyphs into PDF text-content operations. Each
320/// `ShapedGlyph::gid` is remapped through `plan.remapper` to its
321/// subset CID. GPOS advances become `TJ` adjustments; glyphs with
322/// GPOS offsets get their own absolute `Tm` so marks draw at the shaped
323/// position and the next glyph resumes from the un-offset pen.
324pub(crate) fn encode_glyph_run(
325    plan: &EmbeddedFontPlan,
326    glyphs: &[ShapedGlyph],
327    size_pt: f32,
328    origin_x_pt: f32,
329    origin_y_pt: f32,
330) -> Vec<ContentOp> {
331    let font = plan.id.data();
332    let upem = f32::from(font.units_per_em);
333    let mut ops = Vec::new();
334    let mut pending = Vec::new();
335    let mut pen_units = 0_i32;
336    let mut normal_group_open = false;
337
338    for (index, glyph) in glyphs.iter().enumerate() {
339        let cid = if let Some(cid) = plan.remapper.get(glyph.gid) {
340            cid
341        } else {
342            debug_assert!(false, "GID {} missing from subset remapper", glyph.gid);
343            0
344        };
345        let has_offset = glyph.x_offset_units != 0 || glyph.y_offset_units != 0;
346        if has_offset {
347            flush_cids(&mut pending, &mut ops);
348            let offset_pen = [pen_units + glyph.x_offset_units, glyph.y_offset_units];
349            ops.push(ContentOp::SetTextMatrix([
350                1.0,
351                0.0,
352                0.0,
353                1.0,
354                origin_x_pt + units_to_pt(offset_pen[0], size_pt, upem),
355                origin_y_pt + units_to_pt(offset_pen[1], size_pt, upem),
356            ]));
357            pending.push(cid);
358            flush_cids(&mut pending, &mut ops);
359            normal_group_open = false;
360            pen_units += glyph.advance_units;
361            continue;
362        }
363
364        if !normal_group_open {
365            ops.push(ContentOp::SetTextMatrix([
366                1.0,
367                0.0,
368                0.0,
369                1.0,
370                origin_x_pt + units_to_pt(pen_units, size_pt, upem),
371                origin_y_pt,
372            ]));
373            normal_group_open = true;
374        }
375
376        pending.push(cid);
377
378        let nominal_units = i32::from(font.advance_units(glyph.gid));
379        let displacement_units = nominal_units - glyph.advance_units;
380        pen_units += glyph.advance_units;
381
382        if index + 1 < glyphs.len() && displacement_units != 0 {
383            flush_cids(&mut pending, &mut ops);
384            ops.push(ContentOp::AdjustText(units_to_text_adjust(
385                displacement_units,
386                upem,
387            )));
388        }
389    }
390    flush_cids(&mut pending, &mut ops);
391    ops
392}
393
/// Serialise CIDs as the byte string shown under `Identity-H`: two
/// bytes per CID, most significant byte first.
pub(crate) fn cids_to_bytes(cids: &[u16]) -> Vec<u8> {
    let mut out = Vec::with_capacity(cids.len() * 2);
    out.extend(cids.iter().flat_map(|cid| cid.to_be_bytes()));
    out
}
401
402fn flush_cids(pending: &mut Vec<u16>, ops: &mut Vec<ContentOp>) {
403    if pending.is_empty() {
404        return;
405    }
406    ops.push(ContentOp::ShowCids(std::mem::take(pending)));
407}
408
409fn units_to_pt(units: i32, size_pt: f32, upem: f32) -> f32 {
410    units_to_f32(units) * size_pt / upem
411}
412
413fn units_to_text_adjust(units: i32, upem: f32) -> f32 {
414    units_to_f32(units) * 1000.0 / upem
415}
416
/// Widen a font-unit quantity to `f32` for coordinate arithmetic.
///
/// Magnitudes above 2^24 round to the nearest representable `f32`;
/// that loss is accepted in preference to clamping the value first.
#[allow(
    clippy::cast_precision_loss,
    reason = "PDF coordinates are f32; i32 font-unit positions must not be clamped before scaling"
)]
fn units_to_f32(units: i32) -> f32 {
    units as f32
}