Skip to main content

pdf_base14_metrics/
agl_subset.rs

1// Hand-curated Unicode → PostScript glyph name table.
2//
// Scope: the 99 glyph names that appear in the Adobe Core 14 Latin
// AFMs (Helvetica/Times/Courier × 4) but have no PDF
// `WinAnsiEncoding` byte. These are the glyphs reachable through a
// custom `/Encoding /Differences` array on a Core 14 font: the Latin
// Extended-A letters the AFMs ship (the Polish/Czech/Hungarian/
// Romanian/Turkish long tail, not the whole Unicode block: e.g.
// `U+0108` has no entry), the `fi`/`fl` ligatures, the fraction
// slash, the math operators (−, ≤, ≥, ≠, √, ∂, ∑, ∆, ◊), and the
// spacing diacritics (˘ ˇ ˙ ˝ ˛ ˚).
11//
12// Encoded as a `&[(char, &'static str)]` sorted by `char` so callers
13// can `binary_search_by_key`. The names are exactly those in the
14// Adobe AFM (e.g. `Tcommaaccent`, not `Tcedilla`): what the PDF
15// reader looks up to find the glyph outline.
16//
// Cedilla vs. comma-below: Helvetica's AFM has `Scedilla`/`scedilla`
// and `Scommaaccent`/`scommaaccent` as distinct glyphs, so the
// historical cedilla codepoints `U+015E`/`U+015F` map to the cedilla
// names and the Romanian comma-below codepoints `U+0218`/`U+0219` map
// to the comma-accent names. For T the AFM has only
// `Tcommaaccent`/`tcommaaccent` (no `Tcedilla`), so `U+0162`/`U+0163`
// and `U+021A`/`U+021B` resolve to the same two glyph names. Where a
// choice exists, the mapping below picks the AGLFN canonical name for
// each codepoint.
25//
26// `WinAnsi` natives (`á`, `ß`, `€`, `“`, ...) are NOT in this table:
27// look them up through `winansi_byte` instead.
28//
// Source: the Helvetica.adobe-font-metrics `CharSet` minus the 216 names in
// `WINANSI_TABLE`, cross-referenced with the Adobe Glyph List for New
// Fonts (AGLFN). License-clean: the 100 entries below (98 distinct
// glyph names; `Tcommaaccent`/`tcommaaccent` are each reachable from
// two codepoints) are derivative of the vendored AFM `CharSet` (Adobe
// APAFML) plus public PDF/Unicode standards, not a reproduction of
// the AGL data file.
34
/// `(char, AFM glyph name)` pairs for the glyphs covered by this module.
///
/// Invariant: entries are in strictly ascending `char` order (so every
/// key is unique), because `agl_glyph_name` resolves lookups with a
/// binary search. Glyph names may repeat: `Tcommaaccent` and
/// `tcommaaccent` are each the target of two codepoints.
pub(crate) const AGL_SUBSET: &[(char, &str)] = &[
    // Latin Extended-A
    ('\u{0100}', "Amacron"),
    ('\u{0101}', "amacron"),
    ('\u{0102}', "Abreve"),
    ('\u{0103}', "abreve"),
    ('\u{0104}', "Aogonek"),
    ('\u{0105}', "aogonek"),
    ('\u{0106}', "Cacute"),
    ('\u{0107}', "cacute"),
    ('\u{010C}', "Ccaron"),
    ('\u{010D}', "ccaron"),
    ('\u{010E}', "Dcaron"),
    ('\u{010F}', "dcaron"),
    ('\u{0110}', "Dcroat"),
    ('\u{0111}', "dcroat"),
    ('\u{0112}', "Emacron"),
    ('\u{0113}', "emacron"),
    ('\u{0116}', "Edotaccent"),
    ('\u{0117}', "edotaccent"),
    ('\u{0118}', "Eogonek"),
    ('\u{0119}', "eogonek"),
    ('\u{011A}', "Ecaron"),
    ('\u{011B}', "ecaron"),
    ('\u{011E}', "Gbreve"),
    ('\u{011F}', "gbreve"),
    ('\u{0122}', "Gcommaaccent"),
    ('\u{0123}', "gcommaaccent"),
    ('\u{012A}', "Imacron"),
    ('\u{012B}', "imacron"),
    ('\u{012E}', "Iogonek"),
    ('\u{012F}', "iogonek"),
    ('\u{0130}', "Idotaccent"),
    ('\u{0131}', "dotlessi"),
    ('\u{0136}', "Kcommaaccent"),
    ('\u{0137}', "kcommaaccent"),
    ('\u{0139}', "Lacute"),
    ('\u{013A}', "lacute"),
    ('\u{013B}', "Lcommaaccent"),
    ('\u{013C}', "lcommaaccent"),
    ('\u{013D}', "Lcaron"),
    ('\u{013E}', "lcaron"),
    ('\u{0141}', "Lslash"),
    ('\u{0142}', "lslash"),
    ('\u{0143}', "Nacute"),
    ('\u{0144}', "nacute"),
    ('\u{0145}', "Ncommaaccent"),
    ('\u{0146}', "ncommaaccent"),
    ('\u{0147}', "Ncaron"),
    ('\u{0148}', "ncaron"),
    ('\u{014C}', "Omacron"),
    ('\u{014D}', "omacron"),
    ('\u{0150}', "Ohungarumlaut"),
    ('\u{0151}', "ohungarumlaut"),
    ('\u{0154}', "Racute"),
    ('\u{0155}', "racute"),
    ('\u{0156}', "Rcommaaccent"),
    ('\u{0157}', "rcommaaccent"),
    ('\u{0158}', "Rcaron"),
    ('\u{0159}', "rcaron"),
    ('\u{015A}', "Sacute"),
    ('\u{015B}', "sacute"),
    ('\u{015E}', "Scedilla"),
    ('\u{015F}', "scedilla"),
    // U+0162 / U+0163: historical T-cedilla codepoints. The AFM only
    // ships `Tcommaaccent` (no `Tcedilla`); modern Romanian uses the
    // comma-below codepoints U+021A/U+021B below, but Unicode data
    // shipped before ~2000 commonly stores ţ as U+0163, so we route
    // both to the same glyph.
    ('\u{0162}', "Tcommaaccent"),
    ('\u{0163}', "tcommaaccent"),
    ('\u{0164}', "Tcaron"),
    ('\u{0165}', "tcaron"),
    ('\u{016A}', "Umacron"),
    ('\u{016B}', "umacron"),
    ('\u{016E}', "Uring"),
    ('\u{016F}', "uring"),
    ('\u{0170}', "Uhungarumlaut"),
    ('\u{0171}', "uhungarumlaut"),
    ('\u{0172}', "Uogonek"),
    ('\u{0173}', "uogonek"),
    ('\u{0179}', "Zacute"),
    ('\u{017A}', "zacute"),
    ('\u{017B}', "Zdotaccent"),
    ('\u{017C}', "zdotaccent"),
    // (`Zcaron`/`zcaron` are NOT here: they live in WinAnsi at 0x8E/0x9E.)
    // Latin Extended-B (Romanian comma-below: modern canonical).
    ('\u{0218}', "Scommaaccent"),
    ('\u{0219}', "scommaaccent"),
    ('\u{021A}', "Tcommaaccent"),
    ('\u{021B}', "tcommaaccent"),
    // Spacing modifier letters (PDF AFMs name these as plain spacing
    // accents: caron, breve, dotaccent, hungarumlaut, ogonek, ring).
    ('\u{02C7}', "caron"),
    ('\u{02D8}', "breve"),
    ('\u{02D9}', "dotaccent"),
    ('\u{02DA}', "ring"),
    ('\u{02DB}', "ogonek"),
    ('\u{02DD}', "hungarumlaut"),
    // Fraction slash (U+2044), math (− ≤ ≥ ≠ √ ∂ ∑ ∆), lozenge (◊),
    // and ligatures. Order matters: this slice is binary-searched.
    ('\u{2044}', "fraction"),
    ('\u{2202}', "partialdiff"),
    ('\u{2206}', "Delta"),
    ('\u{2211}', "summation"),
    ('\u{2212}', "minus"),
    ('\u{221A}', "radical"),
    ('\u{2260}', "notequal"),
    ('\u{2264}', "lessequal"),
    ('\u{2265}', "greaterequal"),
    ('\u{25CA}', "lozenge"),
    ('\u{FB01}', "fi"),
    ('\u{FB02}', "fl"),
];
150
151/// Returns the PostScript glyph name for `ch` if it is one of the
152/// non-`WinAnsi` glyphs known to live in every Core 14 Latin AFM
153/// (Latin Extended-A, common Latin Extended-B, math operators, spacing
154/// diacritics, `fi`/`fl` ligatures). Returns `None` for `WinAnsi`
155/// natives (use `winansi_byte` for those) and for codepoints with no
156/// glyph in any Core 14 font (Cyrillic, CJK, emoji, ...).
157pub(crate) fn agl_glyph_name(ch: char) -> Option<&'static str> {
158    AGL_SUBSET
159        .binary_search_by_key(&ch, |&(c, _)| c)
160        .ok()
161        .map(|i| AGL_SUBSET[i].1)
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    // `agl_glyph_name` binary-searches the table, so keys must be
    // strictly ascending; the strict `<` also rules out duplicate keys.
    #[test]
    fn table_is_sorted_by_char() {
        for w in AGL_SUBSET.windows(2) {
            assert!(
                w[0].0 < w[1].0,
                "AGL_SUBSET out of order: {:?} >= {:?}",
                w[0],
                w[1]
            );
        }
    }

    #[test]
    fn polish_lslash_resolves() {
        assert_eq!(agl_glyph_name('\u{0141}'), Some("Lslash"));
        assert_eq!(agl_glyph_name('\u{0142}'), Some("lslash"));
    }

    #[test]
    fn czech_caron_glyphs_resolve() {
        assert_eq!(agl_glyph_name('\u{011B}'), Some("ecaron"));
        assert_eq!(agl_glyph_name('\u{0159}'), Some("rcaron"));
        assert_eq!(agl_glyph_name('\u{010F}'), Some("dcaron"));
    }

    #[test]
    fn romanian_comma_below_resolves_to_commaaccent() {
        assert_eq!(agl_glyph_name('\u{0219}'), Some("scommaaccent"));
        assert_eq!(agl_glyph_name('\u{021B}'), Some("tcommaaccent"));
    }

    #[test]
    fn legacy_t_cedilla_routes_to_commaaccent() {
        // The table has no `Tcedilla` name: the historical cedilla
        // codepoints share the comma-accent glyphs with U+021A/U+021B.
        assert_eq!(agl_glyph_name('\u{0162}'), Some("Tcommaaccent"));
        assert_eq!(agl_glyph_name('\u{0163}'), Some("tcommaaccent"));
        assert_eq!(agl_glyph_name('\u{021A}'), Some("Tcommaaccent"));
    }

    #[test]
    fn s_cedilla_and_s_comma_below_stay_distinct() {
        // Unlike T, S has separate cedilla and comma-accent glyph names.
        assert_eq!(agl_glyph_name('\u{015E}'), Some("Scedilla"));
        assert_eq!(agl_glyph_name('\u{015F}'), Some("scedilla"));
        assert_eq!(agl_glyph_name('\u{0218}'), Some("Scommaaccent"));
    }

    #[test]
    fn ligatures_resolve() {
        assert_eq!(agl_glyph_name('\u{FB01}'), Some("fi"));
        assert_eq!(agl_glyph_name('\u{FB02}'), Some("fl"));
    }

    #[test]
    fn winansi_native_returns_none() {
        // 'A' (U+0041) is in WinAnsi at byte 0x41; not our table.
        assert_eq!(agl_glyph_name('A'), None);
        // 'é' (U+00E9) is in WinAnsi at 0xE9.
        assert_eq!(agl_glyph_name('é'), None);
        // 'ž' (U+017E) IS in WinAnsi at 0x9E.
        assert_eq!(agl_glyph_name('ž'), None);
    }

    #[test]
    fn cjk_and_cyrillic_return_none() {
        assert_eq!(agl_glyph_name('П'), None);
        assert_eq!(agl_glyph_name('日'), None);
    }
}