Skip to main content

mos_csl/
parser.rs

1//! Parse a CSL 1.0.2 style document into the typed [`Style`] AST.
2//!
3//! A read-only [`roxmltree`] DOM walk, dispatching on element local names.
4//! The `<style>` root must be in the CSL namespace or none
5//! (an unnamespaced root is tolerated; a foreign namespace is rejected).
6//!
7//! It models the structure and common attributes; unmodelled attributes are ignored,
8//! unknown rendering elements are a [`CslParseError`], and in-style `<locale>` blocks are retained
9//! as raw XML for a later locale slice.
10
11use std::collections::BTreeMap;
12
13use roxmltree::{Document, Node};
14
15use crate::error::{CslParseError, CslParseErrorKind};
16use crate::style::{
17    Bibliography, BibliographyOptions, Branch, Choose, Citation, CitationOptions, Common,
18    Conditions, DateElement, DatePart, Element, EtAl, Group, Info, InfoCategory, InfoContributor,
19    InfoLink, InheritableNameOptions, Label, Layout, LocaleBlock, Match, NameElement, NamePart,
20    Names, Number, SortKey, SortKeyOptions, SortTarget, Style, StyleClass, StyleOptions, Text,
21    TextSource,
22};
23
/// The CSL XML namespace URI.
///
/// A `<style>` root that declares a namespace must declare exactly this one;
/// a root with no namespace at all is accepted as well.
const CSL_NAMESPACE: &str = "http://purl.org/net/xbiblio/csl";
26
27/// Parse `input` as a CSL 1.0.2 style.
28///
29/// # Errors
30///
31/// Returns a [`CslParseError`] when `input` is not well-formed XML, the root is
32/// not `<style>`, the required `version`/`class` attributes are missing or
33/// invalid or unsupported, a `<macro>` lacks a `name`, a `<citation>`/
34/// `<bibliography>` lacks its `<layout>`, a `<text>` selects no source or too
35/// many sources, a `<choose>` has invalid branch order, or an unsupported
36/// rendering element is encountered.
37///
38/// # Examples
39///
40/// ```
41/// use mos_csl::{parse_style, StyleClass};
42///
43/// let style = parse_style(
44///     r#"<style version="1.0" class="in-text">
45///          <info><title>Demo</title></info>
46///          <citation><layout><text variable="title"/></layout></citation>
47///        </style>"#,
48/// )
49/// .expect("valid CSL");
50/// assert_eq!(style.class, StyleClass::InText);
51/// assert!(style.citation.is_some());
52/// ```
53pub fn parse_style(input: &str) -> Result<Style, CslParseError> {
54    let document = Document::parse(input).map_err(|error| {
55        let offset = text_pos_to_byte_offset(input, error.pos()).unwrap_or(0);
56        CslParseError::new(CslParseErrorKind::MalformedXml(error.to_string()), offset)
57    })?;
58    let root = document.root_element();
59    if root.tag_name().name() != "style" {
60        let name = root.tag_name().name().to_owned();
61        return Err(err_at(root, CslParseErrorKind::UnexpectedRoot(name)));
62    }
63    // A namespaced root must use the CSL namespace; an unnamespaced root is
64    // tolerated (hand-authored styles routinely omit it). A foreign namespace
65    // is rejected: element local names alone would otherwise accept non-CSL XML.
66    if let Some(namespace) = root.tag_name().namespace()
67        && namespace != CSL_NAMESPACE
68    {
69        return Err(err_at(
70            root,
71            CslParseErrorKind::ForeignNamespace(namespace.to_owned()),
72        ));
73    }
74
75    let version = root
76        .attribute("version")
77        .ok_or_else(|| err_at(root, CslParseErrorKind::MissingVersion))?;
78    // Accept the `1.0` schema major and any `1.0.x` point release; the version
79    // attribute of real CSL 1.0.x styles is almost always `1.0`, but tolerate an
80    // explicit patch suffix rather than reject otherwise-valid styles.
81    if version != "1.0" && !version.starts_with("1.0.") {
82        return Err(err_at(
83            root,
84            CslParseErrorKind::UnsupportedVersion(version.to_owned()),
85        ));
86    }
87    let version = version.to_owned();
88    let class = match root.attribute("class") {
89        Some("in-text") => StyleClass::InText,
90        Some("note") => StyleClass::Note,
91        Some(other) => {
92            return Err(err_at(
93                root,
94                CslParseErrorKind::UnknownClass(other.to_owned()),
95            ));
96        }
97        None => return Err(err_at(root, CslParseErrorKind::MissingClass)),
98    };
99    let default_locale = attr(root, "default-locale");
100
101    let mut info = Info::default();
102    let mut citation = None;
103    let mut bibliography = None;
104    let mut macros = BTreeMap::new();
105    let mut locales = Vec::new();
106
107    for child in child_elements(root) {
108        match child.tag_name().name() {
109            "info" => info = parse_info(child),
110            "citation" => citation = Some(parse_citation(child)?),
111            "bibliography" => bibliography = Some(parse_bibliography(child)?),
112            "macro" => {
113                let name = child
114                    .attribute("name")
115                    .ok_or_else(|| err_at(child, CslParseErrorKind::MissingMacroName))?;
116                macros.insert(name.to_owned(), parse_elements(child)?);
117            }
118            "locale" => locales.push(parse_locale(child, input)),
119            other => {
120                return Err(err_at(
121                    child,
122                    CslParseErrorKind::UnsupportedElement(other.to_owned()),
123                ));
124            }
125        }
126    }
127
128    Ok(Style {
129        class,
130        version,
131        default_locale,
132        options: parse_style_options(root),
133        info,
134        citation,
135        bibliography,
136        macros,
137        locales,
138    })
139}
140
141fn parse_info(node: Node<'_, '_>) -> Info {
142    let mut info = Info::default();
143    for child in child_elements(node) {
144        match child.tag_name().name() {
145            "id" => info.id = child.text().map(str::to_owned),
146            "title" => info.title = child.text().map(str::to_owned),
147            "link" => info.links.push(parse_info_link(child)),
148            "category" => info.categories.push(parse_info_category(child)),
149            "author" => info.authors.push(parse_info_contributor(child)),
150            "contributor" => info.contributors.push(parse_info_contributor(child)),
151            "updated" => info.updated = child.text().map(str::to_owned),
152            "issn" => {
153                if let Some(text) = child.text() {
154                    info.issn.push(text.to_owned());
155                }
156            }
157            // Other <info> children are ignored.
158            _ => {}
159        }
160    }
161    info
162}
163
164fn parse_info_link(node: Node<'_, '_>) -> InfoLink {
165    InfoLink {
166        rel: attr(node, "rel"),
167        href: attr(node, "href"),
168        media_type: attr(node, "type"),
169    }
170}
171
172fn parse_info_category(node: Node<'_, '_>) -> InfoCategory {
173    InfoCategory {
174        citation_format: attr(node, "citation-format"),
175        field: attr(node, "field"),
176    }
177}
178
179fn parse_info_contributor(node: Node<'_, '_>) -> InfoContributor {
180    let mut contributor = InfoContributor::default();
181    for child in child_elements(node) {
182        match child.tag_name().name() {
183            "name" => contributor.name = child.text().map(str::to_owned),
184            "uri" => contributor.uri = child.text().map(str::to_owned),
185            "email" => contributor.email = child.text().map(str::to_owned),
186            _ => {}
187        }
188    }
189    contributor
190}
191
192fn parse_locale(node: Node<'_, '_>, input: &str) -> LocaleBlock {
193    let xml = match input.get(node.range()) {
194        Some(text) => text.to_owned(),
195        None => String::new(),
196    };
197    LocaleBlock { xml }
198}
199
200fn parse_citation(node: Node<'_, '_>) -> Result<Citation, CslParseError> {
201    let (layout, sort) = parse_layout_and_sort(node)?;
202    Ok(Citation {
203        layout,
204        sort,
205        options: parse_citation_options(node),
206    })
207}
208
209fn parse_bibliography(node: Node<'_, '_>) -> Result<Bibliography, CslParseError> {
210    let (layout, sort) = parse_layout_and_sort(node)?;
211    Ok(Bibliography {
212        layout,
213        sort,
214        options: parse_bibliography_options(node),
215    })
216}
217
218fn parse_layout_and_sort(node: Node<'_, '_>) -> Result<(Layout, Vec<SortKey>), CslParseError> {
219    let mut layout = None;
220    let mut sort = Vec::new();
221    for child in child_elements(node) {
222        match child.tag_name().name() {
223            "layout" => layout = Some(parse_layout(child)?),
224            "sort" => sort = parse_sort(child),
225            other => {
226                return Err(err_at(
227                    child,
228                    CslParseErrorKind::UnsupportedElement(other.to_owned()),
229                ));
230            }
231        }
232    }
233    let layout = layout.ok_or_else(|| err_at(node, CslParseErrorKind::MissingLayout))?;
234    Ok((layout, sort))
235}
236
237fn parse_layout(node: Node<'_, '_>) -> Result<Layout, CslParseError> {
238    Ok(Layout {
239        elements: parse_elements(node)?,
240        common: parse_common(node),
241    })
242}
243
244fn parse_sort(node: Node<'_, '_>) -> Vec<SortKey> {
245    let mut keys = Vec::new();
246    for child in child_elements(node) {
247        if child.tag_name().name() == "key" {
248            let target = child.attribute("macro").map_or_else(
249                || SortTarget::Variable(attr(child, "variable").unwrap_or_default()),
250                |name| SortTarget::Macro(name.to_owned()),
251            );
252            keys.push(SortKey {
253                target,
254                descending: child.attribute("sort") == Some("descending"),
255                options: parse_sort_key_options(child),
256            });
257        }
258    }
259    keys
260}
261
262fn parse_elements(node: Node<'_, '_>) -> Result<Vec<Element>, CslParseError> {
263    let mut elements = Vec::new();
264    for child in child_elements(node) {
265        elements.push(parse_element(child)?);
266    }
267    Ok(elements)
268}
269
270fn parse_element(node: Node<'_, '_>) -> Result<Element, CslParseError> {
271    let element = match node.tag_name().name() {
272        "text" => Element::Text(parse_text(node)?),
273        "number" => Element::Number(parse_number(node)),
274        "date" => Element::Date(parse_date(node)),
275        "names" => Element::Names(Box::new(parse_names(node)?)),
276        "label" => Element::Label(parse_label(node)),
277        "group" => Element::Group(parse_group(node)?),
278        "choose" => Element::Choose(parse_choose(node)?),
279        other => {
280            return Err(err_at(
281                node,
282                CslParseErrorKind::UnsupportedElement(other.to_owned()),
283            ));
284        }
285    };
286    Ok(element)
287}
288
289fn parse_text(node: Node<'_, '_>) -> Result<Text, CslParseError> {
290    let source_count = [
291        node.attribute("variable"),
292        node.attribute("macro"),
293        node.attribute("term"),
294        node.attribute("value"),
295    ]
296    .into_iter()
297    .flatten()
298    .count();
299    if source_count > 1 {
300        return Err(err_at(node, CslParseErrorKind::TextWithMultipleSources));
301    }
302
303    let source = if let Some(variable) = node.attribute("variable") {
304        TextSource::Variable {
305            name: variable.to_owned(),
306            form: attr(node, "form"),
307        }
308    } else if let Some(name) = node.attribute("macro") {
309        TextSource::Macro(name.to_owned())
310    } else if let Some(term) = node.attribute("term") {
311        TextSource::Term {
312            name: term.to_owned(),
313            form: attr(node, "form"),
314            plural: bool_attr(node, "plural"),
315        }
316    } else if let Some(value) = node.attribute("value") {
317        TextSource::Value(value.to_owned())
318    } else {
319        return Err(err_at(node, CslParseErrorKind::TextWithoutSource));
320    };
321    Ok(Text {
322        source,
323        quotes: bool_attr(node, "quotes"),
324        strip_periods: bool_attr(node, "strip-periods"),
325        common: parse_common(node),
326    })
327}
328
329fn parse_number(node: Node<'_, '_>) -> Number {
330    Number {
331        variable: attr(node, "variable").unwrap_or_default(),
332        form: attr(node, "form"),
333        common: parse_common(node),
334    }
335}
336
337fn parse_date(node: Node<'_, '_>) -> DateElement {
338    let mut parts = Vec::new();
339    for child in child_elements(node) {
340        if child.tag_name().name() == "date-part" {
341            parts.push(DatePart {
342                name: attr(child, "name").unwrap_or_default(),
343                form: attr(child, "form"),
344                range_delimiter: attr(child, "range-delimiter"),
345                strip_periods: attr(child, "strip-periods"),
346                common: parse_common(child),
347            });
348        }
349    }
350    DateElement {
351        variable: attr(node, "variable").unwrap_or_default(),
352        form: attr(node, "form"),
353        date_parts: attr(node, "date-parts"),
354        parts,
355        common: parse_common(node),
356    }
357}
358
359fn parse_names(node: Node<'_, '_>) -> Result<Names, CslParseError> {
360    let variables = attr(node, "variable")
361        .unwrap_or_default()
362        .split_whitespace()
363        .map(str::to_owned)
364        .collect();
365    let mut name = None;
366    let mut et_al = None;
367    let mut label = None;
368    let mut substitute = Vec::new();
369    for child in child_elements(node) {
370        match child.tag_name().name() {
371            "name" => {
372                name = Some(parse_name_element(child));
373            }
374            "et-al" => {
375                et_al = Some(EtAl {
376                    term: attr(child, "term"),
377                    common: parse_common(child),
378                });
379            }
380            "label" => label = Some(parse_label(child)),
381            "substitute" => substitute = parse_elements(child)?,
382            other => {
383                return Err(err_at(
384                    child,
385                    CslParseErrorKind::UnsupportedElement(other.to_owned()),
386                ));
387            }
388        }
389    }
390    Ok(Names {
391        variables,
392        name,
393        et_al,
394        label,
395        substitute,
396        common: parse_common(node),
397    })
398}
399
400fn parse_name_element(node: Node<'_, '_>) -> NameElement {
401    let mut parts = Vec::new();
402    for child in child_elements(node) {
403        if child.tag_name().name() == "name-part" {
404            parts.push(NamePart {
405                name: attr(child, "name"),
406                common: parse_common(child),
407            });
408        }
409    }
410
411    NameElement {
412        form: attr(node, "form"),
413        options: parse_inheritable_name_options(node),
414        parts,
415        common: parse_common(node),
416    }
417}
418
419fn parse_label(node: Node<'_, '_>) -> Label {
420    Label {
421        variable: attr(node, "variable"),
422        form: attr(node, "form"),
423        plural: attr(node, "plural"),
424        strip_periods: attr(node, "strip-periods"),
425        common: parse_common(node),
426    }
427}
428
429fn parse_group(node: Node<'_, '_>) -> Result<Group, CslParseError> {
430    Ok(Group {
431        children: parse_elements(node)?,
432        common: parse_common(node),
433    })
434}
435
436fn parse_choose(node: Node<'_, '_>) -> Result<Choose, CslParseError> {
437    let mut branches = Vec::new();
438    let mut otherwise = Vec::new();
439    let mut seen_if = false;
440    let mut seen_else = false;
441    for child in child_elements(node) {
442        match child.tag_name().name() {
443            "if" => {
444                if seen_if || seen_else {
445                    return Err(err_at(child, CslParseErrorKind::InvalidChooseOrder));
446                }
447                seen_if = true;
448                branches.push(Branch {
449                    conditions: parse_conditions(child),
450                    children: parse_elements(child)?,
451                });
452            }
453            "else-if" => {
454                if !seen_if || seen_else {
455                    return Err(err_at(child, CslParseErrorKind::InvalidChooseOrder));
456                }
457                branches.push(Branch {
458                    conditions: parse_conditions(child),
459                    children: parse_elements(child)?,
460                });
461            }
462            "else" => {
463                if !seen_if || seen_else {
464                    return Err(err_at(child, CslParseErrorKind::InvalidChooseOrder));
465                }
466                seen_else = true;
467                otherwise = parse_elements(child)?;
468            }
469            other => {
470                return Err(err_at(
471                    child,
472                    CslParseErrorKind::UnsupportedElement(other.to_owned()),
473                ));
474            }
475        }
476    }
477    if !seen_if {
478        return Err(err_at(node, CslParseErrorKind::InvalidChooseOrder));
479    }
480    Ok(Choose {
481        branches,
482        otherwise,
483    })
484}
485
486fn parse_conditions(node: Node<'_, '_>) -> Conditions {
487    let match_mode = match node.attribute("match") {
488        Some("any") => Match::Any,
489        Some("none") => Match::None,
490        _ => Match::All,
491    };
492    Conditions {
493        match_mode,
494        kind: tokens(node, "type"),
495        variable: tokens(node, "variable"),
496        is_numeric: tokens(node, "is-numeric"),
497        is_uncertain_date: tokens(node, "is-uncertain-date"),
498        locator: tokens(node, "locator"),
499        position: tokens(node, "position"),
500        disambiguate: bool_attr(node, "disambiguate"),
501    }
502}
503
504fn parse_common(node: Node<'_, '_>) -> Common {
505    Common {
506        prefix: attr(node, "prefix"),
507        suffix: attr(node, "suffix"),
508        delimiter: attr(node, "delimiter"),
509        font_style: attr(node, "font-style"),
510        font_variant: attr(node, "font-variant"),
511        font_weight: attr(node, "font-weight"),
512        text_decoration: attr(node, "text-decoration"),
513        vertical_align: attr(node, "vertical-align"),
514        text_case: attr(node, "text-case"),
515        display: attr(node, "display"),
516    }
517}
518
519fn parse_style_options(node: Node<'_, '_>) -> StyleOptions {
520    StyleOptions {
521        page_range_format: attr(node, "page-range-format"),
522        demote_non_dropping_particle: attr(node, "demote-non-dropping-particle"),
523        initialize_with_hyphen: attr(node, "initialize-with-hyphen"),
524        names: parse_inheritable_name_options(node),
525    }
526}
527
528fn parse_citation_options(node: Node<'_, '_>) -> CitationOptions {
529    CitationOptions {
530        collapse: attr(node, "collapse"),
531        cite_group_delimiter: attr(node, "cite-group-delimiter"),
532        year_suffix_delimiter: attr(node, "year-suffix-delimiter"),
533        after_collapse_delimiter: attr(node, "after-collapse-delimiter"),
534        disambiguate_add_names: attr(node, "disambiguate-add-names"),
535        disambiguate_add_givenname: attr(node, "disambiguate-add-givenname"),
536        disambiguate_add_year_suffix: attr(node, "disambiguate-add-year-suffix"),
537        givenname_disambiguation_rule: attr(node, "givenname-disambiguation-rule"),
538        near_note_distance: attr(node, "near-note-distance"),
539        names: parse_inheritable_name_options(node),
540    }
541}
542
543fn parse_bibliography_options(node: Node<'_, '_>) -> BibliographyOptions {
544    BibliographyOptions {
545        hanging_indent: attr(node, "hanging-indent"),
546        second_field_align: attr(node, "second-field-align"),
547        line_spacing: attr(node, "line-spacing"),
548        entry_spacing: attr(node, "entry-spacing"),
549        subsequent_author_substitute: attr(node, "subsequent-author-substitute"),
550        subsequent_author_substitute_rule: attr(node, "subsequent-author-substitute-rule"),
551        names: parse_inheritable_name_options(node),
552    }
553}
554
555fn parse_sort_key_options(node: Node<'_, '_>) -> SortKeyOptions {
556    SortKeyOptions {
557        names_min: attr(node, "names-min"),
558        names_use_first: attr(node, "names-use-first"),
559        names_use_last: attr(node, "names-use-last"),
560    }
561}
562
563fn parse_inheritable_name_options(node: Node<'_, '_>) -> InheritableNameOptions {
564    InheritableNameOptions {
565        et_al_min: attr(node, "et-al-min"),
566        et_al_use_first: attr(node, "et-al-use-first"),
567        et_al_subsequent_min: attr(node, "et-al-subsequent-min"),
568        et_al_subsequent_use_first: attr(node, "et-al-subsequent-use-first"),
569        et_al_use_last: attr(node, "et-al-use-last"),
570        and: attr(node, "and"),
571        delimiter_precedes_et_al: attr(node, "delimiter-precedes-et-al"),
572        delimiter_precedes_last: attr(node, "delimiter-precedes-last"),
573        initialize: attr(node, "initialize"),
574        initialize_with: attr(node, "initialize-with"),
575        name_as_sort_order: attr(node, "name-as-sort-order"),
576        sort_separator: attr(node, "sort-separator"),
577    }
578}
579
580/// Element-only children of `node` (skips text, comments, and whitespace).
581fn child_elements<'a, 'input>(node: Node<'a, 'input>) -> impl Iterator<Item = Node<'a, 'input>> {
582    node.children().filter(Node::is_element)
583}
584
585/// An attribute as an owned `String`, if present.
586fn attr(node: Node<'_, '_>, name: &str) -> Option<String> {
587    node.attribute(name).map(str::to_owned)
588}
589
590/// A boolean attribute: `true` only when the value is exactly `"true"`.
591fn bool_attr(node: Node<'_, '_>, name: &str) -> bool {
592    node.attribute(name) == Some("true")
593}
594
595/// A whitespace-separated attribute split into owned tokens.
596fn tokens(node: Node<'_, '_>, name: &str) -> Vec<String> {
597    node.attribute(name)
598        .map(|value| value.split_whitespace().map(str::to_owned).collect())
599        .unwrap_or_default()
600}
601
602fn text_pos_to_byte_offset(input: &str, position: roxmltree::TextPos) -> Option<usize> {
603    let row = usize::try_from(position.row).ok()?;
604    let col = usize::try_from(position.col).ok()?;
605    if row == 0 || col == 0 {
606        return None;
607    }
608
609    let (line_start, line) = line_at(input, row)?;
610    let col_offset = column_to_byte_offset(line, col)?;
611    Some(line_start + col_offset)
612}
613
614fn line_at(input: &str, row: usize) -> Option<(usize, &str)> {
615    let mut line_start = 0;
616    for (line_index, line) in input.split_inclusive('\n').enumerate() {
617        if line_index + 1 == row {
618            let line_without_newline = match line.strip_suffix('\n') {
619                Some(stripped) => stripped,
620                None => line,
621            };
622            return Some((line_start, line_without_newline));
623        }
624        line_start += line.len();
625    }
626
627    if row == 1 && input.is_empty() {
628        return Some((0, ""));
629    }
630    None
631}
632
/// Convert a 1-based character column within `line` into a byte offset.
///
/// The column one past the last character maps to `line.len()`. Returns
/// `None` for column 0 and for columns further past the end than that.
fn column_to_byte_offset(line: &str, col: usize) -> Option<usize> {
    let chars_before = col.checked_sub(1)?;
    // Start offset of every character, followed by the end-of-line offset.
    line.char_indices()
        .map(|(byte_offset, _)| byte_offset)
        .chain(std::iter::once(line.len()))
        .nth(chars_before)
}
649
650/// Build an error anchored at a node's start byte offset.
651fn err_at(node: Node<'_, '_>, kind: CslParseErrorKind) -> CslParseError {
652    CslParseError::new(kind, node.range().start)
653}